summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormingmingl <mingmingl@google.com>2024-02-13 11:11:15 -0800
committermingmingl <mingmingl@google.com>2024-02-13 11:11:15 -0800
commit5de40c7ae250cf47a36f05d32e014a2bd9adfeef (patch)
tree9afc3edb9616399c97721d96c6cab702575a5f35
parent3f2da71253c4ea2ca75c4aaf9d659b11aab12c44 (diff)
parentfc0e9c8315564288f9079a633892abadace534cf (diff)
-rwxr-xr-x.ci/generate-buildkite-pipeline-premerge5
-rw-r--r--.git-blame-ignore-revs3
-rw-r--r--.github/CODEOWNERS2
-rw-r--r--.github/workflows/approved-prs.yml39
-rw-r--r--.github/workflows/issue-release-workflow.yml1
-rw-r--r--.github/workflows/llvm-project-workflow-tests.yml32
-rw-r--r--bolt/include/bolt/Core/BinaryContext.h51
-rw-r--r--bolt/include/bolt/Core/BinaryFunction.h21
-rw-r--r--bolt/include/bolt/Core/BinarySection.h4
-rw-r--r--bolt/include/bolt/Core/DIEBuilder.h17
-rw-r--r--bolt/include/bolt/Core/DynoStats.h15
-rw-r--r--bolt/include/bolt/Core/Exceptions.h4
-rw-r--r--bolt/include/bolt/Passes/ADRRelaxationPass.h2
-rw-r--r--bolt/include/bolt/Passes/Aligner.h2
-rw-r--r--bolt/include/bolt/Passes/AllocCombiner.h2
-rw-r--r--bolt/include/bolt/Passes/AsmDump.h2
-rw-r--r--bolt/include/bolt/Passes/BinaryPasses.h58
-rw-r--r--bolt/include/bolt/Passes/CMOVConversion.h4
-rw-r--r--bolt/include/bolt/Passes/CacheMetrics.h6
-rw-r--r--bolt/include/bolt/Passes/FixRISCVCallsPass.h2
-rw-r--r--bolt/include/bolt/Passes/FixRelaxationPass.h2
-rw-r--r--bolt/include/bolt/Passes/FrameOptimizer.h6
-rw-r--r--bolt/include/bolt/Passes/Hugify.h2
-rw-r--r--bolt/include/bolt/Passes/IdenticalCodeFolding.h2
-rw-r--r--bolt/include/bolt/Passes/IndirectCallPromotion.h2
-rw-r--r--bolt/include/bolt/Passes/Inliner.h2
-rw-r--r--bolt/include/bolt/Passes/Instrumentation.h2
-rw-r--r--bolt/include/bolt/Passes/JTFootprintReduction.h2
-rw-r--r--bolt/include/bolt/Passes/LongJmp.h6
-rw-r--r--bolt/include/bolt/Passes/LoopInversionPass.h2
-rw-r--r--bolt/include/bolt/Passes/PLTCall.h2
-rw-r--r--bolt/include/bolt/Passes/PatchEntries.h2
-rw-r--r--bolt/include/bolt/Passes/RegReAssign.h2
-rw-r--r--bolt/include/bolt/Passes/ReorderData.h5
-rw-r--r--bolt/include/bolt/Passes/ReorderFunctions.h8
-rw-r--r--bolt/include/bolt/Passes/RetpolineInsertion.h2
-rw-r--r--bolt/include/bolt/Passes/ShrinkWrapping.h26
-rw-r--r--bolt/include/bolt/Passes/SplitFunctions.h2
-rw-r--r--bolt/include/bolt/Passes/StokeInfo.h2
-rw-r--r--bolt/include/bolt/Passes/TailDuplication.h2
-rw-r--r--bolt/include/bolt/Passes/ThreeWayBranch.h2
-rw-r--r--bolt/include/bolt/Passes/ValidateInternalCalls.h2
-rw-r--r--bolt/include/bolt/Passes/ValidateMemRefs.h2
-rw-r--r--bolt/include/bolt/Passes/VeneerElimination.h2
-rw-r--r--bolt/include/bolt/Profile/BoltAddressTranslation.h2
-rw-r--r--bolt/include/bolt/Rewrite/BinaryPassManager.h4
-rw-r--r--bolt/include/bolt/Rewrite/RewriteInstance.h7
-rw-r--r--bolt/lib/Core/BinaryBasicBlock.cpp36
-rw-r--r--bolt/lib/Core/BinaryContext.cpp220
-rw-r--r--bolt/lib/Core/BinaryEmitter.cpp7
-rw-r--r--bolt/lib/Core/BinaryFunction.cpp236
-rw-r--r--bolt/lib/Core/BinarySection.cpp2
-rw-r--r--bolt/lib/Core/DIEBuilder.cpp68
-rw-r--r--bolt/lib/Core/DebugData.cpp2
-rw-r--r--bolt/lib/Core/DynoStats.cpp2
-rw-r--r--bolt/lib/Core/Exceptions.cpp104
-rw-r--r--bolt/lib/Core/ParallelUtilities.cpp5
-rw-r--r--bolt/lib/Passes/ADRRelaxationPass.cpp14
-rw-r--r--bolt/lib/Passes/Aligner.cpp5
-rw-r--r--bolt/lib/Passes/AllocCombiner.cpp11
-rw-r--r--bolt/lib/Passes/AsmDump.cpp19
-rw-r--r--bolt/lib/Passes/BinaryFunctionCallGraph.cpp14
-rw-r--r--bolt/lib/Passes/BinaryPasses.cpp384
-rw-r--r--bolt/lib/Passes/CMOVConversion.cpp27
-rw-r--r--bolt/lib/Passes/CacheMetrics.cpp39
-rw-r--r--bolt/lib/Passes/FixRISCVCallsPass.cpp6
-rw-r--r--bolt/lib/Passes/FixRelaxationPass.cpp5
-rw-r--r--bolt/lib/Passes/FrameAnalysis.cpp15
-rw-r--r--bolt/lib/Passes/FrameOptimizer.cpp58
-rw-r--r--bolt/lib/Passes/Hugify.cpp5
-rw-r--r--bolt/lib/Passes/IdenticalCodeFolding.cpp20
-rw-r--r--bolt/lib/Passes/IndirectCallPromotion.cpp201
-rw-r--r--bolt/lib/Passes/Inliner.cpp13
-rw-r--r--bolt/lib/Passes/Instrumentation.cpp79
-rw-r--r--bolt/lib/Passes/JTFootprintReduction.cpp33
-rw-r--r--bolt/lib/Passes/LongJmp.cpp55
-rw-r--r--bolt/lib/Passes/LoopInversionPass.cpp9
-rw-r--r--bolt/lib/Passes/PLTCall.cpp9
-rw-r--r--bolt/lib/Passes/PatchEntries.cpp25
-rw-r--r--bolt/lib/Passes/RegAnalysis.cpp10
-rw-r--r--bolt/lib/Passes/RegReAssign.cpp24
-rw-r--r--bolt/lib/Passes/ReorderData.cpp65
-rw-r--r--bolt/lib/Passes/ReorderFunctions.cpp143
-rw-r--r--bolt/lib/Passes/RetpolineInsertion.cpp13
-rw-r--r--bolt/lib/Passes/ShrinkWrapping.cpp105
-rw-r--r--bolt/lib/Passes/SplitFunctions.cpp22
-rw-r--r--bolt/lib/Passes/StokeInfo.cpp16
-rw-r--r--bolt/lib/Passes/TailDuplication.cpp31
-rw-r--r--bolt/lib/Passes/ThreeWayBranch.cpp7
-rw-r--r--bolt/lib/Passes/ValidateInternalCalls.cpp14
-rw-r--r--bolt/lib/Passes/ValidateMemRefs.cpp13
-rw-r--r--bolt/lib/Passes/VeneerElimination.cpp13
-rw-r--r--bolt/lib/Profile/BoltAddressTranslation.cpp6
-rw-r--r--bolt/lib/Rewrite/BinaryPassManager.cpp30
-rw-r--r--bolt/lib/Rewrite/BoltDiff.cpp8
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp7
-rw-r--r--bolt/lib/Rewrite/MachORewriteInstance.cpp12
-rw-r--r--bolt/lib/Rewrite/RewriteInstance.cpp573
-rw-r--r--bolt/lib/Target/X86/X86MCSymbolizer.cpp18
-rw-r--r--bolt/lib/Target/X86/X86MCSymbolizer.h4
-rw-r--r--bolt/test/X86/fatal-error.s39
-rw-r--r--bolt/test/X86/log.test19
-rw-r--r--bolt/tools/bat-dump/bat-dump.cpp2
-rw-r--r--bolt/tools/driver/llvm-bolt.cpp26
-rw-r--r--bolt/unittests/Core/BinaryContext.cpp3
-rw-r--r--bolt/unittests/Core/MCPlusBuilder.cpp3
-rw-r--r--clang-tools-extra/clang-move/Move.cpp3
-rw-r--r--clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp6
-rw-r--r--clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h2
-rw-r--r--clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp5
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp7
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp12
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp1
-rw-r--r--clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp7
-rw-r--r--clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp9
-rw-r--r--clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp2
-rw-r--r--clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp9
-rw-r--r--clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp3
-rw-r--r--clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp4
-rw-r--r--clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h3
-rw-r--r--clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp7
-rw-r--r--clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp3
-rw-r--r--clang-tools-extra/clangd/Headers.cpp3
-rw-r--r--clang-tools-extra/clangd/ParsedAST.cpp2
-rw-r--r--clang-tools-extra/clangd/index/IndexAction.cpp3
-rw-r--r--clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp2
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst16
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst4
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst1
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst2
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst2
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst6
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst2
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst2
-rw-r--r--clang-tools-extra/include-cleaner/lib/Record.cpp6
-rw-r--r--clang-tools-extra/modularize/CoverageChecker.cpp3
-rw-r--r--clang-tools-extra/modularize/PreprocessorTracker.cpp20
-rw-r--r--clang-tools-extra/pp-trace/PPCallbacksTracker.cpp6
-rw-r--r--clang-tools-extra/pp-trace/PPCallbacksTracker.h3
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp12
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp1
-rw-r--r--clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp3
-rw-r--r--clang-tools-extra/test/pp-trace/pp-trace-include.cpp12
-rw-r--r--clang/docs/ClangFormatStyleOptions.rst340
-rw-r--r--clang/docs/ClangLinkerWrapper.rst8
-rw-r--r--clang/docs/LanguageExtensions.rst33
-rw-r--r--clang/docs/ReleaseNotes.rst32
-rw-r--r--clang/docs/ShadowCallStack.rst42
-rw-r--r--clang/docs/analyzer/checkers.rst22
-rwxr-xr-xclang/docs/tools/dump_format_style.py7
-rw-r--r--clang/include/clang-c/Index.h6
-rw-r--r--clang/include/clang/APINotes/Types.h22
-rw-r--r--clang/include/clang/AST/CommentCommandTraits.h18
-rw-r--r--clang/include/clang/AST/Decl.h8
-rw-r--r--clang/include/clang/AST/DeclTemplate.h1
-rw-r--r--clang/include/clang/AST/RawCommentList.h15
-rw-r--r--clang/include/clang/AST/RecursiveASTVisitor.h22
-rw-r--r--clang/include/clang/AST/StmtObjC.h3
-rw-r--r--clang/include/clang/AST/StmtOpenACC.h142
-rw-r--r--clang/include/clang/AST/StmtOpenMP.h3
-rw-r--r--clang/include/clang/AST/StmtVisitor.h3
-rw-r--r--clang/include/clang/AST/TextNodeDumper.h1
-rw-r--r--clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h45
-rw-r--r--clang/include/clang/Analysis/CFG.h3
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h15
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h8
-rw-r--r--clang/include/clang/Basic/Attr.td4
-rw-r--r--clang/include/clang/Basic/AttrDocs.td27
-rw-r--r--clang/include/clang/Basic/Builtins.td6
-rw-r--r--clang/include/clang/Basic/BuiltinsBase.td4
-rw-r--r--clang/include/clang/Basic/BuiltinsNVPTX.def1
-rw-r--r--clang/include/clang/Basic/BuiltinsRISCV.def93
-rw-r--r--clang/include/clang/Basic/BuiltinsRISCV.td148
-rw-r--r--clang/include/clang/Basic/CMakeLists.txt4
-rw-r--r--clang/include/clang/Basic/DiagnosticGroups.td6
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td6
-rw-r--r--clang/include/clang/Basic/IdentifierTable.h125
-rw-r--r--clang/include/clang/Basic/LangOptions.h48
-rw-r--r--clang/include/clang/Basic/Module.h6
-rw-r--r--clang/include/clang/Basic/OpenACCKinds.h31
-rw-r--r--clang/include/clang/Basic/StmtNodes.td6
-rw-r--r--clang/include/clang/Basic/TargetBuiltins.h2
-rw-r--r--clang/include/clang/Basic/Visibility.h3
-rw-r--r--clang/include/clang/CodeGen/CGFunctionInfo.h11
-rw-r--r--clang/include/clang/Driver/Driver.h9
-rw-r--r--clang/include/clang/Driver/Options.td30
-rw-r--r--clang/include/clang/Format/Format.h19
-rw-r--r--clang/include/clang/Lex/PPCallbacks.h16
-rw-r--r--clang/include/clang/Lex/PreprocessingRecord.h3
-rw-r--r--clang/include/clang/Sema/AnalysisBasedWarnings.h4
-rw-r--r--clang/include/clang/Sema/CodeCompleteConsumer.h1
-rw-r--r--clang/include/clang/Sema/CodeCompleteOptions.h9
-rw-r--r--clang/include/clang/Sema/DeclSpec.h56
-rw-r--r--clang/include/clang/Sema/DelayedDiagnostic.h2
-rw-r--r--clang/include/clang/Sema/Overload.h13
-rw-r--r--clang/include/clang/Sema/ParsedAttr.h10
-rw-r--r--clang/include/clang/Sema/ScopeInfo.h41
-rw-r--r--clang/include/clang/Sema/Sema.h165
-rw-r--r--clang/include/clang/Sema/TemplateDeduction.h5
-rw-r--r--clang/include/clang/Serialization/ASTBitCodes.h3
-rw-r--r--clang/include/clang/Serialization/ASTReader.h1
-rw-r--r--clang/include/clang/Support/RISCVVIntrinsicUtils.h2
-rw-r--r--clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h3
-rw-r--r--clang/include/clang/Tooling/Inclusions/IncludeStyle.h2
-rw-r--r--clang/include/module.modulemap2
-rw-r--r--clang/lib/AST/ASTStructuralEquivalence.cpp1
-rw-r--r--clang/lib/AST/CMakeLists.txt1
-rw-r--r--clang/lib/AST/Decl.cpp13
-rw-r--r--clang/lib/AST/DeclPrinter.cpp4
-rw-r--r--clang/lib/AST/Expr.cpp19
-rw-r--r--clang/lib/AST/ExprConstant.cpp7
-rw-r--r--clang/lib/AST/Interp/ByteCodeEmitter.cpp30
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.cpp130
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.h5
-rw-r--r--clang/lib/AST/Interp/ByteCodeStmtGen.cpp4
-rw-r--r--clang/lib/AST/Interp/Descriptor.h5
-rw-r--r--clang/lib/AST/Interp/EvalEmitter.cpp4
-rw-r--r--clang/lib/AST/Interp/Interp.h37
-rw-r--r--clang/lib/AST/Interp/Pointer.cpp101
-rw-r--r--clang/lib/AST/Interp/Program.cpp2
-rw-r--r--clang/lib/AST/Interp/Program.h2
-rw-r--r--clang/lib/AST/Linkage.h3
-rw-r--r--clang/lib/AST/RecordLayoutBuilder.cpp7
-rw-r--r--clang/lib/AST/Stmt.cpp1
-rw-r--r--clang/lib/AST/StmtOpenACC.cpp33
-rw-r--r--clang/lib/AST/StmtPrinter.cpp9
-rw-r--r--clang/lib/AST/StmtProfile.cpp7
-rw-r--r--clang/lib/AST/TextNodeDumper.cpp5
-rw-r--r--clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp74
-rw-r--r--clang/lib/Analysis/FlowSensitive/Transfer.cpp18
-rw-r--r--clang/lib/Analysis/UnsafeBufferUsage.cpp362
-rw-r--r--clang/lib/Basic/DiagnosticIDs.cpp9
-rw-r--r--clang/lib/Basic/Module.cpp4
-rw-r--r--clang/lib/Basic/Targets/AArch64.cpp16
-rw-r--r--clang/lib/Basic/Targets/AMDGPU.cpp21
-rw-r--r--clang/lib/Basic/Targets/ARM.cpp14
-rw-r--r--clang/lib/Basic/Targets/ARM.h18
-rw-r--r--clang/lib/Basic/Targets/RISCV.cpp2
-rw-r--r--clang/lib/Basic/Targets/SPIR.h1
-rw-r--r--clang/lib/CodeGen/BackendUtil.cpp18
-rw-r--r--clang/lib/CodeGen/CGAtomic.cpp321
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp6
-rw-r--r--clang/lib/CodeGen/CGCUDARuntime.h5
-rw-r--r--clang/lib/CodeGen/CGCall.cpp50
-rw-r--r--clang/lib/CodeGen/CGCall.h3
-rw-r--r--clang/lib/CodeGen/CGCleanup.h13
-rw-r--r--clang/lib/CodeGen/CGExprCXX.cpp1
-rw-r--r--clang/lib/CodeGen/CGExprScalar.cpp55
-rw-r--r--clang/lib/CodeGen/CGObjC.cpp4
-rw-r--r--clang/lib/CodeGen/CGRecordLayout.h1
-rw-r--r--clang/lib/CodeGen/CGStmt.cpp3
-rw-r--r--clang/lib/CodeGen/CodeGenFunction.h15
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp16
-rw-r--r--clang/lib/CodeGen/CoverageMappingGen.cpp104
-rw-r--r--clang/lib/CodeGen/MacroPPCallbacks.cpp4
-rw-r--r--clang/lib/CodeGen/MacroPPCallbacks.h3
-rw-r--r--clang/lib/Driver/ToolChains/Arch/Sparc.cpp81
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp40
-rw-r--r--clang/lib/Driver/ToolChains/Darwin.cpp6
-rw-r--r--clang/lib/Driver/ToolChains/Darwin.h2
-rw-r--r--clang/lib/Driver/ToolChains/Flang.cpp15
-rw-r--r--clang/lib/Driver/ToolChains/Flang.h7
-rw-r--r--clang/lib/Format/ContinuationIndenter.cpp5
-rw-r--r--clang/lib/Format/Format.cpp17
-rw-r--r--clang/lib/Format/FormatToken.h10
-rw-r--r--clang/lib/Format/FormatTokenLexer.cpp14
-rw-r--r--clang/lib/Format/FormatTokenLexer.h6
-rw-r--r--clang/lib/Format/IntegerLiteralSeparatorFixer.cpp2
-rw-r--r--clang/lib/Format/TokenAnalyzer.cpp6
-rw-r--r--clang/lib/Format/TokenAnalyzer.h2
-rw-r--r--clang/lib/Format/TokenAnnotator.cpp313
-rw-r--r--clang/lib/Format/UnwrappedLineParser.cpp15
-rw-r--r--clang/lib/Frontend/DependencyFile.cpp3
-rw-r--r--clang/lib/Frontend/DependencyGraph.cpp7
-rw-r--r--clang/lib/Frontend/ModuleDependencyCollector.cpp3
-rw-r--r--clang/lib/Frontend/PrecompiledPreamble.cpp3
-rw-r--r--clang/lib/Frontend/PrintPreprocessedOutput.cpp11
-rw-r--r--clang/lib/Frontend/Rewrite/InclusionRewriter.cpp10
-rw-r--r--clang/lib/Headers/ia32intrin.h72
-rw-r--r--clang/lib/Headers/stdatomic.h2
-rw-r--r--clang/lib/Lex/DependencyDirectivesScanner.cpp2
-rw-r--r--clang/lib/Lex/PPDirectives.cpp70
-rw-r--r--clang/lib/Lex/PreprocessingRecord.cpp11
-rw-r--r--clang/lib/Parse/ParseOpenACC.cpp2
-rw-r--r--clang/lib/Sema/AnalysisBasedWarnings.cpp16
-rw-r--r--clang/lib/Sema/Sema.cpp3
-rw-r--r--clang/lib/Sema/SemaChecking.cpp29
-rw-r--r--clang/lib/Sema/SemaDecl.cpp78
-rw-r--r--clang/lib/Sema/SemaDeclAttr.cpp22
-rw-r--r--clang/lib/Sema/SemaDeclCXX.cpp18
-rw-r--r--clang/lib/Sema/SemaExceptionSpec.cpp1
-rw-r--r--clang/lib/Sema/SemaExprCXX.cpp12
-rw-r--r--clang/lib/Sema/SemaLookup.cpp4
-rw-r--r--clang/lib/Sema/SemaOverload.cpp379
-rw-r--r--clang/lib/Sema/SemaStmt.cpp17
-rw-r--r--clang/lib/Sema/SemaTemplate.cpp26
-rw-r--r--clang/lib/Sema/SemaTemplateDeduction.cpp701
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiate.cpp11
-rw-r--r--clang/lib/Sema/TreeTransform.h23
-rw-r--r--clang/lib/Serialization/ASTReader.cpp8
-rw-r--r--clang/lib/Serialization/ASTReaderStmt.cpp23
-rw-r--r--clang/lib/Serialization/ASTWriter.cpp9
-rw-r--r--clang/lib/Serialization/ASTWriterStmt.cpp21
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp79
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp2
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp13
-rw-r--r--clang/lib/StaticAnalyzer/Core/ExprEngine.cpp1
-rw-r--r--clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp8
-rw-r--r--clang/test/AST/Interp/arrays.cpp263
-rw-r--r--clang/test/AST/Interp/builtin-functions.cpp103
-rw-r--r--clang/test/AST/Interp/c.c62
-rw-r--r--clang/test/AST/Interp/complex.c14
-rw-r--r--clang/test/AST/Interp/complex.cpp5
-rw-r--r--clang/test/AST/Interp/lambda.cpp13
-rw-r--r--clang/test/AST/Interp/literals.cpp19
-rw-r--r--clang/test/AST/Interp/records.cpp258
-rw-r--r--clang/test/AST/ast-print-method-decl.cpp3
-rw-r--r--clang/test/Analysis/Checkers/WebKit/assignment-to-refptr.cpp17
-rw-r--r--clang/test/Analysis/Checkers/WebKit/implicit-cast-to-base-class-with-deref-in-superclass.cpp30
-rw-r--r--clang/test/Analysis/Checkers/WebKit/mock-types.h11
-rw-r--r--clang/test/Analysis/Checkers/WebKit/ref-countable-default-arg-nullptr.cpp25
-rw-r--r--clang/test/Analysis/Checkers/WebKit/ref-ptr-accessor.cpp12
-rw-r--r--clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h15
-rw-r--r--clang/test/Analysis/bitwise-shift-common.c2
-rw-r--r--clang/test/Analysis/std-c-library-functions-POSIX.c16
-rw-r--r--clang/test/Analysis/std-c-library-functions.c4
-rw-r--r--clang/test/Analysis/stream-error.c26
-rw-r--r--clang/test/Analysis/stream-noopen.c120
-rw-r--r--clang/test/Analysis/stream.c25
-rw-r--r--clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp24
-rw-r--r--clang/test/CXX/drs/dr11xx.cpp2
-rw-r--r--clang/test/CXX/drs/dr124.cpp51
-rw-r--r--clang/test/CXX/drs/dr14xx.cpp24
-rw-r--r--clang/test/CXX/drs/dr15xx.cpp39
-rw-r--r--clang/test/CXX/drs/dr17xx.cpp17
-rw-r--r--clang/test/CXX/drs/dr185.cpp30
-rw-r--r--clang/test/CXX/drs/dr193.cpp46
-rw-r--r--clang/test/CXX/drs/dr199.cpp33
-rw-r--r--clang/test/CXX/drs/dr1xx.cpp9
-rw-r--r--clang/test/CXX/drs/dr201.cpp42
-rw-r--r--clang/test/CXX/drs/dr210.cpp41
-rw-r--r--clang/test/CXX/drs/dr22xx.cpp41
-rw-r--r--clang/test/CXX/drs/dr23xx.cpp25
-rw-r--r--clang/test/CXX/drs/dr2504.cpp37
-rw-r--r--clang/test/CXX/drs/dr25xx.cpp2
-rw-r--r--clang/test/CXX/drs/dr292.cpp30
-rw-r--r--clang/test/CXX/drs/dr2xx.cpp6
-rw-r--r--clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp10
-rw-r--r--clang/test/CodeGen/LoongArch/atomics.c16
-rw-r--r--clang/test/CodeGen/PowerPC/quadword-atomics.c50
-rw-r--r--clang/test/CodeGen/RISCV/riscv-atomics.c68
-rw-r--r--clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c119
-rw-r--r--clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c2
-rw-r--r--clang/test/CodeGen/aarch64-ABI-align-packed.c2
-rw-r--r--clang/test/CodeGen/aarch64-fix-cortex-a53-835769.c6
-rw-r--r--clang/test/CodeGen/aarch64-ls64-inline-asm.c2
-rw-r--r--clang/test/CodeGen/aarch64-ls64.c4
-rw-r--r--clang/test/CodeGen/aarch64-matmul.cpp2
-rw-r--r--clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c2
-rw-r--r--clang/test/CodeGen/aarch64-targetattr-arch.c6
-rw-r--r--clang/test/CodeGen/aarch64-targetattr-crypto.c2
-rw-r--r--clang/test/CodeGen/aarch64-targetattr.c2
-rw-r--r--clang/test/CodeGen/aarch64-tme.cpp4
-rw-r--r--clang/test/CodeGen/arm-atomics-m.c8
-rw-r--r--clang/test/CodeGen/arm-atomics-m0.c16
-rw-r--r--clang/test/CodeGen/arm64-mte.c4
-rw-r--r--clang/test/CodeGen/atomic-ops-libcall.c310
-rw-r--r--clang/test/CodeGen/atomic-ops.c27
-rw-r--r--clang/test/CodeGen/atomics-inlining.c38
-rw-r--r--clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c36
-rw-r--r--clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c26
-rw-r--r--clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c14
-rw-r--r--clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c21
-rw-r--r--clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c26
-rw-r--r--clang/test/CodeGen/attr-target-version.c129
-rw-r--r--clang/test/CodeGen/builtins-nvptx.c8
-rw-r--r--clang/test/CodeGen/builtins.c6
-rw-r--r--clang/test/CodeGen/c11atomics.c30
-rw-r--r--clang/test/CodeGen/fp128_complex.c2
-rw-r--r--clang/test/CodeGen/fp16-ops-strictfp.c4
-rw-r--r--clang/test/CodeGen/fp16-ops.c4
-rw-r--r--clang/test/CodeGenCXX/atomic-inline.cpp6
-rw-r--r--clang/test/CodeGenCXX/attr-target-version.cpp87
-rw-r--r--clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp2
-rw-r--r--clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl4
-rw-r--r--clang/test/CodeGenHLSL/shift-mask.hlsl35
-rw-r--r--clang/test/CodeGenOpenCL/atomic-ops-libcall.cl57
-rw-r--r--clang/test/CodeGenOpenCL/reflect.cl28
-rw-r--r--clang/test/Driver/aarch64-cssc.c1
-rw-r--r--clang/test/Driver/aarch64-fix-cortex-a53-835769.c6
-rw-r--r--clang/test/Driver/aarch64-mgeneral_regs_only.c2
-rw-r--r--clang/test/Driver/aarch64-pauth-lr.c20
-rw-r--r--clang/test/Driver/aarch64-target-as-march.s24
-rw-r--r--clang/test/Driver/amdgpu-macros.cl5
-rw-r--r--clang/test/Driver/amdgpu-mcpu.cl10
-rw-r--r--clang/test/Driver/arm-alignment.c16
-rw-r--r--clang/test/Driver/arm-no-neg-immediates.c4
-rw-r--r--clang/test/Driver/hlsl-lang-targets.hlsl3
-rw-r--r--clang/test/Driver/linker-wrapper.c16
-rw-r--r--clang/test/Driver/linux-ld.c4
-rw-r--r--clang/test/Driver/riscv-features.c6
-rw-r--r--clang/test/Driver/sparc-fixed-register.c181
-rw-r--r--clang/test/Driver/tls-dialect.c4
-rw-r--r--clang/test/Driver/xros-driver-requires-darwin-host.c13
-rw-r--r--clang/test/Format/dump-config-objc-stdin.m3
-rw-r--r--clang/test/Format/verbose.cpp10
-rw-r--r--clang/test/Frontend/embed-bitcode.ll2
-rw-r--r--clang/test/Frontend/gnu-mcount.c21
-rw-r--r--clang/test/Headers/__clang_hip_cmath.hip16
-rw-r--r--clang/test/Headers/__clang_hip_math.hip952
-rw-r--r--clang/test/Headers/arm-acle-header.c4
-rw-r--r--clang/test/Headers/xmmintrin-unsupported.c2
-rw-r--r--clang/test/Misc/target-invalid-cpu-note.c6
-rw-r--r--clang/test/OpenMP/for_loop_auto.cpp2
-rw-r--r--clang/test/Preprocessor/aarch64-target-features.c60
-rw-r--r--clang/test/Preprocessor/arm-target-features.c34
-rw-r--r--clang/test/Sema/aarch64-sme-func-attrs.c40
-rw-r--r--clang/test/Sema/aarch64-tme-errors.c2
-rw-r--r--clang/test/Sema/aarch64-tme-tcancel-errors.c4
-rw-r--r--clang/test/Sema/conversion-64-32.c6
-rw-r--r--clang/test/Sema/conversion-implicit-int-includes-64-to-32.c21
-rw-r--r--clang/test/Sema/ms_predefined_expr.cpp1
-rw-r--r--clang/test/SemaCXX/PR40395.cpp1
-rw-r--r--clang/test/SemaCXX/attr-target-mv-warn-unused.cpp16
-rw-r--r--clang/test/SemaCXX/compound-literal.cpp21
-rw-r--r--clang/test/SemaCXX/concept-crash-on-diagnostic.cpp1
-rw-r--r--clang/test/SemaCXX/conditional-expr.cpp2
-rw-r--r--clang/test/SemaCXX/crash-GH10518.cpp22
-rw-r--r--clang/test/SemaCXX/crash-GH49103-2.cpp13
-rw-r--r--clang/test/SemaCXX/crash-GH67914.cpp78
-rw-r--r--clang/test/SemaCXX/crash-GH78388.cpp17
-rw-r--r--clang/test/SemaCXX/expression-traits.cpp1
-rw-r--r--clang/test/SemaCXX/ms-uuid.cpp2
-rw-r--r--clang/test/SemaCXX/self-comparison.cpp1
-rw-r--r--clang/test/SemaCXX/warn-overaligned-type-thrown.cpp2
-rw-r--r--clang/test/SemaCXX/warn-shadow-in-lambdas.cpp92
-rw-r--r--clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp24
-rw-r--r--clang/test/SemaCXX/warn-unsafe-buffer-usage-debug.cpp9
-rw-r--r--clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-array.cpp228
-rw-r--r--clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp5
-rw-r--r--clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp18
-rw-r--r--clang/test/SemaOpenCL/operators.cl2
-rw-r--r--clang/tools/clang-format/ClangFormat.cpp49
-rw-r--r--clang/tools/driver/cc1_main.cpp62
-rw-r--r--clang/tools/driver/cc1as_main.cpp17
-rw-r--r--clang/tools/libclang/CIndex.cpp2
-rw-r--r--clang/tools/libclang/CXCursor.cpp3
-rw-r--r--clang/tools/libclang/Indexing.cpp5
-rw-r--r--clang/tools/scan-build/man/scan-build.14
-rw-r--r--clang/unittests/AST/ASTExprTest.cpp68
-rw-r--r--clang/unittests/Analysis/FlowSensitive/TransferTest.cpp38
-rw-r--r--clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp16
-rw-r--r--clang/unittests/Basic/SourceManagerTest.cpp1
-rw-r--r--clang/unittests/Format/ConfigParseTest.cpp47
-rw-r--r--clang/unittests/Format/FormatTest.cpp13
-rw-r--r--clang/unittests/Format/QualifierFixerTest.cpp3
-rw-r--r--clang/unittests/Format/TokenAnnotatorTest.cpp45
-rw-r--r--clang/unittests/Lex/DependencyDirectivesScannerTest.cpp2
-rw-r--r--clang/unittests/Lex/PPCallbacksTest.cpp9
-rw-r--r--clang/utils/TableGen/ClangBuiltinsEmitter.cpp2
-rw-r--r--clang/utils/TableGen/RISCVVEmitter.cpp2
-rw-r--r--clang/www/analyzer/alpha_checks.html23
-rwxr-xr-xclang/www/cxx_dr_status.html30
-rw-r--r--compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake3
-rw-r--r--compiler-rt/lib/hwasan/hwasan_report.cpp2
-rw-r--r--compiler-rt/lib/profile/InstrProfilingFile.c3
-rw-r--r--flang/docs/FortranLLVMTestSuite.md15
-rw-r--r--flang/docs/GettingInvolved.md12
-rw-r--r--flang/docs/index.md1
-rw-r--r--flang/include/flang/Evaluate/characteristics.h1
-rw-r--r--flang/include/flang/Lower/ConvertVariable.h6
-rw-r--r--flang/include/flang/Lower/OpenMP.h2
-rw-r--r--flang/include/flang/Optimizer/Builder/FIRBuilder.h6
-rw-r--r--flang/include/flang/Optimizer/Builder/HLFIRTools.h10
-rw-r--r--flang/include/flang/Optimizer/Dialect/FIRAttr.td70
-rw-r--r--flang/include/flang/Optimizer/Dialect/FIROps.td19
-rw-r--r--flang/include/flang/Optimizer/Dialect/FIROpsSupport.h13
-rw-r--r--flang/include/flang/Optimizer/HLFIR/HLFIROps.td6
-rw-r--r--flang/include/flang/Optimizer/Support/Utils.h184
-rw-r--r--flang/include/flang/Optimizer/Transforms/Utils.h38
-rw-r--r--flang/lib/Evaluate/characteristics.cpp28
-rw-r--r--flang/lib/Frontend/CompilerInvocation.cpp15
-rw-r--r--flang/lib/Lower/Bridge.cpp9
-rw-r--r--flang/lib/Lower/CallInterface.cpp86
-rw-r--r--flang/lib/Lower/ConvertCall.cpp9
-rw-r--r--flang/lib/Lower/ConvertVariable.cpp34
-rw-r--r--flang/lib/Lower/DirectivesCommon.h93
-rw-r--r--flang/lib/Lower/OpenACC.cpp20
-rw-r--r--flang/lib/Lower/OpenMP.cpp698
-rw-r--r--flang/lib/Optimizer/Analysis/AliasAnalysis.cpp6
-rw-r--r--flang/lib/Optimizer/Builder/FIRBuilder.cpp23
-rw-r--r--flang/lib/Optimizer/Builder/HLFIRTools.cpp5
-rw-r--r--flang/lib/Optimizer/Dialect/FIRAttr.cpp4
-rw-r--r--flang/lib/Optimizer/Dialect/FIROps.cpp41
-rw-r--r--flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp5
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt1
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp6
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp4
-rw-r--r--flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp9
-rw-r--r--flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp131
-rw-r--r--flang/lib/Semantics/canonicalize-omp.cpp16
-rw-r--r--flang/lib/Semantics/runtime-type-info.cpp19
-rw-r--r--flang/module/iso_c_binding.f904
-rw-r--r--flang/test/Driver/driver-help-hidden.f902
-rw-r--r--flang/test/Driver/driver-help.f904
-rw-r--r--flang/test/Driver/masm.f9010
-rw-r--r--flang/test/Driver/predefined-macros-powerpc.f9011
-rw-r--r--flang/test/Fir/fir-ops.fir7
-rw-r--r--flang/test/HLFIR/maxloc-elemental.fir1
-rw-r--r--flang/test/HLFIR/minloc-elemental.fir1
-rw-r--r--flang/test/Lower/CUDA/cuda-data-attribute.cuf64
-rw-r--r--flang/test/Lower/CUDA/cuda-proc-attribute.cuf43
-rw-r--r--flang/test/Lower/OpenACC/acc-bounds.f9038
-rw-r--r--flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f9026
-rw-r--r--flang/test/Lower/OpenMP/parallel-reduction-add.f9026
-rw-r--r--flang/test/Lower/OpenMP/parallel-reduction.f9038
-rw-r--r--flang/test/Lower/target-features-amdgcn.f9023
-rw-r--r--flang/test/Lower/target-features-x86_64.f9016
-rw-r--r--flang/test/Semantics/OpenMP/loop-association.f908
-rw-r--r--flang/test/Semantics/typeinfo09.f9020
-rw-r--r--flang/test/Semantics/typeinfo10.f9014
-rw-r--r--flang/tools/bbc/bbc.cpp28
-rw-r--r--flang/unittests/Optimizer/FortranVariableTest.cpp12
-rw-r--r--libc/cmake/modules/LLVMLibCObjectRules.cmake8
-rw-r--r--libc/config/linux/aarch64/entrypoints.txt2
-rw-r--r--libc/config/linux/riscv/entrypoints.txt2
-rw-r--r--libc/config/linux/x86_64/entrypoints.txt7
-rw-r--r--libc/docs/math/index.rst4
-rw-r--r--libc/include/llvm-libc-macros/stdbit-macros.h22
-rw-r--r--libc/include/math.h.def2
-rw-r--r--libc/spec/spec.td4
-rw-r--r--libc/spec/stdc.td30
-rw-r--r--libc/src/__support/CPP/bit.h30
-rw-r--r--libc/src/__support/FPUtil/BasicOperations.h4
-rw-r--r--libc/src/__support/GPU/amdgpu/utils.h19
-rw-r--r--libc/src/__support/GPU/generic/utils.h4
-rw-r--r--libc/src/__support/GPU/nvptx/utils.h21
-rw-r--r--libc/src/__support/RPC/rpc.h152
-rw-r--r--libc/src/__support/RPC/rpc_util.h5
-rw-r--r--libc/src/math/CMakeLists.txt2
-rw-r--r--libc/src/math/fdimf128.h20
-rw-r--r--libc/src/math/frexpf128.h20
-rw-r--r--libc/src/math/generic/CMakeLists.txt55
-rw-r--r--libc/src/math/generic/acoshf.cpp6
-rw-r--r--libc/src/math/generic/asinhf.cpp4
-rw-r--r--libc/src/math/generic/fdimf128.cpp19
-rw-r--r--libc/src/math/generic/frexpf128.cpp19
-rw-r--r--libc/src/stdbit/CMakeLists.txt1
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_uc.cpp21
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_uc.h18
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_ui.cpp20
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_ui.h18
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_ul.cpp21
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_ul.h18
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_ull.cpp21
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_ull.h18
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_us.cpp21
-rw-r--r--libc/src/stdbit/stdc_first_leading_zero_us.h18
-rw-r--r--libc/src/time/gpu/nanosleep.cpp20
-rw-r--r--libc/src/time/gpu/time_utils.h21
-rw-r--r--libc/test/include/stdbit_test.cpp15
-rw-r--r--libc/test/src/__support/CPP/bit_test.cpp7
-rw-r--r--libc/test/src/__support/FPUtil/fpbits_test.cpp2
-rw-r--r--libc/test/src/__support/RPC/rpc_smoke_test.cpp10
-rw-r--r--libc/test/src/math/smoke/CMakeLists.txt35
-rw-r--r--libc/test/src/math/smoke/FDimTest.h21
-rw-r--r--libc/test/src/math/smoke/FrexpTest.h58
-rw-r--r--libc/test/src/math/smoke/fdim_test.cpp22
-rw-r--r--libc/test/src/math/smoke/fdimf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/fdimf_test.cpp24
-rw-r--r--libc/test/src/math/smoke/fdiml_test.cpp24
-rw-r--r--libc/test/src/math/smoke/frexp_test.cpp2
-rw-r--r--libc/test/src/math/smoke/frexpf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/frexpf_test.cpp2
-rw-r--r--libc/test/src/math/smoke/frexpl_test.cpp2
-rw-r--r--libc/test/src/stdbit/CMakeLists.txt1
-rw-r--r--libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_first_leading_zero_ui_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_first_leading_zero_ul_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_first_leading_zero_ull_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp21
-rw-r--r--libc/test/src/sys/socket/linux/bind_test.cpp1
-rw-r--r--libc/utils/HdrGen/PublicAPICommand.cpp15
-rw-r--r--libc/utils/gpu/server/rpc_server.cpp56
-rw-r--r--libcxx/docs/FeatureTestMacroTable.rst10
-rw-r--r--libcxx/docs/Modules.rst4
-rw-r--r--libcxx/docs/Status/Cxx20Issues.csv2
-rw-r--r--libcxx/include/CMakeLists.txt1
-rw-r--r--libcxx/include/__chrono/duration.h8
-rw-r--r--libcxx/include/__compare/strong_order.h23
-rw-r--r--libcxx/include/__compare/weak_order.h12
-rw-r--r--libcxx/include/__fwd/array.h14
-rw-r--r--libcxx/include/__fwd/get.h99
-rw-r--r--libcxx/include/__fwd/pair.h20
-rw-r--r--libcxx/include/__fwd/subrange.h15
-rw-r--r--libcxx/include/__fwd/tuple.h23
-rw-r--r--libcxx/include/__locale_dir/locale_base_api/ibm.h2
-rw-r--r--libcxx/include/__memory/compressed_pair.h1
-rw-r--r--libcxx/include/__ranges/elements_view.h1
-rw-r--r--libcxx/include/__ranges/subrange.h1
-rw-r--r--libcxx/include/__tuple/tuple_like.h1
-rw-r--r--libcxx/include/__utility/pair.h1
-rw-r--r--libcxx/include/chrono2
-rw-r--r--libcxx/include/compare1
-rw-r--r--libcxx/include/istream1
-rw-r--r--libcxx/include/limits283
-rw-r--r--libcxx/include/module.modulemap.in1
-rw-r--r--libcxx/include/ostream36
-rw-r--r--libcxx/include/print14
-rw-r--r--libcxx/include/ratio42
-rw-r--r--libcxx/include/scoped_allocator4
-rw-r--r--libcxx/include/shared_mutex6
-rw-r--r--libcxx/include/sstream16
-rw-r--r--libcxx/include/streambuf1
-rw-r--r--libcxx/include/string16
-rw-r--r--libcxx/include/tuple1
-rw-r--r--libcxx/include/valarray4
-rw-r--r--libcxx/include/vector10
-rw-r--r--libcxx/lib/abi/CHANGELOG.TXT8
-rw-r--r--libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist1
-rw-r--r--libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist1
-rw-r--r--libcxx/modules/CMakeLists.txt20
-rw-r--r--libcxx/modules/CMakeLists.txt.in88
-rw-r--r--libcxx/modules/std.compat.cppm.in75
-rw-r--r--libcxx/modules/std.cppm.in75
-rw-r--r--libcxx/modules/std/ostream.inc4
-rw-r--r--libcxx/src/print.cpp25
-rw-r--r--libcxx/test/configs/apple-libc++-backdeployment.cfg.in4
-rw-r--r--libcxx/test/configs/apple-libc++-shared.cfg.in6
-rw-r--r--libcxx/test/configs/armv7m-picolibc-libc++.cfg.in4
-rw-r--r--libcxx/test/configs/cmake-bridge.cfg.in12
-rw-r--r--libcxx/test/configs/ibm-libc++-shared.cfg.in6
-rw-r--r--libcxx/test/configs/llvm-libc++-android-ndk.cfg.in4
-rw-r--r--libcxx/test/configs/llvm-libc++-mingw.cfg.in6
-rw-r--r--libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in6
-rw-r--r--libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in4
-rw-r--r--libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in6
-rw-r--r--libcxx/test/configs/llvm-libc++-shared.cfg.in4
-rw-r--r--libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in4
-rw-r--r--libcxx/test/configs/llvm-libc++-static.cfg.in4
-rw-r--r--libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py2
-rw-r--r--libcxx/test/libcxx/clang_modules_include.gen.py2
-rw-r--r--libcxx/test/libcxx/clang_tidy.gen.py6
-rw-r--r--libcxx/test/libcxx/double_include.gen.py2
-rw-r--r--libcxx/test/libcxx/header_inclusions.gen.py2
-rw-r--r--libcxx/test/libcxx/headers_in_modulemap.sh.py2
-rw-r--r--libcxx/test/libcxx/libcpp_version.gen.py2
-rw-r--r--libcxx/test/libcxx/module_std.gen.py6
-rw-r--r--libcxx/test/libcxx/module_std_compat.gen.py6
-rw-r--r--libcxx/test/libcxx/no_assert_include.gen.py2
-rw-r--r--libcxx/test/libcxx/system_reserved_names.gen.py2
-rw-r--r--libcxx/test/libcxx/transitive_includes.gen.py8
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx03.csv1
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx11.csv1
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx14.csv1
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx17.csv1
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx20.csv1
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx23.csv2
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx26.csv2
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp2
-rw-r--r--libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp18
-rw-r--r--libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp4
-rw-r--r--libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp4
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp2
-rw-r--r--libcxx/test/std/input.output/string.streams/istringstream/types.compile.pass.cpp75
-rw-r--r--libcxx/test/std/input.output/string.streams/istringstream/types.pass.cpp39
-rw-r--r--libcxx/test/std/input.output/string.streams/ostringstream/types.compile.pass.cpp75
-rw-r--r--libcxx/test/std/input.output/string.streams/ostringstream/types.pass.cpp39
-rw-r--r--libcxx/test/std/input.output/string.streams/stringbuf/types.compile.pass.cpp70
-rw-r--r--libcxx/test/std/input.output/string.streams/stringbuf/types.pass.cpp39
-rw-r--r--libcxx/test/std/input.output/string.streams/stringstream/types.compile.pass.cpp72
-rw-r--r--libcxx/test/std/input.output/string.streams/stringstream/types.pass.cpp39
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp4
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp4
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp4
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp4
-rw-r--r--libcxx/test/std/time/rep.h23
-rw-r--r--libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp15
-rw-r--r--libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp15
-rw-r--r--libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp30
-rw-r--r--libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp4
-rw-r--r--libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp4
-rw-r--r--libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp56
-rw-r--r--libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp81
-rw-r--r--libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp69
-rw-r--r--libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp32
-rwxr-xr-xlibcxx/utils/generate_feature_test_macro_components.py2
-rw-r--r--libcxx/utils/generate_libcxx_cppm_in.py23
-rw-r--r--libcxx/utils/libcxx/test/features.py103
-rw-r--r--libcxx/utils/libcxx/test/format.py4
-rw-r--r--lld/COFF/Driver.cpp10
-rw-r--r--lld/ELF/Arch/SystemZ.cpp607
-rw-r--r--lld/ELF/CMakeLists.txt1
-rw-r--r--lld/ELF/Driver.cpp3
-rw-r--r--lld/ELF/InputFiles.cpp2
-rw-r--r--lld/ELF/InputSection.cpp7
-rw-r--r--lld/ELF/Relocations.cpp25
-rw-r--r--lld/ELF/Relocations.h3
-rw-r--r--lld/ELF/ScriptParser.cpp1
-rw-r--r--lld/ELF/SyntheticSections.cpp3
-rw-r--r--lld/ELF/Target.cpp5
-rw-r--r--lld/ELF/Target.h1
-rw-r--r--lld/ELF/Writer.cpp39
-rw-r--r--lld/MinGW/Driver.cpp4
-rw-r--r--lld/MinGW/Options.td5
-rw-r--r--lld/test/COFF/lto-cache-errors.ll2
-rw-r--r--lld/test/COFF/thinlto-emit-imports.ll2
-rw-r--r--lld/test/COFF/timestamp.test27
-rw-r--r--lld/test/ELF/Inputs/systemz-init.s5
-rw-r--r--lld/test/ELF/basic-systemz.s63
-rw-r--r--lld/test/ELF/emulation-systemz.s29
-rw-r--r--lld/test/ELF/lto/resolution-err.ll2
-rw-r--r--lld/test/ELF/lto/systemz.ll18
-rw-r--r--lld/test/ELF/lto/thinlto-cant-write-index.ll2
-rw-r--r--lld/test/ELF/lto/thinlto-emit-imports.ll2
-rw-r--r--lld/test/ELF/riscv-section-layout.s25
-rw-r--r--lld/test/ELF/segments.s234
-rw-r--r--lld/test/ELF/systemz-got.s16
-rw-r--r--lld/test/ELF/systemz-gotent-relax-align.s48
-rw-r--r--lld/test/ELF/systemz-gotent-relax-und-dso.s68
-rw-r--r--lld/test/ELF/systemz-gotent-relax.s91
-rw-r--r--lld/test/ELF/systemz-ifunc-nonpreemptible.s75
-rw-r--r--lld/test/ELF/systemz-init-padding.s27
-rw-r--r--lld/test/ELF/systemz-pie.s38
-rw-r--r--lld/test/ELF/systemz-plt.s83
-rw-r--r--lld/test/ELF/systemz-reloc-abs.s32
-rw-r--r--lld/test/ELF/systemz-reloc-disp12.s21
-rw-r--r--lld/test/ELF/systemz-reloc-disp20.s21
-rw-r--r--lld/test/ELF/systemz-reloc-got.s92
-rw-r--r--lld/test/ELF/systemz-reloc-gotrel.s36
-rw-r--r--lld/test/ELF/systemz-reloc-pc16.s39
-rw-r--r--lld/test/ELF/systemz-reloc-pc32.s39
-rw-r--r--lld/test/ELF/systemz-reloc-pcdbl.s68
-rw-r--r--lld/test/ELF/systemz-tls-gd.s142
-rw-r--r--lld/test/ELF/systemz-tls-ie.s87
-rw-r--r--lld/test/ELF/systemz-tls-ld.s114
-rw-r--r--lld/test/ELF/systemz-tls-le.s61
-rw-r--r--lld/test/ELF/unsupported-emachine.test11
-rw-r--r--lld/test/ELF/x86-64-section-layout.s54
-rw-r--r--lld/test/MachO/invalid/invalid-lto-object-path.ll2
-rw-r--r--lld/test/MachO/thinlto-emit-imports.ll2
-rw-r--r--lld/test/MinGW/driver.test7
-rw-r--r--lld/test/lit.cfg.py1
-rw-r--r--lldb/docs/resources/build.rst4
-rw-r--r--lldb/include/lldb/API/SBDebugger.h1
-rw-r--r--lldb/include/lldb/Core/Debugger.h1
-rw-r--r--lldb/include/lldb/Core/ModuleList.h23
-rw-r--r--lldb/include/lldb/DataFormatters/FormatManager.h2
-rw-r--r--lldb/include/lldb/DataFormatters/TypeSynthetic.h27
-rw-r--r--lldb/include/lldb/DataFormatters/ValueObjectPrinter.h41
-rw-r--r--lldb/include/lldb/DataFormatters/VectorIterator.h2
-rw-r--r--lldb/include/lldb/lldb-enumerations.h15
-rw-r--r--lldb/source/Commands/CommandObjectFrame.cpp5
-rw-r--r--lldb/source/Core/CoreProperties.td7
-rw-r--r--lldb/source/Core/FormatEntity.cpp26
-rw-r--r--lldb/source/Core/ModuleList.cpp13
-rw-r--r--lldb/source/Core/ValueObject.cpp2
-rw-r--r--lldb/source/Core/ValueObjectSyntheticFilter.cpp6
-rw-r--r--lldb/source/DataFormatters/FormatManager.cpp17
-rw-r--r--lldb/source/DataFormatters/TypeSummary.cpp5
-rw-r--r--lldb/source/DataFormatters/TypeSynthetic.cpp8
-rw-r--r--lldb/source/DataFormatters/ValueObjectPrinter.cpp222
-rw-r--r--lldb/source/DataFormatters/VectorType.cpp4
-rw-r--r--lldb/source/Interpreter/OptionArgParser.cpp3
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp4
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt1
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp9
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp16
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/Coroutines.h2
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp8
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp8
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp69
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxx.h12
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp7
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp10
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp32
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp9
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp8
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp11
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp9
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp8
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp20
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp145
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp12
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp28
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp44
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp8
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp8
-rw-r--r--lldb/source/Plugins/Language/ObjC/Cocoa.cpp4
-rw-r--r--lldb/source/Plugins/Language/ObjC/NSArray.cpp45
-rw-r--r--lldb/source/Plugins/Language/ObjC/NSDictionary.cpp83
-rw-r--r--lldb/source/Plugins/Language/ObjC/NSError.cpp12
-rw-r--r--lldb/source/Plugins/Language/ObjC/NSException.cpp9
-rw-r--r--lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp14
-rw-r--r--lldb/source/Plugins/Language/ObjC/NSSet.cpp46
-rw-r--r--lldb/source/Plugins/Platform/MacOSX/PlatformRemoteAppleXR.h5
-rw-r--r--lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp39
-rw-r--r--lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp2
-rw-r--r--lldb/source/Symbol/SymbolLocator.cpp22
-rw-r--r--lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py32
-rw-r--r--lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile5
-rw-r--r--lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py78
-rw-r--r--lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp17
-rw-r--r--lldb/test/API/macosx/universal/Makefile2
-rw-r--r--lldb/tools/lldb-dap/Breakpoint.cpp76
-rw-r--r--lldb/tools/lldb-dap/Breakpoint.h33
-rw-r--r--lldb/tools/lldb-dap/BreakpointBase.cpp299
-rw-r--r--lldb/tools/lldb-dap/BreakpointBase.h33
-rw-r--r--lldb/tools/lldb-dap/CMakeLists.txt1
-rw-r--r--lldb/tools/lldb-dap/FunctionBreakpoint.cpp12
-rw-r--r--lldb/tools/lldb-dap/FunctionBreakpoint.h4
-rw-r--r--lldb/tools/lldb-dap/JSONUtils.cpp46
-rw-r--r--lldb/tools/lldb-dap/JSONUtils.h5
-rw-r--r--lldb/tools/lldb-dap/SourceBreakpoint.cpp304
-rw-r--r--lldb/tools/lldb-dap/SourceBreakpoint.h30
-rw-r--r--lldb/tools/lldb-dap/lldb-dap.cpp17
-rw-r--r--llvm/CMakeLists.txt3
-rw-r--r--llvm/cmake/modules/HandleLLVMOptions.cmake4
-rw-r--r--llvm/docs/AMDGPUUsage.rst350
-rw-r--r--llvm/docs/CMake.rst4
-rw-r--r--llvm/docs/CodeReview.rst6
-rw-r--r--llvm/docs/CommandGuide/llvm-objcopy.rst9
-rw-r--r--llvm/docs/GettingStarted.rst86
-rw-r--r--llvm/docs/GitHub.rst6
-rw-r--r--llvm/docs/NVPTXUsage.rst4
-rw-r--r--llvm/docs/ReleaseNotes.rst3
-rw-r--r--llvm/docs/RemoveDIsDebugInfo.md2
-rw-r--r--llvm/include/llvm-c/Core.h10
-rw-r--r--llvm/include/llvm/ADT/BitVector.h2
-rw-r--r--llvm/include/llvm/ADT/ilist_iterator.h23
-rw-r--r--llvm/include/llvm/Analysis/InlineModelFeatureMaps.h2
-rw-r--r--llvm/include/llvm/Analysis/ValueTracking.h4
-rw-r--r--llvm/include/llvm/BinaryFormat/COFF.h5
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h7
-rw-r--r--llvm/include/llvm/CodeGen/AsmPrinter.h2
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h2
-rw-r--r--llvm/include/llvm/CodeGen/ISDOpcodes.h6
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAG.h8
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.td363
-rw-r--r--llvm/include/llvm/CodeGenTypes/LowLevelType.h14
-rw-r--r--llvm/include/llvm/DebugInfo/CodeView/CodeView.h19
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h6
-rw-r--r--llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h7
-rw-r--r--llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h3
-rw-r--r--llvm/include/llvm/DebugInfo/GSYM/ObjectFileTransformer.h7
-rw-r--r--llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h75
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h25
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h30
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h4
-rw-r--r--llvm/include/llvm/IR/Instructions.h7
-rw-r--r--llvm/include/llvm/IR/Intrinsics.td2
-rw-r--r--llvm/include/llvm/IR/IntrinsicsNVVM.td3
-rw-r--r--llvm/include/llvm/IR/IntrinsicsSPIRV.td4
-rw-r--r--llvm/include/llvm/IR/Metadata.h15
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h22
-rw-r--r--llvm/include/llvm/MC/MCRegisterInfo.h12
-rw-r--r--llvm/include/llvm/ObjCopy/CommonConfig.h7
-rw-r--r--llvm/include/llvm/Object/COFF.h41
-rw-r--r--llvm/include/llvm/Object/COFFImportFile.h49
-rw-r--r--llvm/include/llvm/Object/Wasm.h1
-rw-r--r--llvm/include/llvm/ObjectYAML/DWARFEmitter.h1
-rw-r--r--llvm/include/llvm/ObjectYAML/DWARFYAML.h49
-rw-r--r--llvm/include/llvm/ObjectYAML/XCOFFYAML.h7
-rw-r--r--llvm/include/llvm/Passes/CodeGenPassBuilder.h67
-rw-r--r--llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h96
-rw-r--r--llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h51
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h11
-rw-r--r--llvm/include/llvm/Support/AMDGPUMetadata.h2
-rw-r--r--llvm/include/llvm/Support/PGOOptions.h3
-rw-r--r--llvm/include/llvm/Support/TargetOpcodes.def3
-rw-r--r--llvm/include/llvm/Target/GenericOpcodes.td6
-rw-r--r--llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td2
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td3
-rw-r--r--llvm/include/llvm/TargetParser/AArch64TargetParser.h6
-rw-r--r--llvm/include/llvm/TargetParser/ARMTargetParser.h1
-rw-r--r--llvm/include/llvm/TargetParser/TargetParser.h10
-rw-r--r--llvm/include/llvm/TargetParser/Triple.h5
-rw-r--r--llvm/include/llvm/TextAPI/FileTypes.h49
-rw-r--r--llvm/include/llvm/TextAPI/InterfaceFile.h49
-rw-r--r--llvm/include/llvm/TextAPI/RecordsSlice.h3
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h29
-rw-r--r--llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h24
-rw-r--r--llvm/include/llvm/Transforms/Scalar/LoopReroll.h25
-rw-r--r--llvm/lib/Analysis/BasicAliasAnalysis.cpp43
-rw-r--r--llvm/lib/Analysis/DomConditionCache.cpp53
-rw-r--r--llvm/lib/Analysis/InlineCost.cpp21
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp43
-rw-r--r--llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp15
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp126
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp2
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp73
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp9
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp5
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp2
-rw-r--r--llvm/lib/CodeGen/IntrinsicLowering.cpp6
-rw-r--r--llvm/lib/CodeGen/MachineVerifier.cpp14
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp27
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp63
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FastISel.cpp10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp6
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp66
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp6
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp43
-rw-r--r--llvm/lib/CodeGen/TypePromotion.cpp2
-rw-r--r--llvm/lib/CodeGen/ValueTypes.cpp2
-rw-r--r--llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h6
-rw-r--r--llvm/lib/DebugInfo/CodeView/EnumTables.cpp26
-rw-r--r--llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp1
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp58
-rw-r--r--llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp290
-rw-r--r--llvm/lib/DebugInfo/GSYM/GsymCreator.cpp37
-rw-r--r--llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp15
-rw-r--r--llvm/lib/DebugInfo/PDB/PDBExtras.cpp4
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp68
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp53
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp50
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp10
-rw-r--r--llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp45
-rw-r--r--llvm/lib/IR/AsmWriter.cpp182
-rw-r--r--llvm/lib/IR/BasicBlock.cpp6
-rw-r--r--llvm/lib/IR/ConstantRange.cpp17
-rw-r--r--llvm/lib/IR/Core.cpp8
-rw-r--r--llvm/lib/IR/DebugInfo.cpp4
-rw-r--r--llvm/lib/IR/Function.cpp1
-rw-r--r--llvm/lib/IR/Operator.cpp9
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp12
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp4
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObject.cpp282
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObject.h134
-rw-r--r--llvm/lib/Object/COFFImportFile.cpp121
-rw-r--r--llvm/lib/Object/COFFModuleDefinition.cpp13
-rw-r--r--llvm/lib/Object/ELFObjectFile.cpp10
-rw-r--r--llvm/lib/Object/OffloadBinary.cpp4
-rw-r--r--llvm/lib/Object/WasmObjectFile.cpp103
-rw-r--r--llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp1
-rw-r--r--llvm/lib/ObjectYAML/DWARFEmitter.cpp189
-rw-r--r--llvm/lib/ObjectYAML/DWARFYAML.cpp29
-rw-r--r--llvm/lib/ObjectYAML/ELFYAML.cpp4
-rw-r--r--llvm/lib/ObjectYAML/XCOFFEmitter.cpp331
-rw-r--r--llvm/lib/ObjectYAML/XCOFFYAML.cpp16
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp3
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp19
-rw-r--r--llvm/lib/Passes/PassRegistry.def3
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMapping.cpp94
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp21
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp23
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp49
-rw-r--r--llvm/lib/Support/DeltaAlgorithm.cpp6
-rw-r--r--llvm/lib/Support/PGOOptions.cpp7
-rw-r--r--llvm/lib/TableGen/TGLexer.cpp8
-rw-r--r--llvm/lib/TableGen/TGLexer.h5
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td31
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp87
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td47
-rw-r--r--llvm/lib/Target/AArch64/AArch64MCInstLower.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td15
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td6
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp33
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp15
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp30
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp16
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp4
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h28
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td87
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp139
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp48
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp122
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNProcessors.td22
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp42
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h10
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrFormats.td17
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h13
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td23
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td171
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td14
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h14
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td14
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMLegalizerInfo.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp1
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.td4
-rw-r--r--llvm/lib/Target/DirectX/DXIL.td127
-rw-r--r--llvm/lib/Target/DirectX/DXILMetadata.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp64
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.h4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsInstructionSelector.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp55
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp1059
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td93
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td5
-rw-r--r--llvm/lib/Target/NVPTX/NVVMReflect.cpp37
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp88
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVFoldMasks.cpp13
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp28
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp102
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td24
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td24
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoF.td16
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoM.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td59
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td115
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td16
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td8
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td308
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp7
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp78
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td58
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp121
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCallLowering.h11
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h29
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td28
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp27
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp65
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp10
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td12
-rw-r--r--llvm/lib/Target/Sparc/Sparc.td14
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp43
-rw-r--r--llvm/lib/Target/Sparc/SparcRegisterInfo.cpp14
-rw-r--r--llvm/lib/Target/Sparc/SparcRegisterInfo.h1
-rw-r--r--llvm/lib/Target/Sparc/SparcRegisterInfo.td4
-rw-r--r--llvm/lib/Target/Sparc/SparcSubtarget.cpp1
-rw-r--r--llvm/lib/Target/Sparc/SparcSubtarget.h10
-rw-r--r--llvm/lib/Target/X86/X86FixupVectorConstants.cpp93
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp158
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp12
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h3
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp24
-rw-r--r--llvm/lib/Target/X86/X86SchedIceLake.td8
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/Xtensa/CMakeLists.txt9
-rw-r--r--llvm/lib/Target/Xtensa/Xtensa.h28
-rw-r--r--llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp70
-rw-r--r--llvm/lib/Target/Xtensa/XtensaAsmPrinter.h46
-rw-r--r--llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp39
-rw-r--r--llvm/lib/Target/Xtensa/XtensaFrameLowering.h32
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp79
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelLowering.cpp58
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelLowering.h41
-rw-r--r--llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp27
-rw-r--r--llvm/lib/Target/Xtensa/XtensaInstrInfo.h45
-rw-r--r--llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp69
-rw-r--r--llvm/lib/Target/Xtensa/XtensaRegisterInfo.h56
-rw-r--r--llvm/lib/Target/Xtensa/XtensaSubtarget.cpp44
-rw-r--r--llvm/lib/Target/Xtensa/XtensaSubtarget.h72
-rw-r--r--llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp41
-rw-r--r--llvm/lib/Target/Xtensa/XtensaTargetMachine.h7
-rw-r--r--llvm/lib/TargetParser/ARMTargetParser.cpp58
-rw-r--r--llvm/lib/TargetParser/Host.cpp1
-rw-r--r--llvm/lib/TargetParser/TargetParser.cpp46
-rw-r--r--llvm/lib/TargetParser/Triple.cpp2
-rw-r--r--llvm/lib/TextAPI/BinaryReader/DylibReader.cpp1
-rw-r--r--llvm/lib/TextAPI/InterfaceFile.cpp29
-rw-r--r--llvm/lib/TextAPI/RecordsSlice.cpp24
-rw-r--r--llvm/lib/TextAPI/TextAPIContext.h3
-rw-r--r--llvm/lib/TextAPI/TextStubCommon.h1
-rw-r--r--llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp31
-rw-r--r--llvm/lib/ToolDrivers/llvm-lib/Options.td1
-rw-r--r--llvm/lib/Transforms/IPO/LowerTypeTests.cpp28
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp105
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp18
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp60
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h9
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp12
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp136
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp18
-rw-r--r--llvm/lib/Transforms/Instrumentation/CMakeLists.txt1
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp13
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp61
-rw-r--r--llvm/lib/Transforms/Scalar/CMakeLists.txt2
-rw-r--r--llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp34
-rw-r--r--llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp190
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp69
-rw-r--r--llvm/lib/Transforms/Scalar/LoopRerollPass.cpp1679
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp221
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp66
-rw-r--r--llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp33
-rw-r--r--llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp41
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyIndVar.cpp21
-rw-r--r--llvm/lib/Transforms/Utils/ValueMapper.cpp1
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp308
-rw-r--r--llvm/test/Analysis/BasicAA/vscale.ll197
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/cast.ll10
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll4
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll48
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll8
-rw-r--r--llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-codesize.ll1338
-rw-r--r--llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll1338
-rw-r--r--llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-sizelatency.ll1338
-rw-r--r--llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll1338
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll2
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir24
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir3
-rw-r--r--llvm/test/CodeGen/AArch64/Redundantstore.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-addv.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-tbz.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-unroll-and-jam.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-build-vector.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-movi.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-popcnt.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-rev.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/asm-large-immediate.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/bf16-shuffle.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/bf16.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/bitreverse.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll65
-rw-r--r--llvm/test/CodeGen/AArch64/cmpwithshort.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/cond-br-tuning.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/consthoist-gep.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/cpus.ll1
-rw-r--r--llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll72
-rw-r--r--llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll76
-rw-r--r--llvm/test/CodeGen/AArch64/extbinopload.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/extract-sext-zext.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fabs.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/faddsub.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fcmp.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fcopysign.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fcvt.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fdiv.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fence-singlethread.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fexplog.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fminimummaximum.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fminmax.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fmla.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fmul.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fneg.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_2op.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_3op.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fpext.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fpow.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fpowi.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fptoi.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/fptrunc.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/frem.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/frintn.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fsincos.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/fsqrt.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/hints.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/icmp.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/insertextract.ll2256
-rw-r--r--llvm/test/CodeGen/AArch64/intrinsics-memory-barrier.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/itofp.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/kcfi-bti.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/merge-store-dependency.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/merge-store.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/min-max-combine.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/min-max.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/mul_pow2.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/neon-dot-product.ll1
-rw-r--r--llvm/test/CodeGen/AArch64/neon-extadd-extract.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/neon_rbit.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/nzcv-save.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/pacbti-module-attrs.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/postra-mi-sched.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/rbit.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/rcpc3-sve.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/rcpc3.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/rem_crash.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/remat.ll1
-rw-r--r--llvm/test/CodeGen/AArch64/rotate.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/setcc_knownbits.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/stack-probing-64k.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/stack-probing-sve.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/stack-probing.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fcopysign.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve2-fcopysign.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/tailmerging_in_mbp.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/tbz-tbnz.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/v3f-to-int.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/v8.5a-scalar-frint3264-intrinsic.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll28
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll222
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fadd.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll120
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fmul.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/xar.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll85
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll126
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll130
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir123
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll104
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/call-argument-types.ll106
-rw-r--r--llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/captured-frame-index.ll50
-rw-r--r--llvm/test/CodeGen/AMDGPU/cc-update.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/collapse-endcf.ll132
-rw-r--r--llvm/test/CodeGen/AMDGPU/commute-compares.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/extload-private.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll108
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch.ll570
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/gds-allocation.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/gds-atomic.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll248
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-saddr-load.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll31
-rw-r--r--llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i16.ll144
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i32.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll168
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix.ll421
-rw-r--r--llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll544
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll544
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll544
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll512
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory_clause.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll62
-rw-r--r--llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir20
-rw-r--r--llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs.ll15978
-rw-r--r--llvm/test/CodeGen/AMDGPU/readsteadycounter.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir136
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-simple.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spill.mir240
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-m0.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir54
-rw-r--r--llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-select.ll219
-rw-r--r--llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll68
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll1164
-rw-r--r--llvm/test/CodeGen/AMDGPU/wqm.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-reserved.ll88
-rw-r--r--llvm/test/CodeGen/ARM/GlobalISel/fpenv.ll14
-rw-r--r--llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll16
-rw-r--r--llvm/test/CodeGen/Hexagon/order-stack-object.ll42
-rw-r--r--llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll12
-rw-r--r--llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir212
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll4
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll2
-rw-r--r--llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir2
-rw-r--r--llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir297
-rw-r--r--llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir16
-rw-r--r--llvm/test/CodeGen/NVPTX/alias.ll52
-rw-r--r--llvm/test/CodeGen/NVPTX/extractelement.ll92
-rw-r--r--llvm/test/CodeGen/NVPTX/i8x4-instructions.ll366
-rw-r--r--llvm/test/CodeGen/NVPTX/intrinsics.ll32
-rw-r--r--llvm/test/CodeGen/NVPTX/nanosleep.ll2
-rw-r--r--llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll225
-rw-r--r--llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll1
-rw-r--r--llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/nvvm-reflect.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir33
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir36
-rw-r--r--llvm/test/CodeGen/RISCV/O3-pipeline.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/callee-saved-gprs.ll80
-rw-r--r--llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll208
-rw-r--r--llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll92
-rw-r--r--llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/push-pop-popret.ll71
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll226
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll36
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll215
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll74
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll509
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll110
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll448
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll513
-rw-r--r--llvm/test/CodeGen/RISCV/shadowcallstack.ll134
-rw-r--r--llvm/test/CodeGen/RISCV/signbit-test.ll30
-rw-r--r--llvm/test/CodeGen/RISCV/signed-truncation-check.ll104
-rw-r--r--llvm/test/CodeGen/RISCV/typepromotion-overflow.ll387
-rw-r--r--llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll53
-rw-r--r--llvm/test/CodeGen/RISCV/vararg.ll470
-rw-r--r--llvm/test/CodeGen/RISCV/zcmp-with-float.ll1
-rw-r--r--llvm/test/CodeGen/SPARC/reserved-arg-regs.ll25
-rw-r--r--llvm/test/CodeGen/SPARC/reserved-regs-named.ll13
-rw-r--r--llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll14
-rw-r--r--llvm/test/CodeGen/SPARC/reserved-regs.ll17
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll34
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll34
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll189
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll1703
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll1313
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/br.ll19
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/brcond.ll91
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/roundeven.ll2
-rw-r--r--llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll103
-rw-r--r--llvm/test/CodeGen/X86/ctpop-mask.ll8
-rw-r--r--llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll293
-rw-r--r--llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll469
-rw-r--r--llvm/test/CodeGen/X86/fp128-libcalls-strict.ll48
-rw-r--r--llvm/test/CodeGen/X86/fp128-libcalls.ll24
-rw-r--r--llvm/test/CodeGen/X86/frem.ll2
-rw-r--r--llvm/test/CodeGen/X86/isel-br.ll31
-rw-r--r--llvm/test/CodeGen/X86/isel-brcond-fcmp.ll1341
-rw-r--r--llvm/test/CodeGen/X86/isel-brcond-icmp.ll1107
-rw-r--r--llvm/test/CodeGen/X86/load-combine.ll32
-rw-r--r--llvm/test/CodeGen/X86/pr81136.ll45
-rw-r--r--llvm/test/CodeGen/X86/statepoint-relocate-undef.ll32
-rw-r--r--llvm/test/CodeGen/X86/tailcall-cgp-dup.ll183
-rw-r--r--llvm/test/CodeGen/Xtensa/lit.local.cfg2
-rw-r--r--llvm/test/CodeGen/Xtensa/simple.ll5
-rw-r--r--llvm/test/DebugInfo/AMDGPU/variable-locations.ll6
-rw-r--r--llvm/test/DebugInfo/COFF/swift.ll4
-rw-r--r--llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll5
-rw-r--r--llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll3
-rw-r--r--llvm/test/DebugInfo/Generic/ipsccp-remap-assign-id.ll59
-rw-r--r--llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll46
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll60
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/dbg-declare-tag-offset.ll7
-rw-r--r--llvm/test/Instrumentation/PGOForceFunctionAttrs/basic.ll102
-rw-r--r--llvm/test/MC/AArch64/arm64-adr.s10
-rw-r--r--llvm/test/MC/AArch64/arm64-branch-encoding.s10
-rw-r--r--llvm/test/MC/AArch64/armv8.2a-dotprod.s3
-rw-r--r--llvm/test/MC/AArch64/basic-a64-diagnostics.s8
-rw-r--r--llvm/test/MC/AMDGPU/gfx11-promotions.s156
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s85
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s64
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s25
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s24
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_sopp.s3
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_unsupported.s3
-rw-r--r--llvm/test/MC/BPF/insn-unit.s3
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt1
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt111
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt51
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt3
-rw-r--r--llvm/test/MC/WebAssembly/alias-offset.s8
-rw-r--r--llvm/test/MC/WebAssembly/alias.s6
-rw-r--r--llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml29
-rw-r--r--llvm/test/Object/wasm-bad-relocation.yaml35
-rw-r--r--llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml40
-rw-r--r--llvm/test/Object/wasm-linked-symbol-table.yaml (renamed from llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml)6
-rw-r--r--llvm/test/Other/new-pm-defaults.ll5
-rw-r--r--llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td24
-rw-r--r--llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td32
-rw-r--r--llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td5
-rw-r--r--llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td4
-rw-r--r--llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td62
-rw-r--r--llvm/test/TableGen/GlobalISelEmitter.td2
-rw-r--r--llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll2
-rw-r--r--llvm/test/Transforms/CallSiteSplitting/callsite-split.ll2
-rw-r--r--llvm/test/Transforms/CalledValuePropagation/simple-arguments.ll2
-rw-r--r--llvm/test/Transforms/CalledValuePropagation/simple-memory.ll2
-rw-r--r--llvm/test/Transforms/CalledValuePropagation/simple-select.ll2
-rw-r--r--llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-gep.ll2
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/basic.ll11
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/switch.ll301
-rw-r--r--llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll7
-rw-r--r--llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll4
-rw-r--r--llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll2
-rw-r--r--llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll2
-rw-r--r--llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll2
-rw-r--r--llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll2
-rw-r--r--llvm/test/Transforms/Inline/inline-switch-default-2.ll317
-rw-r--r--llvm/test/Transforms/Inline/inline-switch-default.ll216
-rw-r--r--llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll20
-rw-r--r--llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll20
-rw-r--r--llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll20
-rw-r--r--llvm/test/Transforms/InstCombine/dependent-ivs.ll374
-rw-r--r--llvm/test/Transforms/InstCombine/fdiv-sqrt.ll175
-rw-r--r--llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll473
-rw-r--r--llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll322
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-custom-dl.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-gep.ll17
-rw-r--r--llvm/test/Transforms/InstCombine/known-bits.ll15
-rw-r--r--llvm/test/Transforms/InstCombine/or.ll3
-rw-r--r--llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll14
-rw-r--r--llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll10
-rw-r--r--llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll10
-rw-r--r--llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll24
-rw-r--r--llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll207
-rw-r--r--llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll44
-rw-r--r--llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll167
-rw-r--r--llvm/test/Transforms/JumpTableToSwitch/basic.ll228
-rw-r--r--llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll28
-rw-r--r--llvm/test/Transforms/JumpTableToSwitch/remarks.ll36
-rw-r--r--llvm/test/Transforms/JumpTableToSwitch/skip.ll131
-rw-r--r--llvm/test/Transforms/JumpTableToSwitch/stride.ll36
-rw-r--r--llvm/test/Transforms/JumpTableToSwitch/struct.ll42
-rw-r--r--llvm/test/Transforms/LICM/expr-reassociate-int.ll364
-rw-r--r--llvm/test/Transforms/LICM/sink-foldable.ll2
-rw-r--r--llvm/test/Transforms/LoopFlatten/widen-iv.ll6
-rw-r--r--llvm/test/Transforms/LoopFlatten/widen-iv2.ll4
-rw-r--r--llvm/test/Transforms/LoopFlatten/widen-iv3.ll4
-rw-r--r--llvm/test/Transforms/LoopReroll/basic.ll976
-rw-r--r--llvm/test/Transforms/LoopReroll/basic32iters.ll328
-rw-r--r--llvm/test/Transforms/LoopReroll/complex_reroll.ll237
-rw-r--r--llvm/test/Transforms/LoopReroll/external_use.ll60
-rw-r--r--llvm/test/Transforms/LoopReroll/extra_instr.ll361
-rw-r--r--llvm/test/Transforms/LoopReroll/indvar_with_ext.ll184
-rw-r--r--llvm/test/Transforms/LoopReroll/negative.ll48
-rw-r--r--llvm/test/Transforms/LoopReroll/nonconst_lb.ll168
-rw-r--r--llvm/test/Transforms/LoopReroll/ptrindvar.ll125
-rw-r--r--llvm/test/Transforms/LoopReroll/reduction.ll132
-rw-r--r--llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll130
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/induction.ll30
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll44
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll119
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts-lifting.ll151
-rw-r--r--llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll68
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll82
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll98
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll81
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll2
-rw-r--r--llvm/test/Transforms/SCCP/pr79696.ll55
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll546
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll131
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll4
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll13
-rw-r--r--llvm/test/Transforms/SLPVectorizer/SystemZ/reorder-same-node.ll67
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll2
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll2
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll317
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll60
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll66
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll513
-rw-r--r--llvm/test/Transforms/SROA/select-gep.ll72
-rw-r--r--llvm/test/Transforms/SROA/tbaa-struct3.ll107
-rw-r--r--llvm/test/Transforms/SROA/vector-promotion.ll4
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected34
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected34
-rw-r--r--llvm/test/tools/llvm-cov/mcdc-const.test28
-rw-r--r--llvm/test/tools/llvm-cov/mcdc-general.test16
-rw-r--r--llvm/test/tools/llvm-dwarfdump/X86/debug-str-offsets-mixed-dwarf-4-5.yaml57
-rw-r--r--llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml55
-rw-r--r--llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_str_offsets.yaml17
-rw-r--r--llvm/test/tools/llvm-gsymutil/X86/elf-dwo.yaml2
-rw-r--r--llvm/test/tools/llvm-lib/arm64ec-implib.test348
-rw-r--r--llvm/test/tools/llvm-lib/exportas.test94
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/XiangShan/cascade-fma.s53
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/XiangShan/gpr-bypass.s527
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/XiangShan/load-to-alu.s73
-rw-r--r--llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s218
-rw-r--r--llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s98
-rw-r--r--llvm/test/tools/llvm-nm/wasm/linked.yaml5
-rw-r--r--llvm/test/tools/llvm-nm/wasm/print-size.test2
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test6
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/srec-writer.test196
-rw-r--r--llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test6
-rw-r--r--llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll20
-rw-r--r--llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test2
-rw-r--r--llvm/test/tools/llvm-objdump/XCOFF/section-headers.test2
-rw-r--r--llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml4
-rw-r--r--llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml87
-rw-r--r--llvm/test/tools/llvm-objdump/wasm/symbol-table.test12
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test12
-rw-r--r--llvm/test/tools/llvm-readobj/XCOFF/file-header.test3
-rw-r--r--llvm/test/tools/llvm-readobj/XCOFF/sections.test20
-rw-r--r--llvm/test/tools/obj2yaml/XCOFF/aix.yaml16
-rw-r--r--llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml12
-rw-r--r--llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml3
-rw-r--r--llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml114
-rw-r--r--llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml24
-rw-r--r--llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml25
-rw-r--r--llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml4
-rw-r--r--llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml91
-rw-r--r--llvm/tools/llc/llc.cpp8
-rw-r--r--llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp5
-rw-r--r--llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp44
-rw-r--r--llvm/tools/llvm-link/llvm-link.cpp8
-rw-r--r--llvm/tools/llvm-lto/llvm-lto.cpp8
-rw-r--r--llvm/tools/llvm-lto2/llvm-lto2.cpp8
-rw-r--r--llvm/tools/llvm-nm/llvm-nm.cpp7
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp1
-rw-r--r--llvm/tools/llvm-objcopy/llvm-objcopy.cpp1
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp4
-rw-r--r--llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp4
-rw-r--r--llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp1
-rw-r--r--llvm/tools/llvm-readobj/COFFImportDumper.cpp3
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp128
-rw-r--r--llvm/tools/llvm-reduce/llvm-reduce.cpp8
-rw-r--r--llvm/tools/obj2yaml/xcoff2yaml.cpp4
-rw-r--r--llvm/tools/opt/NewPMDriver.cpp25
-rw-r--r--llvm/tools/opt/optdriver.cpp8
-rw-r--r--llvm/unittests/ADT/BitVectorTest.cpp3
-rw-r--r--llvm/unittests/ADT/IListIteratorBitsTest.cpp18
-rw-r--r--llvm/unittests/CodeGen/LowLevelTypeTest.cpp3
-rw-r--r--llvm/unittests/DebugInfo/DWARF/DWARFAcceleratorTableTest.cpp253
-rw-r--r--llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp119
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp6
-rw-r--r--llvm/unittests/IR/BasicBlockDbgInfoTest.cpp7
-rw-r--r--llvm/unittests/IR/ConstantRangeTest.cpp6
-rw-r--r--llvm/unittests/IR/ConstantsTest.cpp39
-rw-r--r--llvm/unittests/IR/PatternMatch.cpp22
-rw-r--r--llvm/unittests/ProfileData/CoverageMappingTest.cpp7
-rw-r--r--llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp63
-rw-r--r--llvm/unittests/TargetParser/Host.cpp3
-rw-r--r--llvm/unittests/TargetParser/TargetParserTest.cpp22
-rw-r--r--llvm/utils/TableGen/AsmMatcherEmitter.cpp603
-rw-r--r--llvm/utils/TableGen/AsmWriterEmitter.cpp242
-rw-r--r--llvm/utils/TableGen/AsmWriterInst.cpp69
-rw-r--r--llvm/utils/TableGen/AsmWriterInst.h166
-rw-r--r--llvm/utils/TableGen/CTagsEmitter.cpp7
-rw-r--r--llvm/utils/TableGen/CallingConvEmitter.cpp48
-rw-r--r--llvm/utils/TableGen/CodeEmitterGen.cpp41
-rw-r--r--llvm/utils/TableGen/CodeGenDAGPatterns.cpp907
-rw-r--r--llvm/utils/TableGen/CodeGenDAGPatterns.h229
-rw-r--r--llvm/utils/TableGen/CodeGenHwModes.cpp16
-rw-r--r--llvm/utils/TableGen/CodeGenHwModes.h70
-rw-r--r--llvm/utils/TableGen/CodeGenInstruction.cpp147
-rw-r--r--llvm/utils/TableGen/CodeGenInstruction.h602
-rw-r--r--llvm/utils/TableGen/CodeGenIntrinsics.h16
-rw-r--r--llvm/utils/TableGen/CodeGenMapTable.cpp132
-rw-r--r--llvm/utils/TableGen/CodeGenRegisters.cpp349
-rw-r--r--llvm/utils/TableGen/CodeGenRegisters.h1353
-rw-r--r--llvm/utils/TableGen/CodeGenSchedule.cpp340
-rw-r--r--llvm/utils/TableGen/CodeGenSchedule.h49
-rw-r--r--llvm/utils/TableGen/CodeGenTarget.cpp62
-rw-r--r--llvm/utils/TableGen/CodeGenTarget.h36
-rw-r--r--llvm/utils/TableGen/DAGISelEmitter.cpp69
-rw-r--r--llvm/utils/TableGen/DAGISelMatcher.cpp78
-rw-r--r--llvm/utils/TableGen/DAGISelMatcher.h518
-rw-r--r--llvm/utils/TableGen/DAGISelMatcherEmitter.cpp171
-rw-r--r--llvm/utils/TableGen/DAGISelMatcherGen.cpp535
-rw-r--r--llvm/utils/TableGen/DAGISelMatcherOpt.cpp9
-rw-r--r--llvm/utils/TableGen/DFAEmitter.cpp10
-rw-r--r--llvm/utils/TableGen/DFAPacketizerEmitter.cpp6
-rw-r--r--llvm/utils/TableGen/DXILEmitter.cpp282
-rw-r--r--llvm/utils/TableGen/DecoderEmitter.cpp136
-rw-r--r--llvm/utils/TableGen/DisassemblerEmitter.cpp4
-rw-r--r--llvm/utils/TableGen/FastISelEmitter.cpp214
-rw-r--r--llvm/utils/TableGen/GlobalISelEmitter.cpp422
-rw-r--r--llvm/utils/TableGen/GlobalISelMatchTable.cpp4
-rw-r--r--llvm/utils/TableGen/InfoByHwMode.cpp63
-rw-r--r--llvm/utils/TableGen/InfoByHwMode.h41
-rw-r--r--llvm/utils/TableGen/InstrDocsEmitter.cpp26
-rw-r--r--llvm/utils/TableGen/InstrInfoEmitter.cpp178
-rw-r--r--llvm/utils/TableGen/IntrinsicEmitter.cpp42
-rw-r--r--llvm/utils/TableGen/OptParserEmitter.cpp9
-rw-r--r--llvm/utils/TableGen/PredicateExpander.cpp4
-rw-r--r--llvm/utils/TableGen/PseudoLoweringEmitter.cpp94
-rw-r--r--llvm/utils/TableGen/RegisterBankEmitter.cpp10
-rw-r--r--llvm/utils/TableGen/RegisterInfoEmitter.cpp226
-rw-r--r--llvm/utils/TableGen/SDNodeProperties.h2
-rw-r--r--llvm/utils/TableGen/SearchableTableEmitter.cpp42
-rw-r--r--llvm/utils/TableGen/SequenceToOffsetTable.h12
-rw-r--r--llvm/utils/TableGen/SubtargetEmitter.cpp389
-rw-r--r--llvm/utils/TableGen/SubtargetFeatureInfo.cpp2
-rw-r--r--llvm/utils/TableGen/SubtargetFeatureInfo.h3
-rw-r--r--llvm/utils/TableGen/TableGenBackends.h2
-rw-r--r--llvm/utils/TableGen/Types.cpp4
-rw-r--r--llvm/utils/TableGen/Types.h2
-rw-r--r--llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp6
-rw-r--r--llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp11
-rw-r--r--llvm/utils/TableGen/X86DisassemblerShared.h4
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.cpp21
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.h52
-rw-r--r--llvm/utils/TableGen/X86ModRMFilters.cpp12
-rw-r--r--llvm/utils/TableGen/X86ModRMFilters.h29
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.cpp2
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.h2
-rwxr-xr-xllvm/utils/git/github-automation.py94
-rw-r--r--llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn4
-rw-r--r--llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/lld/ELF/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn5
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn2
-rw-r--r--mlir/docs/Diagnostics.md8
-rw-r--r--mlir/docs/Tutorials/transform/Ch1.md347
-rw-r--r--mlir/docs/Tutorials/transform/Ch2.md202
-rw-r--r--mlir/docs/Tutorials/transform/Ch3.md12
-rw-r--r--mlir/docs/Tutorials/transform/Ch4.md2
-rw-r--r--mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp22
-rw-r--r--mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp26
-rw-r--r--mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp12
-rw-r--r--mlir/include/mlir-c/Dialect/SparseTensor.h38
-rw-r--r--mlir/include/mlir/Dialect/Affine/IR/AffineOps.td3
-rw-r--r--mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td25
-rw-r--r--mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h15
-rw-r--r--mlir/include/mlir/Dialect/EmitC/IR/EmitC.td13
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td13
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td16
-rw-r--r--mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td1
-rw-r--r--mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h6
-rw-r--r--mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td10
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td139
-rw-r--r--mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h41
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h205
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td5
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h2
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/Transforms/Passes.td4
-rw-r--r--mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h5
-rw-r--r--mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h3
-rw-r--r--mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h4
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/VectorOps.td63
-rw-r--r--mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h9
-rw-r--r--mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h20
-rw-r--r--mlir/include/mlir/IR/AffineMap.h6
-rw-r--r--mlir/include/mlir/Interfaces/CallInterfaces.td2
-rw-r--r--mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h28
-rw-r--r--mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h4
-rw-r--r--mlir/include/mlir/Transforms/DialectConversion.h23
-rw-r--r--mlir/lib/Bindings/Python/DialectSparseTensor.cpp40
-rw-r--r--mlir/lib/Bindings/Python/IRCore.cpp9
-rw-r--r--mlir/lib/Bindings/Python/IRModule.h3
-rw-r--r--mlir/lib/CAPI/Dialect/SparseTensor.cpp67
-rw-r--r--mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp58
-rw-r--r--mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp3
-rw-r--r--mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp14
-rw-r--r--mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp8
-rw-r--r--mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp42
-rw-r--r--mlir/lib/Dialect/Affine/IR/AffineOps.cpp8
-rw-r--r--mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp13
-rw-r--r--mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp14
-rw-r--r--mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp4
-rw-r--r--mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp8
-rw-r--r--mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp5
-rw-r--r--mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp2
-rw-r--r--mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp29
-rw-r--r--mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp44
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp6
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Split.cpp4
-rw-r--r--mlir/lib/Dialect/Linalg/Utils/Utils.cpp17
-rw-r--r--mlir/lib/Dialect/Math/Transforms/LegalizeToF32.cpp20
-rw-r--r--mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp3
-rw-r--r--mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp3
-rw-r--r--mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp157
-rw-r--r--mlir/lib/Dialect/SCF/IR/SCF.cpp2
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp2
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp132
-rw-r--r--mlir/lib/Dialect/SCF/Utils/Utils.cpp4
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp62
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h6
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp47
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp10
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp6
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp12
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp2
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp15
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h1
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp67
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h20
-rw-r--r--mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp4
-rw-r--r--mlir/lib/Dialect/Tensor/IR/TensorOps.cpp4
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp17
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp24
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp36
-rw-r--r--mlir/lib/Dialect/Vector/IR/VectorOps.cpp7
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp4
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp3
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp97
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp2
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp9
-rw-r--r--mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp2
-rw-r--r--mlir/lib/IR/AffineMap.cpp24
-rw-r--r--mlir/lib/IR/AsmPrinter.cpp8
-rw-r--r--mlir/lib/IR/BuiltinTypes.cpp2
-rw-r--r--mlir/lib/Target/LLVMIR/AttrKindDetail.h14
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp56
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp16
-rw-r--r--mlir/lib/Target/LLVMIR/ModuleTranslation.cpp56
-rw-r--r--mlir/lib/Transforms/Utils/DialectConversion.cpp21
-rw-r--r--mlir/python/mlir/_mlir_libs/_mlir/ir.pyi1
-rw-r--r--mlir/python/mlir/dialects/transform/loop.py6
-rw-r--r--mlir/test/CAPI/sparse_tensor.c6
-rw-r--r--mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir125
-rw-r--r--mlir/test/Conversion/ComplexToStandard/full-conversion.mlir27
-rw-r--r--mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir8
-rw-r--r--mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir37
-rw-r--r--mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir30
-rw-r--r--mlir/test/Dialect/LLVMIR/nvvm.mlir26
-rw-r--r--mlir/test/Dialect/LLVMIR/rocdl.mlir12
-rw-r--r--mlir/test/Dialect/Linalg/canonicalize.mlir12
-rw-r--r--mlir/test/Dialect/Linalg/invalid.mlir26
-rw-r--r--mlir/test/Dialect/Linalg/named-ops.mlir11
-rw-r--r--mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir70
-rw-r--r--mlir/test/Dialect/Linalg/vectorization-unsupported.mlir73
-rw-r--r--mlir/test/Dialect/Linalg/vectorization.mlir25
-rw-r--r--mlir/test/Dialect/NVGPU/tmaload-transform.mlir2
-rw-r--r--mlir/test/Dialect/OpenMP/invalid.mlir100
-rw-r--r--mlir/test/Dialect/OpenMP/ops.mlir102
-rw-r--r--mlir/test/Dialect/OpenMP/roundtrip.mlir21
-rw-r--r--mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir130
-rw-r--r--mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir2
-rw-r--r--mlir/test/Dialect/SparseTensor/invalid_encoding.mlir106
-rw-r--r--mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir30
-rw-r--r--mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir2
-rw-r--r--mlir/test/Dialect/Tensor/canonicalize.mlir18
-rw-r--r--mlir/test/Dialect/Vector/linearize.mlir19
-rw-r--r--mlir/test/Dialect/Vector/ops.mlir35
-rw-r--r--mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir31
-rw-r--r--mlir/test/Examples/transform/Ch1/invalidation-1.mlir75
-rw-r--r--mlir/test/Examples/transform/Ch1/invalidation-2.mlir18
-rw-r--r--mlir/test/Examples/transform/Ch1/sequence.mlir105
-rw-r--r--mlir/test/Examples/transform/Ch2/invalid.mlir10
-rw-r--r--mlir/test/Examples/transform/Ch2/ops.mlir15
-rw-r--r--mlir/test/Examples/transform/Ch2/sequence.mlir99
-rw-r--r--mlir/test/Examples/transform/Ch3/invalid.mlir10
-rw-r--r--mlir/test/Examples/transform/Ch3/ops.mlir28
-rw-r--r--mlir/test/Examples/transform/Ch3/sequence.mlir113
-rw-r--r--mlir/test/Examples/transform/ChH/full.mlir6
-rw-r--r--mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir54
-rw-r--r--mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir10
-rw-r--r--mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir1
-rw-r--r--mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir2
-rw-r--r--mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir24
-rw-r--r--mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir2
-rw-r--r--mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir2
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir3
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir3
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir3
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir3
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir3
-rw-r--r--mlir/test/Target/LLVMIR/arm-sve.mlir42
-rw-r--r--mlir/test/Target/LLVMIR/nvvmir.mlir17
-rw-r--r--mlir/test/Target/LLVMIR/openmp-reduction.mlir12
-rw-r--r--mlir/test/Target/LLVMIR/rocdl.mlir16
-rw-r--r--mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp6
-rw-r--r--mlir/test/lib/Dialect/SCF/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp72
-rw-r--r--mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp29
-rw-r--r--mlir/test/python/dialects/sparse_tensor/dialect.py88
-rw-r--r--mlir/test/python/dialects/transform_loop_ext.py18
-rw-r--r--mlir/test/python/ir/module.py4
-rw-r--r--mlir/tools/mlir-opt/mlir-opt.cpp2
-rw-r--r--mlir/unittests/IR/AffineMapTest.cpp23
-rw-r--r--mlir/unittests/IR/CMakeLists.txt1
-rw-r--r--openmp/cmake/HandleOpenMPOptions.cmake7
-rw-r--r--openmp/cmake/OpenMPTesting.cmake2
-rw-r--r--openmp/cmake/config-ix.cmake1
-rw-r--r--openmp/runtime/src/CMakeLists.txt4
-rw-r--r--polly/lib/Analysis/ScopBuilder.cpp12
-rw-r--r--polly/lib/Analysis/ScopDetectionDiagnostic.cpp4
-rw-r--r--polly/lib/Support/ScopLocation.cpp2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll8
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll4
-rw-r--r--polly/test/ScopInfo/int2ptr_ptr2int.ll4
-rw-r--r--utils/bazel/WORKSPACE8
-rw-r--r--utils/bazel/examples/http_archive/WORKSPACE6
-rw-r--r--utils/bazel/examples/submodule/WORKSPACE6
-rw-r--r--utils/bazel/llvm-project-overlay/bolt/BUILD.bazel2
-rw-r--r--utils/bazel/llvm-project-overlay/clang/BUILD.bazel25
-rw-r--r--utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel4
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel5
-rw-r--r--utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl8
-rw-r--r--utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel2
-rw-r--r--utils/bazel/llvm-project-overlay/lld/BUILD.bazel4
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/BUILD.bazel18
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel15
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel2
-rw-r--r--utils/bazel/llvm-project-overlay/third-party/unittest/BUILD.bazel2
-rw-r--r--utils/bazel/third_party_build/zlib-ng.BUILD44
-rw-r--r--utils/bazel/third_party_build/zstd.BUILD2
1894 files changed, 72495 insertions, 38586 deletions
diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 4ebf304e23d5..c14ec464a43a 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -233,7 +233,10 @@ linux_projects=$(add-dependencies ${linux_projects_to_test} | sort | uniq)
windows_projects_to_test=$(exclude-windows $(compute-projects-to-test ${modified_projects}))
windows_check_targets=$(check-targets ${windows_projects_to_test} | sort | uniq)
-windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq)
+# Temporary disable the windows job.
+# See https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840
+#windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq)
+windows_projects=""
# Generate the appropriate pipeline
if [[ "${linux_projects}" != "" ]]; then
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index ea84e31a6fa9..1f498a8be943 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -78,3 +78,6 @@ f6d557ee34b6bbdb1dc32f29e34b4a4a8ad35e81
082b89b25faae3e45a023caf51b65ca0f02f377f
0ba22f51d128bee9d69756c56c4678097270e10b
84da0e1bb75f8666cf222d2f600f37bebb9ea389
+
+# [NFC] clang-format utils/TableGen (#80973)
+b9079baaddfed5e604fbfaa1d81a7a1c38e78c26
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 767f58e01a39..3fe0cbbcb84d 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -103,4 +103,4 @@
/mlir/**/*SparseTensor*/ @aartbik @PeimingLiu @yinying-lisa-li @matthias-springer
# BOLT
-/bolt/ @aaupov @maksfb @rafaelauler @dcci
+/bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci
diff --git a/.github/workflows/approved-prs.yml b/.github/workflows/approved-prs.yml
new file mode 100644
index 000000000000..309a9217e42d
--- /dev/null
+++ b/.github/workflows/approved-prs.yml
@@ -0,0 +1,39 @@
+name: "Prompt reviewers to merge PRs on behalf of authors"
+
+permissions:
+ contents: read
+
+on:
+ pull_request_review:
+ types:
+ - submitted
+
+jobs:
+ merge-on-behalf-information-comment:
+ runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
+ if: >-
+ (github.repository == 'llvm/llvm-project') &&
+ (github.event.review.state == 'APPROVED')
+ steps:
+ - name: Checkout Automation Script
+ uses: actions/checkout@v4
+ with:
+ sparse-checkout: llvm/utils/git/
+ ref: main
+
+ - name: Setup Automation Script
+ working-directory: ./llvm/utils/git/
+ run: |
+ pip install -r requirements.txt
+
+ - name: Add Merge On Behalf Comment
+ working-directory: ./llvm/utils/git/
+ run: |
+ python3 ./github-automation.py \
+ --token '${{ secrets.GITHUB_TOKEN }}' \
+ pr-merge-on-behalf-information \
+ --issue-number "${{ github.event.pull_request.number }}" \
+ --author "${{ github.event.pull_request.user.login }}" \
+ --reviewer "${{ github.event.review.user.login }}"
diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml
index 33a1e89a729f..448c1c56f897 100644
--- a/.github/workflows/issue-release-workflow.yml
+++ b/.github/workflows/issue-release-workflow.yml
@@ -65,4 +65,5 @@ jobs:
release-workflow \
--branch-repo-token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \
--issue-number ${{ github.event.issue.number }} \
+ --requested-by ${{ github.event.issue.user.login }} \
auto
diff --git a/.github/workflows/llvm-project-workflow-tests.yml b/.github/workflows/llvm-project-workflow-tests.yml
new file mode 100644
index 000000000000..a2539b279be0
--- /dev/null
+++ b/.github/workflows/llvm-project-workflow-tests.yml
@@ -0,0 +1,32 @@
+# This workflow will test the llvm-project-tests workflow in PRs
+# targetting the main branch. Since this workflow doesn't normally
+# run on main PRs, we need some way to test it to ensure new updates
+# don't break it.
+
+name: LLVM Workflow Test
+
+permissions:
+ contents: read
+
+on:
+ pull_request:
+ branches:
+ - 'main'
+ paths:
+ - '.github/workflows/llvm-project-tests.yml'
+ - '.github/workflows/llvm-project-workflow-tests.yml'
+
+concurrency:
+ # Skip intermediate builds: always.
+ # Cancel intermediate builds: only if it is a pull request build.
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+ llvm-test:
+ if: github.repository_owner == 'llvm'
+ name: Build and Test
+ uses: ./.github/workflows/llvm-project-tests.yml
+ with:
+ build_target: check-all
+ projects: clang;lld;libclc;lldb
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index f1db1fbded6a..30336c4e3a74 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -145,6 +145,35 @@ public:
}
};
+/// BOLT-exclusive errors generated in core BOLT libraries, optionally holding a
+/// string message and whether it is fatal or not. In case it is fatal and if
+/// BOLT is running as a standalone process, the process might be killed as soon
+/// as the error is checked.
+class BOLTError : public ErrorInfo<BOLTError> {
+public:
+ static char ID;
+
+ BOLTError(bool IsFatal, const Twine &S = Twine());
+ void log(raw_ostream &OS) const override;
+ bool isFatal() const { return IsFatal; }
+
+ const std::string &getMessage() const { return Msg; }
+ std::error_code convertToErrorCode() const override;
+
+private:
+ bool IsFatal;
+ std::string Msg;
+};
+
+/// Streams used by BOLT to log regular or error events
+struct JournalingStreams {
+ raw_ostream &Out;
+ raw_ostream &Err;
+};
+
+Error createNonFatalBOLTError(const Twine &S);
+Error createFatalBOLTError(const Twine &S);
+
class BinaryContext {
BinaryContext() = delete;
@@ -237,7 +266,8 @@ class BinaryContext {
public:
static Expected<std::unique_ptr<BinaryContext>>
createBinaryContext(const ObjectFile *File, bool IsPIC,
- std::unique_ptr<DWARFContext> DwCtx);
+ std::unique_ptr<DWARFContext> DwCtx,
+ JournalingStreams Logger);
/// Superset of compiler units that will contain overwritten code that needs
/// new debug info. In a few cases, functions may end up not being
@@ -605,6 +635,10 @@ public:
std::unique_ptr<MCAsmBackend> MAB;
+ /// Allows BOLT to print to log whenever it is necessary (with or without
+ /// const references)
+ mutable JournalingStreams Logger;
+
/// Indicates if the binary is Linux kernel.
bool IsLinuxKernel{false};
@@ -737,7 +771,8 @@ public:
std::unique_ptr<const MCInstrAnalysis> MIA,
std::unique_ptr<MCPlusBuilder> MIB,
std::unique_ptr<const MCRegisterInfo> MRI,
- std::unique_ptr<MCDisassembler> DisAsm);
+ std::unique_ptr<MCDisassembler> DisAsm,
+ JournalingStreams Logger);
~BinaryContext();
@@ -1349,8 +1384,12 @@ public:
return Offset;
}
- void exitWithBugReport(StringRef Message,
- const BinaryFunction &Function) const;
+ /// Log BOLT errors to journaling streams and quit process with non-zero error
+ /// code 1 if error is fatal.
+ void logBOLTErrorsAndQuitOnFatal(Error E);
+
+ std::string generateBugReportMessage(StringRef Message,
+ const BinaryFunction &Function) const;
struct IndependentCodeEmitter {
std::unique_ptr<MCObjectFileInfo> LocalMOFI;
@@ -1398,6 +1437,10 @@ public:
assert(IOAddressMap && "Address map not set yet");
return *IOAddressMap;
}
+
+ raw_ostream &outs() const { return Logger.Out; }
+
+ raw_ostream &errs() const { return Logger.Err; }
};
template <typename T, typename = std::enable_if_t<sizeof(T) == 1>>
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 3a1eae3311bd..a177178769e4 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -1910,12 +1910,11 @@ public:
/// Support dynamic relocations in constant islands, which may happen if
/// binary is linked with -z notext option.
- void markIslandDynamicRelocationAtAddress(uint64_t Address) {
- if (!isInConstantIsland(Address)) {
- errs() << "BOLT-ERROR: dynamic relocation found for text section at 0x"
- << Twine::utohexstr(Address) << "\n";
- exit(1);
- }
+ Error markIslandDynamicRelocationAtAddress(uint64_t Address) {
+ if (!isInConstantIsland(Address))
+ return createFatalBOLTError(
+ Twine("dynamic relocation found for text section at 0x") +
+ Twine::utohexstr(Address) + Twine("\n"));
// Mark island to have dynamic relocation
Islands->HasDynamicRelocations = true;
@@ -1924,6 +1923,7 @@ public:
// move binary data during updateOutputValues, making us emit
// dynamic relocation with the right offset value.
getOrCreateIslandAccess(Address);
+ return Error::success();
}
bool hasDynamicRelocationAtIsland() const {
@@ -2054,9 +2054,10 @@ public:
/// state to State:Disassembled.
///
/// Returns false if disassembly failed.
- bool disassemble();
+ Error disassemble();
- void handlePCRelOperand(MCInst &Instruction, uint64_t Address, uint64_t Size);
+ Error handlePCRelOperand(MCInst &Instruction, uint64_t Address,
+ uint64_t Size);
MCSymbol *handleExternalReference(MCInst &Instruction, uint64_t Size,
uint64_t Offset, uint64_t TargetAddress,
@@ -2100,7 +2101,7 @@ public:
///
/// Returns true on success and update the current function state to
/// State::CFG. Returns false if CFG cannot be built.
- bool buildCFG(MCPlusBuilder::AllocatorIdTy);
+ Error buildCFG(MCPlusBuilder::AllocatorIdTy);
/// Perform post-processing of the CFG.
void postProcessCFG();
@@ -2217,7 +2218,7 @@ public:
}
/// Process LSDA information for the function.
- void parseLSDA(ArrayRef<uint8_t> LSDAData, uint64_t LSDAAddress);
+ Error parseLSDA(ArrayRef<uint8_t> LSDAData, uint64_t LSDAAddress);
/// Update exception handling ranges for the function.
void updateEHRanges();
diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h
index 70914f59157d..a85dbf28950e 100644
--- a/bolt/include/bolt/Core/BinarySection.h
+++ b/bolt/include/bolt/Core/BinarySection.h
@@ -112,7 +112,7 @@ class BinarySection {
static StringRef getName(SectionRef Section) {
return cantFail(Section.getName());
}
- static StringRef getContents(SectionRef Section) {
+ static StringRef getContentsOrQuit(SectionRef Section) {
if (Section.getObject()->isELF() &&
ELFSectionRef(Section).getType() == ELF::SHT_NOBITS)
return StringRef();
@@ -159,7 +159,7 @@ public:
BinarySection(BinaryContext &BC, SectionRef Section)
: BC(BC), Name(getName(Section)), Section(Section),
- Contents(getContents(Section)), Address(Section.getAddress()),
+ Contents(getContentsOrQuit(Section)), Address(Section.getAddress()),
Size(Section.getSize()), Alignment(Section.getAlignment().value()),
OutputName(Name), SectionNumber(++Count) {
if (isELF()) {
diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index f89084065aae..f13d42ff4ab4 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -15,6 +15,7 @@
#ifndef BOLT_CORE_DIE_BUILDER_H
#define BOLT_CORE_DIE_BUILDER_H
+#include "bolt/Core/BinaryContext.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
@@ -32,6 +33,7 @@
namespace llvm {
namespace bolt {
+
class DIEStreamer;
class DebugStrOffsetsWriter;
@@ -120,6 +122,7 @@ private:
std::unique_ptr<State> BuilderState;
FoldingSet<DIEAbbrev> AbbreviationsSet;
std::vector<std::unique_ptr<DIEAbbrev>> Abbreviations;
+ BinaryContext &BC;
DWARFContext *DwarfContext{nullptr};
bool IsDWO{false};
uint64_t UnitSize{0};
@@ -219,9 +222,10 @@ private:
if (getState().CloneUnitCtxMap[UnitId].DieInfoVector.size() > DIEId)
return *getState().CloneUnitCtxMap[UnitId].DieInfoVector[DIEId].get();
- errs() << "BOLT-WARNING: [internal-dwarf-error]: The DIE is not allocated "
- "before looking up, some"
- << "unexpected corner cases happened.\n";
+ BC.errs()
+ << "BOLT-WARNING: [internal-dwarf-error]: The DIE is not allocated "
+ "before looking up, some"
+ << "unexpected corner cases happened.\n";
return *getState().CloneUnitCtxMap[UnitId].DieInfoVector.front().get();
}
@@ -261,7 +265,7 @@ private:
DIE *constructDIEFast(DWARFDie &DDie, DWARFUnit &U, uint32_t UnitId);
public:
- DIEBuilder(DWARFContext *DwarfContext, bool IsDWO = false);
+ DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext, bool IsDWO = false);
/// Returns enum to what we are currently processing.
ProcessingType getCurrentProcessingState() { return getState().Type; }
@@ -295,8 +299,9 @@ public:
if (getState().TypeDIEMap.count(&DU))
return getState().TypeDIEMap[&DU];
- errs() << "BOLT-ERROR: unable to find TypeUnit for Type Unit at offset 0x"
- << DU.getOffset() << "\n";
+ BC.errs()
+ << "BOLT-ERROR: unable to find TypeUnit for Type Unit at offset 0x"
+ << DU.getOffset() << "\n";
return nullptr;
}
diff --git a/bolt/include/bolt/Core/DynoStats.h b/bolt/include/bolt/Core/DynoStats.h
index 65256719ba06..82a69668385b 100644
--- a/bolt/include/bolt/Core/DynoStats.h
+++ b/bolt/include/bolt/Core/DynoStats.h
@@ -159,8 +159,9 @@ inline DynoStats getDynoStats(FuncsType &Funcs, bool IsAArch64) {
/// Call a function with optional before and after dynostats printing.
template <typename FnType, typename FuncsType>
-inline void callWithDynoStats(FnType &&Func, FuncsType &Funcs, StringRef Phase,
- const bool Flag, bool IsAArch64) {
+inline void callWithDynoStats(raw_ostream &OS, FnType &&Func, FuncsType &Funcs,
+ StringRef Phase, const bool Flag,
+ bool IsAArch64) {
DynoStats DynoStatsBefore(IsAArch64);
if (Flag)
DynoStatsBefore = getDynoStats(Funcs, IsAArch64);
@@ -170,12 +171,12 @@ inline void callWithDynoStats(FnType &&Func, FuncsType &Funcs, StringRef Phase,
if (Flag) {
const DynoStats DynoStatsAfter = getDynoStats(Funcs, IsAArch64);
const bool Changed = (DynoStatsAfter != DynoStatsBefore);
- outs() << "BOLT-INFO: program-wide dynostats after running " << Phase
- << (Changed ? "" : " (no change)") << ":\n\n"
- << DynoStatsBefore << '\n';
+ OS << "BOLT-INFO: program-wide dynostats after running " << Phase
+ << (Changed ? "" : " (no change)") << ":\n\n"
+ << DynoStatsBefore << '\n';
if (Changed)
- DynoStatsAfter.print(outs(), &DynoStatsBefore);
- outs() << '\n';
+ DynoStatsAfter.print(OS, &DynoStatsBefore);
+ OS << '\n';
}
}
diff --git a/bolt/include/bolt/Core/Exceptions.h b/bolt/include/bolt/Core/Exceptions.h
index 7c09b5b768fe..422b86f6ddb7 100644
--- a/bolt/include/bolt/Core/Exceptions.h
+++ b/bolt/include/bolt/Core/Exceptions.h
@@ -30,13 +30,14 @@ class FDE;
namespace bolt {
+class BinaryContext;
class BinaryFunction;
/// \brief Wraps up information to read all CFI instructions and feed them to a
/// BinaryFunction, as well as rewriting CFI sections.
class CFIReaderWriter {
public:
- explicit CFIReaderWriter(const DWARFDebugFrame &EHFrame);
+ explicit CFIReaderWriter(BinaryContext &BC, const DWARFDebugFrame &EHFrame);
bool fillCFIInfoFor(BinaryFunction &Function) const;
@@ -59,6 +60,7 @@ public:
const FDEsMap &getFDEs() const { return FDEs; }
private:
+ BinaryContext &BC;
FDEsMap FDEs;
};
diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/ADRRelaxationPass.h
index a5ff0c4a8003..1d35a335c025 100644
--- a/bolt/include/bolt/Passes/ADRRelaxationPass.h
+++ b/bolt/include/bolt/Passes/ADRRelaxationPass.h
@@ -30,7 +30,7 @@ public:
const char *getName() const override { return "adr-relaxation"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
void runOnFunction(BinaryFunction &BF);
};
diff --git a/bolt/include/bolt/Passes/Aligner.h b/bolt/include/bolt/Passes/Aligner.h
index 4cb44fdb121e..eb077182c945 100644
--- a/bolt/include/bolt/Passes/Aligner.h
+++ b/bolt/include/bolt/Passes/Aligner.h
@@ -39,7 +39,7 @@ public:
const char *getName() const override { return "aligner"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/AllocCombiner.h b/bolt/include/bolt/Passes/AllocCombiner.h
index 44c6fddd34de..8532f761c9ad 100644
--- a/bolt/include/bolt/Passes/AllocCombiner.h
+++ b/bolt/include/bolt/Passes/AllocCombiner.h
@@ -33,7 +33,7 @@ public:
}
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/AsmDump.h b/bolt/include/bolt/Passes/AsmDump.h
index 7cc96f20a920..d993909f2794 100644
--- a/bolt/include/bolt/Passes/AsmDump.h
+++ b/bolt/include/bolt/Passes/AsmDump.h
@@ -28,7 +28,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override { return false; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/BinaryPasses.h b/bolt/include/bolt/Passes/BinaryPasses.h
index dace07e903e7..046765b16f19 100644
--- a/bolt/include/bolt/Passes/BinaryPasses.h
+++ b/bolt/include/bolt/Passes/BinaryPasses.h
@@ -50,8 +50,7 @@ public:
/// this pass is completed (printPass() must have returned true).
virtual bool shouldPrint(const BinaryFunction &BF) const;
- /// Execute this pass on the given functions.
- virtual void runOnFunctions(BinaryContext &BC) = 0;
+ virtual Error runOnFunctions(BinaryContext &BC) = 0;
};
/// A pass to print program-wide dynostats.
@@ -70,18 +69,19 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override { return false; }
- void runOnFunctions(BinaryContext &BC) override {
+ Error runOnFunctions(BinaryContext &BC) override {
const DynoStats NewDynoStats =
getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
const bool Changed = (NewDynoStats != PrevDynoStats);
- outs() << "BOLT-INFO: program-wide dynostats " << Title
- << (Changed ? "" : " (no change)") << ":\n\n"
- << PrevDynoStats;
+ BC.outs() << "BOLT-INFO: program-wide dynostats " << Title
+ << (Changed ? "" : " (no change)") << ":\n\n"
+ << PrevDynoStats;
if (Changed) {
- outs() << '\n';
- NewDynoStats.print(outs(), &PrevDynoStats, BC.InstPrinter.get());
+ BC.outs() << '\n';
+ NewDynoStats.print(BC.outs(), &PrevDynoStats, BC.InstPrinter.get());
}
- outs() << '\n';
+ BC.outs() << '\n';
+ return Error::success();
}
};
@@ -100,7 +100,7 @@ public:
const char *getName() const override { return "normalize CFG"; }
- void runOnFunctions(BinaryContext &) override;
+ Error runOnFunctions(BinaryContext &) override;
};
/// Detect and eliminate unreachable basic blocks. We could have those
@@ -119,7 +119,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
- void runOnFunctions(BinaryContext &) override;
+ Error runOnFunctions(BinaryContext &) override;
};
// Reorder the basic blocks for each function based on hotness.
@@ -165,7 +165,7 @@ public:
const char *getName() const override { return "reorder-blocks"; }
bool shouldPrint(const BinaryFunction &BF) const override;
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Sync local branches with CFG.
@@ -175,7 +175,7 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "fix-branches"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Fix the CFI state and exception handling information after all other
@@ -186,7 +186,7 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "finalize-functions"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Perform any necessary adjustments for functions that do not fit into their
@@ -198,7 +198,7 @@ public:
const char *getName() const override { return "check-large-functions"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
bool shouldOptimize(const BinaryFunction &BF) const override;
};
@@ -210,7 +210,7 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "lower-annotations"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Clean the state of the MC representation before sending it to emission
@@ -220,7 +220,7 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "clean-mc-state"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// An optimization to simplify conditional tail calls by removing
@@ -292,7 +292,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Convert instructions to the form with the minimum operand width.
@@ -305,7 +305,7 @@ public:
const char *getName() const override { return "shorten-instructions"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Perform simple peephole optimizations.
@@ -339,7 +339,7 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "peepholes"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// An optimization to simplify loads from read-only sections.The pass converts
@@ -370,7 +370,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Assign output sections to all functions.
@@ -379,7 +379,7 @@ public:
explicit AssignSections() : BinaryFunctionPass(false) {}
const char *getName() const override { return "assign-sections"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Compute and report to the user the imbalance in flow equations for all
@@ -394,7 +394,7 @@ public:
const char *getName() const override { return "profile-stats"; }
bool shouldPrint(const BinaryFunction &) const override { return false; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Prints a list of the top 100 functions sorted by a set of
@@ -406,7 +406,7 @@ public:
const char *getName() const override { return "print-stats"; }
bool shouldPrint(const BinaryFunction &) const override { return false; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Pass for lowering any instructions that we have raised and that have
@@ -418,7 +418,7 @@ public:
const char *getName() const override { return "inst-lowering"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Pass for stripping 'repz' from 'repz retq' sequence of instructions.
@@ -429,7 +429,7 @@ public:
const char *getName() const override { return "strip-rep-ret"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Pass for inlining calls to memcpy using 'rep movsb' on X86.
@@ -440,7 +440,7 @@ public:
const char *getName() const override { return "inline-memcpy"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Pass for specializing memcpy for a size of 1 byte.
@@ -461,7 +461,7 @@ public:
const char *getName() const override { return "specialize-memcpy"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
/// Pass to remove nops in code
@@ -475,7 +475,7 @@ public:
const char *getName() const override { return "remove-nops"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
enum FrameOptimizationType : char {
diff --git a/bolt/include/bolt/Passes/CMOVConversion.h b/bolt/include/bolt/Passes/CMOVConversion.h
index 77ce2235001a..4046266fa39d 100644
--- a/bolt/include/bolt/Passes/CMOVConversion.h
+++ b/bolt/include/bolt/Passes/CMOVConversion.h
@@ -64,7 +64,7 @@ class CMOVConversion : public BinaryFunctionPass {
}
double getMPRatio() { return (double)RemovedMP / PossibleMP; }
- void dump();
+ void dumpTo(raw_ostream &OS);
};
// BinaryContext-wide stats
Stats Global;
@@ -76,7 +76,7 @@ public:
const char *getName() const override { return "CMOV conversion"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/CacheMetrics.h b/bolt/include/bolt/Passes/CacheMetrics.h
index 1e40ad66d933..5c88d98c76c1 100644
--- a/bolt/include/bolt/Passes/CacheMetrics.h
+++ b/bolt/include/bolt/Passes/CacheMetrics.h
@@ -17,12 +17,16 @@
#include <vector>
namespace llvm {
+
+class raw_ostream;
+
namespace bolt {
class BinaryFunction;
namespace CacheMetrics {
/// Calculate and print various metrics related to instruction cache performance
-void printAll(const std::vector<BinaryFunction *> &BinaryFunctions);
+void printAll(raw_ostream &OS,
+ const std::vector<BinaryFunction *> &BinaryFunctions);
} // namespace CacheMetrics
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/FixRISCVCallsPass.h b/bolt/include/bolt/Passes/FixRISCVCallsPass.h
index 46418c43d192..a5c3e5158d6d 100644
--- a/bolt/include/bolt/Passes/FixRISCVCallsPass.h
+++ b/bolt/include/bolt/Passes/FixRISCVCallsPass.h
@@ -33,7 +33,7 @@ public:
const char *getName() const override { return "fix-riscv-calls"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h
index 45ee9cb73603..50b64480aa62 100644
--- a/bolt/include/bolt/Passes/FixRelaxationPass.h
+++ b/bolt/include/bolt/Passes/FixRelaxationPass.h
@@ -31,7 +31,7 @@ public:
const char *getName() const override { return "fix-relaxations"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/FrameOptimizer.h b/bolt/include/bolt/Passes/FrameOptimizer.h
index 310bebfee266..64055bd25172 100644
--- a/bolt/include/bolt/Passes/FrameOptimizer.h
+++ b/bolt/include/bolt/Passes/FrameOptimizer.h
@@ -98,8 +98,8 @@ class FrameOptimizerPass : public BinaryFunctionPass {
void removeUnusedStores(const FrameAnalysis &FA, BinaryFunction &BF);
/// Perform shrinkwrapping step
- void performShrinkWrapping(const RegAnalysis &RA, const FrameAnalysis &FA,
- BinaryContext &BC);
+ Error performShrinkWrapping(const RegAnalysis &RA, const FrameAnalysis &FA,
+ BinaryContext &BC);
public:
explicit FrameOptimizerPass(const cl::opt<bool> &PrintPass)
@@ -108,7 +108,7 @@ public:
const char *getName() const override { return "frame-optimizer"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
diff --git a/bolt/include/bolt/Passes/Hugify.h b/bolt/include/bolt/Passes/Hugify.h
index 0a7734059121..52c0ae19102b 100644
--- a/bolt/include/bolt/Passes/Hugify.h
+++ b/bolt/include/bolt/Passes/Hugify.h
@@ -18,7 +18,7 @@ class HugePage : public BinaryFunctionPass {
public:
HugePage(const cl::opt<bool> &PrintPass) : BinaryFunctionPass(PrintPass) {}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
const char *getName() const override { return "HugePage"; }
};
diff --git a/bolt/include/bolt/Passes/IdenticalCodeFolding.h b/bolt/include/bolt/Passes/IdenticalCodeFolding.h
index c15cebc8af19..b4206fa36074 100644
--- a/bolt/include/bolt/Passes/IdenticalCodeFolding.h
+++ b/bolt/include/bolt/Passes/IdenticalCodeFolding.h
@@ -35,7 +35,7 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "identical-code-folding"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/IndirectCallPromotion.h b/bolt/include/bolt/Passes/IndirectCallPromotion.h
index 397a38663948..adc58d70ec0f 100644
--- a/bolt/include/bolt/Passes/IndirectCallPromotion.h
+++ b/bolt/include/bolt/Passes/IndirectCallPromotion.h
@@ -221,7 +221,7 @@ public:
return BF.isSimple() && !BF.isIgnored() && BF.hasProfile() &&
!BF.hasUnknownControlFlow();
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/Inliner.h b/bolt/include/bolt/Passes/Inliner.h
index 711eae69d1c9..5d9b96a2d915 100644
--- a/bolt/include/bolt/Passes/Inliner.h
+++ b/bolt/include/bolt/Passes/Inliner.h
@@ -86,7 +86,7 @@ public:
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/Instrumentation.h b/bolt/include/bolt/Passes/Instrumentation.h
index 1a11f9eab511..76ffcf41db6e 100644
--- a/bolt/include/bolt/Passes/Instrumentation.h
+++ b/bolt/include/bolt/Passes/Instrumentation.h
@@ -31,7 +31,7 @@ public:
Summary(std::make_unique<InstrumentationSummary>()) {}
/// Modifies all functions by inserting instrumentation code (first step)
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
const char *getName() const override { return "instrumentation"; }
diff --git a/bolt/include/bolt/Passes/JTFootprintReduction.h b/bolt/include/bolt/Passes/JTFootprintReduction.h
index 084049d3b463..4b015e1f96b3 100644
--- a/bolt/include/bolt/Passes/JTFootprintReduction.h
+++ b/bolt/include/bolt/Passes/JTFootprintReduction.h
@@ -68,7 +68,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0;
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index c95181922dbc..3d02d75ac4a2 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -131,14 +131,14 @@ class LongJmpPass : public BinaryFunctionPass {
uint64_t DotAddress) const;
/// Expand the range of the stub in StubBB if necessary
- bool relaxStub(BinaryBasicBlock &StubBB);
+ Error relaxStub(BinaryBasicBlock &StubBB, bool &Modified);
/// Helper to resolve a symbol address according to our tentative layout
uint64_t getSymbolAddress(const BinaryContext &BC, const MCSymbol *Target,
const BinaryBasicBlock *TgtBB) const;
/// Relax function by adding necessary stubs or relaxing existing stubs
- bool relax(BinaryFunction &BF);
+ Error relax(BinaryFunction &BF, bool &Modified);
public:
/// BinaryPass public interface
@@ -148,7 +148,7 @@ public:
const char *getName() const override { return "long-jmp"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
} // namespace llvm
diff --git a/bolt/include/bolt/Passes/LoopInversionPass.h b/bolt/include/bolt/Passes/LoopInversionPass.h
index 472fb36640c1..aee441d72093 100644
--- a/bolt/include/bolt/Passes/LoopInversionPass.h
+++ b/bolt/include/bolt/Passes/LoopInversionPass.h
@@ -49,7 +49,7 @@ public:
const char *getName() const override { return "loop-inversion-opt"; }
/// Pass entry point
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
bool runOnFunction(BinaryFunction &Function);
};
diff --git a/bolt/include/bolt/Passes/PLTCall.h b/bolt/include/bolt/Passes/PLTCall.h
index 4fdbf60c7f9f..09ef96e27293 100644
--- a/bolt/include/bolt/Passes/PLTCall.h
+++ b/bolt/include/bolt/Passes/PLTCall.h
@@ -30,7 +30,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override {
return BinaryFunctionPass::shouldPrint(BF);
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/PatchEntries.h b/bolt/include/bolt/Passes/PatchEntries.h
index b9ed4a5e4280..fa6b5811a4c3 100644
--- a/bolt/include/bolt/Passes/PatchEntries.h
+++ b/bolt/include/bolt/Passes/PatchEntries.h
@@ -34,7 +34,7 @@ public:
explicit PatchEntries() : BinaryFunctionPass(false) {}
const char *getName() const override { return "patch-entries"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/RegReAssign.h b/bolt/include/bolt/Passes/RegReAssign.h
index cd7bea6a62c1..c50e32ff46e2 100644
--- a/bolt/include/bolt/Passes/RegReAssign.h
+++ b/bolt/include/bolt/Passes/RegReAssign.h
@@ -55,7 +55,7 @@ public:
return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0;
}
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
} // namespace llvm
diff --git a/bolt/include/bolt/Passes/ReorderData.h b/bolt/include/bolt/Passes/ReorderData.h
index 65b730652101..9cd17aecfd81 100644
--- a/bolt/include/bolt/Passes/ReorderData.h
+++ b/bolt/include/bolt/Passes/ReorderData.h
@@ -35,7 +35,8 @@ private:
sortedByFunc(BinaryContext &BC, const BinarySection &Section,
std::map<uint64_t, BinaryFunction> &BFs) const;
- void printOrder(const BinarySection &Section, DataOrder::const_iterator Begin,
+ void printOrder(BinaryContext &BC, const BinarySection &Section,
+ DataOrder::const_iterator Begin,
DataOrder::const_iterator End) const;
/// Set the ordering of the section with \p SectionName. \p NewOrder is a
@@ -51,7 +52,7 @@ public:
const char *getName() const override { return "reorder-data"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h
index 8f9507d0a824..4c88142c5887 100644
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -20,10 +20,10 @@ class Cluster;
class ReorderFunctions : public BinaryFunctionPass {
BinaryFunctionCallGraph Cg;
- void reorder(std::vector<Cluster> &&Clusters,
+ void reorder(BinaryContext &BC, std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs);
- void printStats(const std::vector<Cluster> &Clusters,
+ void printStats(BinaryContext &BC, const std::vector<Cluster> &Clusters,
const std::vector<uint64_t> &FuncAddr);
public:
@@ -42,9 +42,9 @@ public:
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "reorder-functions"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
- static std::vector<std::string> readFunctionOrderFile();
+ static Error readFunctionOrderFile(std::vector<std::string> &FunctionNames);
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/RetpolineInsertion.h b/bolt/include/bolt/Passes/RetpolineInsertion.h
index 12f46a95264c..2cdde7f07483 100644
--- a/bolt/include/bolt/Passes/RetpolineInsertion.h
+++ b/bolt/include/bolt/Passes/RetpolineInsertion.h
@@ -62,7 +62,7 @@ public:
const char *getName() const override { return "retpoline-insertion"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/ShrinkWrapping.h b/bolt/include/bolt/Passes/ShrinkWrapping.h
index cccbc518dd26..b7809f0bb069 100644
--- a/bolt/include/bolt/Passes/ShrinkWrapping.h
+++ b/bolt/include/bolt/Passes/ShrinkWrapping.h
@@ -467,8 +467,9 @@ private:
/// If \p CreatePushOrPop is true, create a push/pop instead. Current SP/FP
/// values, as determined by StackPointerTracking, should be informed via
/// \p SPVal and \p FPVal in order to emit the correct offset form SP/FP.
- MCInst createStackAccess(int SPVal, int FPVal, const FrameIndexEntry &FIE,
- bool CreatePushOrPop);
+ Expected<MCInst> createStackAccess(int SPVal, int FPVal,
+ const FrameIndexEntry &FIE,
+ bool CreatePushOrPop);
/// Update the CFI referenced by \p Inst with \p NewOffset, if the CFI has
/// an offset.
@@ -484,22 +485,23 @@ private:
/// InsertionPoint for other instructions that need to be inserted at the same
/// original location, since this insertion may have invalidated the previous
/// location.
- BBIterTy processInsertion(BBIterTy InsertionPoint, BinaryBasicBlock *CurBB,
- const WorklistItem &Item, int64_t SPVal,
- int64_t FPVal);
+ Expected<BBIterTy> processInsertion(BBIterTy InsertionPoint,
+ BinaryBasicBlock *CurBB,
+ const WorklistItem &Item, int64_t SPVal,
+ int64_t FPVal);
/// Auxiliary function to processInsertions(), helping perform all the
/// insertion tasks in the todo list associated with a single insertion point.
/// Return true if at least one insertion was performed.
- BBIterTy processInsertionsList(BBIterTy InsertionPoint,
- BinaryBasicBlock *CurBB,
- std::vector<WorklistItem> &TodoList,
- int64_t SPVal, int64_t FPVal);
+ Expected<BBIterTy> processInsertionsList(BBIterTy InsertionPoint,
+ BinaryBasicBlock *CurBB,
+ std::vector<WorklistItem> &TodoList,
+ int64_t SPVal, int64_t FPVal);
/// Apply all insertion todo tasks regarding insertion of new stores/loads or
/// push/pops at annotated points. Return false if the entire function had
/// no todo tasks annotation and this pass has nothing to do.
- bool processInsertions();
+ Expected<bool> processInsertions();
/// Apply all deletion todo tasks (or tasks to change a push/pop to a memory
/// access no-op)
@@ -519,9 +521,9 @@ public:
BC.MIB->removeAnnotation(Inst, getAnnotationIndex());
}
- bool perform(bool HotOnly = false);
+ Expected<bool> perform(bool HotOnly = false);
- static void printStats();
+ static void printStats(BinaryContext &BC);
};
} // end namespace bolt
diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h
index 28e9e79d1b8f..8bdc48b68eb7 100644
--- a/bolt/include/bolt/Passes/SplitFunctions.h
+++ b/bolt/include/bolt/Passes/SplitFunctions.h
@@ -104,7 +104,7 @@ public:
const char *getName() const override { return "split-functions"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/StokeInfo.h b/bolt/include/bolt/Passes/StokeInfo.h
index 75cfa1e7de43..76417e6a2c3b 100644
--- a/bolt/include/bolt/Passes/StokeInfo.h
+++ b/bolt/include/bolt/Passes/StokeInfo.h
@@ -120,7 +120,7 @@ public:
bool checkFunction(BinaryFunction &BF, DataflowInfoManager &DInfo,
RegAnalysis &RA, StokeFuncInfo &FuncInfo);
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/TailDuplication.h b/bolt/include/bolt/Passes/TailDuplication.h
index b3f1d7b7d964..a2fcab0720ca 100644
--- a/bolt/include/bolt/Passes/TailDuplication.h
+++ b/bolt/include/bolt/Passes/TailDuplication.h
@@ -145,7 +145,7 @@ public:
const char *getName() const override { return "tail duplication"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/ThreeWayBranch.h b/bolt/include/bolt/Passes/ThreeWayBranch.h
index 3eabf1b27e9d..9abf4c34134f 100644
--- a/bolt/include/bolt/Passes/ThreeWayBranch.h
+++ b/bolt/include/bolt/Passes/ThreeWayBranch.h
@@ -32,7 +32,7 @@ public:
const char *getName() const override { return "three way branch"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Passes/ValidateInternalCalls.h b/bolt/include/bolt/Passes/ValidateInternalCalls.h
index 137b83b0179f..0cdb8584f92d 100644
--- a/bolt/include/bolt/Passes/ValidateInternalCalls.h
+++ b/bolt/include/bolt/Passes/ValidateInternalCalls.h
@@ -54,7 +54,7 @@ public:
const char *getName() const override { return "validate-internal-calls"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
private:
/// Fix the CFG to take into consideration internal calls that do not
diff --git a/bolt/include/bolt/Passes/ValidateMemRefs.h b/bolt/include/bolt/Passes/ValidateMemRefs.h
index d33862cf7b16..90acce370249 100644
--- a/bolt/include/bolt/Passes/ValidateMemRefs.h
+++ b/bolt/include/bolt/Passes/ValidateMemRefs.h
@@ -25,7 +25,7 @@ public:
const char *getName() const override { return "validate-mem-refs"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
private:
bool checkAndFixJTReference(BinaryFunction &BF, MCInst &Inst,
diff --git a/bolt/include/bolt/Passes/VeneerElimination.h b/bolt/include/bolt/Passes/VeneerElimination.h
index 10c849674ab6..9ba10408a4c8 100644
--- a/bolt/include/bolt/Passes/VeneerElimination.h
+++ b/bolt/include/bolt/Passes/VeneerElimination.h
@@ -22,7 +22,7 @@ public:
const char *getName() const override { return "veneer-elimination"; }
- void runOnFunctions(BinaryContext &BC) override;
+ Error runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index fa29ece3287a..b9d1e865646b 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -85,7 +85,7 @@ public:
/// Read the serialized address translation tables and load them internally
/// in memory. Return a parse error if failed.
- std::error_code parse(StringRef Buf);
+ std::error_code parse(raw_ostream &OS, StringRef Buf);
/// Dump the parsed address translation tables
void dump(raw_ostream &OS);
diff --git a/bolt/include/bolt/Rewrite/BinaryPassManager.h b/bolt/include/bolt/Rewrite/BinaryPassManager.h
index 84ab192c415a..2297c3bf52fd 100644
--- a/bolt/include/bolt/Rewrite/BinaryPassManager.h
+++ b/bolt/include/bolt/Rewrite/BinaryPassManager.h
@@ -46,10 +46,10 @@ public:
}
/// Run all registered passes in the order they were added.
- void runPasses();
+ Error runPasses();
/// Runs all enabled implemented passes on all functions.
- static void runAllPasses(BinaryContext &BC);
+ static Error runAllPasses(BinaryContext &BC);
};
} // namespace bolt
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 170da78846b8..97ab65cd5a4a 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -47,11 +47,14 @@ public:
// construction. Constructors can’t return errors, so clients must test \p Err
// after the object is constructed. Use `create` method instead.
RewriteInstance(llvm::object::ELFObjectFileBase *File, const int Argc,
- const char *const *Argv, StringRef ToolPath, Error &Err);
+ const char *const *Argv, StringRef ToolPath,
+ raw_ostream &Stdout, raw_ostream &Stderr, Error &Err);
static Expected<std::unique_ptr<RewriteInstance>>
create(llvm::object::ELFObjectFileBase *File, const int Argc,
- const char *const *Argv, StringRef ToolPath);
+ const char *const *Argv, StringRef ToolPath,
+ raw_ostream &Stdout = llvm::outs(),
+ raw_ostream &Stderr = llvm::errs());
~RewriteInstance();
/// Assign profile from \p Filename to this instance.
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index 984bc6dbd220..4a83fece0e43 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -92,8 +92,8 @@ bool BinaryBasicBlock::validateSuccessorInvariants() {
// Work on the assumption that jump table blocks don't
// have a conditional successor.
Valid = false;
- errs() << "BOLT-WARNING: Jump table successor " << Succ->getName()
- << " not contained in the jump table.\n";
+ BC.errs() << "BOLT-WARNING: Jump table successor " << Succ->getName()
+ << " not contained in the jump table.\n";
}
}
// If there are any leftover entries in the jump table, they
@@ -103,8 +103,8 @@ bool BinaryBasicBlock::validateSuccessorInvariants() {
Valid &= (Sym == Function->getFunctionEndLabel() ||
Sym == Function->getFunctionEndLabel(getFragmentNum()));
if (!Valid) {
- errs() << "BOLT-WARNING: Jump table contains illegal entry: "
- << Sym->getName() << "\n";
+ BC.errs() << "BOLT-WARNING: Jump table contains illegal entry: "
+ << Sym->getName() << "\n";
}
}
}
@@ -141,11 +141,11 @@ bool BinaryBasicBlock::validateSuccessorInvariants() {
}
}
if (!Valid) {
- errs() << "BOLT-WARNING: CFG invalid in " << *getFunction() << " @ "
- << getName() << "\n";
+ BC.errs() << "BOLT-WARNING: CFG invalid in " << *getFunction() << " @ "
+ << getName() << "\n";
if (JT) {
- errs() << "Jump Table instruction addr = 0x"
- << Twine::utohexstr(BC.MIB->getJumpTable(*Inst)) << "\n";
+ BC.errs() << "Jump Table instruction addr = 0x"
+ << Twine::utohexstr(BC.MIB->getJumpTable(*Inst)) << "\n";
JT->print(errs());
}
getFunction()->dump();
@@ -520,9 +520,9 @@ uint32_t BinaryBasicBlock::getNumPseudos() const {
++N;
if (N != NumPseudos) {
- errs() << "BOLT-ERROR: instructions for basic block " << getName()
- << " in function " << *Function << ": calculated pseudos " << N
- << ", set pseudos " << NumPseudos << ", size " << size() << '\n';
+ BC.errs() << "BOLT-ERROR: instructions for basic block " << getName()
+ << " in function " << *Function << ": calculated pseudos " << N
+ << ", set pseudos " << NumPseudos << ", size " << size() << '\n';
llvm_unreachable("pseudos mismatch");
}
#endif
@@ -559,18 +559,18 @@ BinaryBasicBlock::getBranchStats(const BinaryBasicBlock *Succ) const {
void BinaryBasicBlock::dump() const {
BinaryContext &BC = Function->getBinaryContext();
if (Label)
- outs() << Label->getName() << ":\n";
- BC.printInstructions(outs(), Instructions.begin(), Instructions.end(),
+ BC.outs() << Label->getName() << ":\n";
+ BC.printInstructions(BC.outs(), Instructions.begin(), Instructions.end(),
getOffset(), Function);
- outs() << "preds:";
+ BC.outs() << "preds:";
for (auto itr = pred_begin(); itr != pred_end(); ++itr) {
- outs() << " " << (*itr)->getName();
+ BC.outs() << " " << (*itr)->getName();
}
- outs() << "\nsuccs:";
+ BC.outs() << "\nsuccs:";
for (auto itr = succ_begin(); itr != succ_end(); ++itr) {
- outs() << " " << (*itr)->getName();
+ BC.outs() << " " << (*itr)->getName();
}
- outs() << "\n";
+ BC.outs() << "\n";
}
uint64_t BinaryBasicBlock::estimateSize(const MCCodeEmitter *Emitter) const {
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index df835d287680..d544ece13a83 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -83,6 +83,46 @@ cl::opt<std::string> CompDirOverride(
namespace llvm {
namespace bolt {
+char BOLTError::ID = 0;
+
+BOLTError::BOLTError(bool IsFatal, const Twine &S)
+ : IsFatal(IsFatal), Msg(S.str()) {}
+
+void BOLTError::log(raw_ostream &OS) const {
+ if (IsFatal)
+ OS << "FATAL ";
+ StringRef ErrMsg = StringRef(Msg);
+ // Prepend our error prefix if it is missing
+ if (ErrMsg.empty()) {
+ OS << "BOLT-ERROR\n";
+ } else {
+ if (!ErrMsg.starts_with("BOLT-ERROR"))
+ OS << "BOLT-ERROR: ";
+ OS << ErrMsg << "\n";
+ }
+}
+
+std::error_code BOLTError::convertToErrorCode() const {
+ return inconvertibleErrorCode();
+}
+
+Error createNonFatalBOLTError(const Twine &S) {
+ return make_error<BOLTError>(/*IsFatal*/ false, S);
+}
+
+Error createFatalBOLTError(const Twine &S) {
+ return make_error<BOLTError>(/*IsFatal*/ true, S);
+}
+
+void BinaryContext::logBOLTErrorsAndQuitOnFatal(Error E) {
+ handleAllErrors(Error(std::move(E)), [&](const BOLTError &E) {
+ if (!E.getMessage().empty())
+ E.log(this->errs());
+ if (E.isFatal())
+ exit(1);
+ });
+}
+
BinaryContext::BinaryContext(std::unique_ptr<MCContext> Ctx,
std::unique_ptr<DWARFContext> DwCtx,
std::unique_ptr<Triple> TheTriple,
@@ -96,13 +136,15 @@ BinaryContext::BinaryContext(std::unique_ptr<MCContext> Ctx,
std::unique_ptr<const MCInstrAnalysis> MIA,
std::unique_ptr<MCPlusBuilder> MIB,
std::unique_ptr<const MCRegisterInfo> MRI,
- std::unique_ptr<MCDisassembler> DisAsm)
+ std::unique_ptr<MCDisassembler> DisAsm,
+ JournalingStreams Logger)
: Ctx(std::move(Ctx)), DwCtx(std::move(DwCtx)),
TheTriple(std::move(TheTriple)), TheTarget(TheTarget),
TripleName(TripleName), MCE(std::move(MCE)), MOFI(std::move(MOFI)),
AsmInfo(std::move(AsmInfo)), MII(std::move(MII)), STI(std::move(STI)),
InstPrinter(std::move(InstPrinter)), MIA(std::move(MIA)),
- MIB(std::move(MIB)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)) {
+ MIB(std::move(MIB)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)),
+ Logger(Logger) {
Relocation::Arch = this->TheTriple->getArch();
RegularPageSize = isAArch64() ? RegularPageSizeAArch64 : RegularPageSizeX86;
PageAlign = opts::NoHugePages ? RegularPageSize : HugePageSize;
@@ -122,7 +164,8 @@ BinaryContext::~BinaryContext() {
/// triple \p TripleName.
Expected<std::unique_ptr<BinaryContext>>
BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC,
- std::unique_ptr<DWARFContext> DwCtx) {
+ std::unique_ptr<DWARFContext> DwCtx,
+ JournalingStreams Logger) {
StringRef ArchName = "";
std::string FeaturesStr = "";
switch (File->getArch()) {
@@ -241,17 +284,12 @@ BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC,
std::unique_ptr<MCCodeEmitter> MCE(
TheTarget->createMCCodeEmitter(*MII, *Ctx));
- // Make sure we don't miss any output on core dumps.
- outs().SetUnbuffered();
- errs().SetUnbuffered();
- dbgs().SetUnbuffered();
-
auto BC = std::make_unique<BinaryContext>(
std::move(Ctx), std::move(DwCtx), std::move(TheTriple), TheTarget,
std::string(TripleName), std::move(MCE), std::move(MOFI),
std::move(AsmInfo), std::move(MII), std::move(STI),
std::move(InstructionPrinter), std::move(MIA), nullptr, std::move(MRI),
- std::move(DisAsm));
+ std::move(DisAsm), Logger);
BC->LSDAEncoding = LSDAEncoding;
@@ -304,9 +342,9 @@ bool BinaryContext::validateObjectNesting() const {
Itr->second->containsRange(Next->second->getAddress(),
Next->second->getSize())) {
if (Next->second->Parent != Itr->second) {
- errs() << "BOLT-WARNING: object nesting incorrect for:\n"
- << "BOLT-WARNING: " << *Itr->second << "\n"
- << "BOLT-WARNING: " << *Next->second << "\n";
+ this->errs() << "BOLT-WARNING: object nesting incorrect for:\n"
+ << "BOLT-WARNING: " << *Itr->second << "\n"
+ << "BOLT-WARNING: " << *Next->second << "\n";
Valid = false;
}
++Next;
@@ -323,14 +361,16 @@ bool BinaryContext::validateHoles() const {
uint64_t RelAddr = Rel.Offset + Section.getAddress();
const BinaryData *BD = getBinaryDataContainingAddress(RelAddr);
if (!BD) {
- errs() << "BOLT-WARNING: no BinaryData found for relocation at address"
- << " 0x" << Twine::utohexstr(RelAddr) << " in "
- << Section.getName() << "\n";
+ this->errs()
+ << "BOLT-WARNING: no BinaryData found for relocation at address"
+ << " 0x" << Twine::utohexstr(RelAddr) << " in " << Section.getName()
+ << "\n";
Valid = false;
} else if (!BD->getAtomicRoot()) {
- errs() << "BOLT-WARNING: no atomic BinaryData found for relocation at "
- << "address 0x" << Twine::utohexstr(RelAddr) << " in "
- << Section.getName() << "\n";
+ this->errs()
+ << "BOLT-WARNING: no atomic BinaryData found for relocation at "
+ << "address 0x" << Twine::utohexstr(RelAddr) << " in "
+ << Section.getName() << "\n";
Valid = false;
}
}
@@ -438,8 +478,9 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
// The address could potentially escape. Mark it as another entry
// point into the function.
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: potentially escaped address 0x"
- << Twine::utohexstr(Address) << " in function " << BF << '\n';
+ this->outs() << "BOLT-INFO: potentially escaped address 0x"
+ << Twine::utohexstr(Address) << " in function " << BF
+ << '\n';
}
BF.HasInternalLabelReference = true;
return std::make_pair(
@@ -482,9 +523,9 @@ MemoryContentsType BinaryContext::analyzeMemoryAt(uint64_t Address,
// internal function addresses to escape the function scope - we
// consider it a tail call.
if (opts::Verbosity > 1) {
- errs() << "BOLT-WARNING: no section for address 0x"
- << Twine::utohexstr(Address) << " referenced from function " << BF
- << '\n';
+ this->errs() << "BOLT-WARNING: no section for address 0x"
+ << Twine::utohexstr(Address) << " referenced from function "
+ << BF << '\n';
}
return MemoryContentsType::UNKNOWN;
}
@@ -730,7 +771,7 @@ void BinaryContext::skipMarkedFragments() {
assert(FragmentsToSkip.count(BF) &&
"internal error in traversing function fragments");
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: Ignoring " << BF->getPrintName() << '\n';
+ this->errs() << "BOLT-WARNING: Ignoring " << BF->getPrintName() << '\n';
BF->setSimple(false);
BF->setHasIndirectTargetToSplitFragment(true);
@@ -738,9 +779,9 @@ void BinaryContext::skipMarkedFragments() {
llvm::for_each(BF->ParentFragments, addToWorklist);
}
if (!FragmentsToSkip.empty())
- errs() << "BOLT-WARNING: skipped " << FragmentsToSkip.size() << " function"
- << (FragmentsToSkip.size() == 1 ? "" : "s")
- << " due to cold fragments\n";
+ this->errs() << "BOLT-WARNING: skipped " << FragmentsToSkip.size()
+ << " function" << (FragmentsToSkip.size() == 1 ? "" : "s")
+ << " due to cold fragments\n";
}
MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix,
@@ -791,10 +832,10 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
// Duplicate the entry for the parent function for easy access
JT->Parents.push_back(&Function);
if (opts::Verbosity > 2) {
- outs() << "BOLT-INFO: Multiple fragments access same jump table: "
- << JT->Parents[0]->getPrintName() << "; "
- << Function.getPrintName() << "\n";
- JT->print(outs());
+ this->outs() << "BOLT-INFO: Multiple fragments access same jump table: "
+ << JT->Parents[0]->getPrintName() << "; "
+ << Function.getPrintName() << "\n";
+ JT->print(this->outs());
}
Function.JumpTables.emplace(Address, JT);
JT->Parents[0]->setHasIndirectTargetToSplitFragment(true);
@@ -832,7 +873,7 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
*getSectionForAddress(Address));
JT->Parents.push_back(&Function);
if (opts::Verbosity > 2)
- JT->print(outs());
+ JT->print(this->outs());
JumpTables.emplace(Address, JT);
// Duplicate the entry for the parent function for easy access.
@@ -961,12 +1002,13 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) {
return true;
if (opts::Verbosity >= 1) {
- errs() << "BOLT-WARNING: bad padding at address 0x"
- << Twine::utohexstr(BF.getAddress() + BF.getSize())
- << " starting at offset " << (Offset - BF.getSize())
- << " in function " << BF << '\n'
- << FunctionData->slice(BF.getSize(), BF.getMaxSize() - BF.getSize())
- << '\n';
+ this->errs() << "BOLT-WARNING: bad padding at address 0x"
+ << Twine::utohexstr(BF.getAddress() + BF.getSize())
+ << " starting at offset " << (Offset - BF.getSize())
+ << " in function " << BF << '\n'
+ << FunctionData->slice(BF.getSize(),
+ BF.getMaxSize() - BF.getSize())
+ << '\n';
}
return false;
@@ -981,8 +1023,8 @@ void BinaryContext::adjustCodePadding() {
if (!hasValidCodePadding(BF)) {
if (HasRelocations) {
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: function " << BF
- << " has invalid padding. Ignoring the function.\n";
+ this->outs() << "BOLT-INFO: function " << BF
+ << " has invalid padding. Ignoring the function.\n";
}
BF.setIgnored();
} else {
@@ -1130,8 +1172,8 @@ void BinaryContext::generateSymbolHashes() {
// (i.e. all zeros or a "hole")
if (!isPadding(BD)) {
if (opts::Verbosity) {
- errs() << "BOLT-WARNING: collision detected when hashing " << BD
- << " with new name (" << NewName << "), skipping.\n";
+ this->errs() << "BOLT-WARNING: collision detected when hashing " << BD
+ << " with new name (" << NewName << "), skipping.\n";
}
++NumCollisions;
}
@@ -1141,11 +1183,11 @@ void BinaryContext::generateSymbolHashes() {
GlobalSymbols[NewName] = &BD;
}
if (NumCollisions) {
- errs() << "BOLT-WARNING: " << NumCollisions
- << " collisions detected while hashing binary objects";
+ this->errs() << "BOLT-WARNING: " << NumCollisions
+ << " collisions detected while hashing binary objects";
if (!opts::Verbosity)
- errs() << ". Use -v=1 to see the list.";
- errs() << '\n';
+ this->errs() << ". Use -v=1 to see the list.";
+ this->errs() << '\n';
}
}
@@ -1161,8 +1203,8 @@ bool BinaryContext::registerFragment(BinaryFunction &TargetFunction,
Function.setSimple(false);
}
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: marking " << TargetFunction << " as a fragment of "
- << Function << '\n';
+ this->outs() << "BOLT-INFO: marking " << TargetFunction
+ << " as a fragment of " << Function << '\n';
}
return true;
}
@@ -1276,10 +1318,11 @@ void BinaryContext::processInterproceduralReferences() {
if (TargetFunction) {
if (TargetFunction->isFragment() &&
!TargetFunction->isChildOf(Function)) {
- errs() << "BOLT-WARNING: interprocedural reference between unrelated "
- "fragments: "
- << Function.getPrintName() << " and "
- << TargetFunction->getPrintName() << '\n';
+ this->errs()
+ << "BOLT-WARNING: interprocedural reference between unrelated "
+ "fragments: "
+ << Function.getPrintName() << " and "
+ << TargetFunction->getPrintName() << '\n';
}
if (uint64_t Offset = Address - TargetFunction->getAddress())
TargetFunction->addEntryPointAtOffset(Offset);
@@ -1305,9 +1348,10 @@ void BinaryContext::processInterproceduralReferences() {
continue;
if (opts::processAllFunctions()) {
- errs() << "BOLT-ERROR: cannot process binaries with unmarked "
- << "object in code at address 0x" << Twine::utohexstr(Address)
- << " belonging to section " << SectionName << " in current mode\n";
+ this->errs() << "BOLT-ERROR: cannot process binaries with unmarked "
+ << "object in code at address 0x"
+ << Twine::utohexstr(Address) << " belonging to section "
+ << SectionName << " in current mode\n";
exit(1);
}
@@ -1317,9 +1361,10 @@ void BinaryContext::processInterproceduralReferences() {
// We are not going to overwrite non-simple functions, but for simple
// ones - adjust the padding size.
if (TargetFunction && TargetFunction->isSimple()) {
- errs() << "BOLT-WARNING: function " << *TargetFunction
- << " has an object detected in a padding region at address 0x"
- << Twine::utohexstr(Address) << '\n';
+ this->errs()
+ << "BOLT-WARNING: function " << *TargetFunction
+ << " has an object detected in a padding region at address 0x"
+ << Twine::utohexstr(Address) << '\n';
TargetFunction->setMaxSize(TargetFunction->getSize());
}
}
@@ -1336,7 +1381,8 @@ void BinaryContext::postProcessSymbolTable() {
BD->getName().starts_with("DATAat")) &&
!BD->getParent() && !BD->getSize() && !BD->isAbsolute() &&
BD->getSection()) {
- errs() << "BOLT-WARNING: zero-sized top level symbol: " << *BD << "\n";
+ this->errs() << "BOLT-WARNING: zero-sized top level symbol: " << *BD
+ << "\n";
Valid = false;
}
}
@@ -1592,17 +1638,18 @@ void BinaryContext::preprocessDWODebugInfo() {
DWARFUnit *DWOCU =
DwarfUnit->getNonSkeletonUnitDIE(false, AbsolutePath).getDwarfUnit();
if (!DWOCU->isDWOUnit()) {
- outs() << "BOLT-WARNING: Debug Fission: DWO debug information for "
- << DWOName
- << " was not retrieved and won't be updated. Please check "
- "relative path.\n";
+ this->outs()
+ << "BOLT-WARNING: Debug Fission: DWO debug information for "
+ << DWOName
+ << " was not retrieved and won't be updated. Please check "
+ "relative path.\n";
continue;
}
DWOCUs[*DWOId] = DWOCU;
}
}
if (!DWOCUs.empty())
- outs() << "BOLT-INFO: processing split DWARF\n";
+ this->outs() << "BOLT-INFO: processing split DWARF\n";
}
void BinaryContext::preprocessDebugInfo() {
@@ -1663,8 +1710,8 @@ void BinaryContext::preprocessDebugInfo() {
}
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: " << ProcessedCUs.size() << " out of "
- << DwCtx->getNumCompileUnits() << " CUs will be updated\n";
+ this->outs() << "BOLT-INFO: " << ProcessedCUs.size() << " out of "
+ << DwCtx->getNumCompileUnits() << " CUs will be updated\n";
}
preprocessDWODebugInfo();
@@ -2245,23 +2292,26 @@ BinaryFunction *BinaryContext::getFunctionForSymbol(const MCSymbol *Symbol,
return BF;
}
-void BinaryContext::exitWithBugReport(StringRef Message,
- const BinaryFunction &Function) const {
- errs() << "=======================================\n";
- errs() << "BOLT is unable to proceed because it couldn't properly understand "
- "this function.\n";
- errs() << "If you are running the most recent version of BOLT, you may "
- "want to "
- "report this and paste this dump.\nPlease check that there is no "
- "sensitive contents being shared in this dump.\n";
- errs() << "\nOffending function: " << Function.getPrintName() << "\n\n";
- ScopedPrinter SP(errs());
+std::string
+BinaryContext::generateBugReportMessage(StringRef Message,
+ const BinaryFunction &Function) const {
+ std::string Msg;
+ raw_string_ostream SS(Msg);
+ SS << "=======================================\n";
+ SS << "BOLT is unable to proceed because it couldn't properly understand "
+ "this function.\n";
+ SS << "If you are running the most recent version of BOLT, you may "
+ "want to "
+ "report this and paste this dump.\nPlease check that there is no "
+ "sensitive contents being shared in this dump.\n";
+ SS << "\nOffending function: " << Function.getPrintName() << "\n\n";
+ ScopedPrinter SP(SS);
SP.printBinaryBlock("Function contents", *Function.getData());
- errs() << "\n";
- Function.dump();
- errs() << "ERROR: " << Message;
- errs() << "\n=======================================\n";
- exit(1);
+ SS << "\n";
+ const_cast<BinaryFunction &>(Function).print(SS, "");
+ SS << "ERROR: " << Message;
+ SS << "\n=======================================\n";
+ return Msg;
}
BinaryFunction *
@@ -2399,9 +2449,9 @@ bool BinaryContext::validateInstructionEncoding(
auto OutputSequence = ArrayRef<uint8_t>((uint8_t *)Code.data(), Code.size());
if (InputSequence != OutputSequence) {
if (opts::Verbosity > 1) {
- errs() << "BOLT-WARNING: mismatched encoding detected\n"
- << " input: " << InputSequence << '\n'
- << " output: " << OutputSequence << '\n';
+ this->errs() << "BOLT-WARNING: mismatched encoding detected\n"
+ << " input: " << InputSequence << '\n'
+ << " output: " << OutputSequence << '\n';
}
return false;
}
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 3bff3125a57a..d4b668c1d7e7 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -567,7 +567,8 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
BF.getAddress() - BF.getOriginSection()->getAddress(), BF.getMaxSize());
if (opts::Verbosity && !OnBehalfOf)
- outs() << "BOLT-INFO: emitting constant island for function " << BF << "\n";
+ BC.outs() << "BOLT-INFO: emitting constant island for function " << BF
+ << "\n";
// We split the island into smaller blocks and output labels between them.
auto IS = Islands.Offsets.begin();
@@ -766,7 +767,7 @@ void BinaryEmitter::emitJumpTables(const BinaryFunction &BF) {
return;
if (opts::PrintJumpTables)
- outs() << "BOLT-INFO: jump tables for function " << BF << ":\n";
+ BC.outs() << "BOLT-INFO: jump tables for function " << BF << ":\n";
for (auto &JTI : BF.jumpTables()) {
JumpTable &JT = *JTI.second;
@@ -774,7 +775,7 @@ void BinaryEmitter::emitJumpTables(const BinaryFunction &BF) {
if (JT.Parents.size() > 1 && JT.Parents[0] != &BF)
continue;
if (opts::PrintJumpTables)
- JT.print(outs());
+ JT.print(BC.outs());
if (opts::JumpTables == JTS_BASIC && BC.HasRelocations) {
JT.updateOriginal();
} else {
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 0ac47a53a446..54f2f9d972a4 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -880,9 +880,9 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
// internal function addresses to escape the function scope - we
// consider it a tail call.
if (opts::Verbosity >= 1) {
- errs() << "BOLT-WARNING: no section for address 0x"
- << Twine::utohexstr(ArrayStart) << " referenced from function "
- << *this << '\n';
+ BC.errs() << "BOLT-WARNING: no section for address 0x"
+ << Twine::utohexstr(ArrayStart) << " referenced from function "
+ << *this << '\n';
}
return IndirectBranchType::POSSIBLE_TAIL_CALL;
}
@@ -899,11 +899,11 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
if (BC.getSectionForAddress(ArrayStart)->isWritable())
return IndirectBranchType::UNKNOWN;
- outs() << "BOLT-INFO: fixed indirect branch detected in " << *this
- << " at 0x" << Twine::utohexstr(getAddress() + Offset)
- << " referencing data at 0x" << Twine::utohexstr(ArrayStart)
- << " the destination value is 0x" << Twine::utohexstr(*Value)
- << '\n';
+ BC.outs() << "BOLT-INFO: fixed indirect branch detected in " << *this
+ << " at 0x" << Twine::utohexstr(getAddress() + Offset)
+ << " referencing data at 0x" << Twine::utohexstr(ArrayStart)
+ << " the destination value is 0x" << Twine::utohexstr(*Value)
+ << '\n';
TargetAddress = *Value;
return BranchType;
@@ -1021,28 +1021,29 @@ bool BinaryFunction::isZeroPaddingAt(uint64_t Offset) const {
return true;
}
-void BinaryFunction::handlePCRelOperand(MCInst &Instruction, uint64_t Address,
- uint64_t Size) {
+Error BinaryFunction::handlePCRelOperand(MCInst &Instruction, uint64_t Address,
+ uint64_t Size) {
auto &MIB = BC.MIB;
uint64_t TargetAddress = 0;
if (!MIB->evaluateMemOperandTarget(Instruction, TargetAddress, Address,
Size)) {
- errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n";
- BC.InstPrinter->printInst(&Instruction, 0, "", *BC.STI, errs());
- errs() << '\n';
- Instruction.dump_pretty(errs(), BC.InstPrinter.get());
- errs() << '\n';
- errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x"
- << Twine::utohexstr(Address) << ". Skipping function " << *this
- << ".\n";
+ std::string Msg;
+ raw_string_ostream SS(Msg);
+ SS << "BOLT-ERROR: PC-relative operand can't be evaluated:\n";
+ BC.InstPrinter->printInst(&Instruction, 0, "", *BC.STI, SS);
+ SS << '\n';
+ Instruction.dump_pretty(SS, BC.InstPrinter.get());
+ SS << '\n';
+ SS << "BOLT-ERROR: cannot handle PC-relative operand at 0x"
+ << Twine::utohexstr(Address) << ". Skipping function " << *this << ".\n";
if (BC.HasRelocations)
- exit(1);
+ return createFatalBOLTError(Msg);
IsSimple = false;
- return;
+ return createNonFatalBOLTError(Msg);
}
if (TargetAddress == 0 && opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: PC-relative operand is zero in function " << *this
- << '\n';
+ BC.outs() << "BOLT-INFO: PC-relative operand is zero in function " << *this
+ << '\n';
}
const MCSymbol *TargetSymbol;
@@ -1054,6 +1055,7 @@ void BinaryFunction::handlePCRelOperand(MCInst &Instruction, uint64_t Address,
Instruction, TargetSymbol, static_cast<int64_t>(TargetOffset), &*BC.Ctx);
(void)ReplaceSuccess;
assert(ReplaceSuccess && "Failed to replace mem operand with symbol+off.");
+ return Error::success();
}
MCSymbol *BinaryFunction::handleExternalReference(MCInst &Instruction,
@@ -1066,9 +1068,9 @@ MCSymbol *BinaryFunction::handleExternalReference(MCInst &Instruction,
const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
BC.addInterproceduralReference(this, TargetAddress);
if (opts::Verbosity >= 2 && !IsCall && Size == 2 && !BC.HasRelocations) {
- errs() << "BOLT-WARNING: relaxed tail call detected at 0x"
- << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this
- << ". Code size will be increased.\n";
+ BC.errs() << "BOLT-WARNING: relaxed tail call detected at 0x"
+ << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this
+ << ". Code size will be increased.\n";
}
assert(!MIB->isTailCall(Instruction) &&
@@ -1082,9 +1084,9 @@ MCSymbol *BinaryFunction::handleExternalReference(MCInst &Instruction,
assert(MIB->isConditionalBranch(Instruction) &&
"unknown tail call instruction");
if (opts::Verbosity >= 2) {
- errs() << "BOLT-WARNING: conditional tail call detected in "
- << "function " << *this << " at 0x"
- << Twine::utohexstr(AbsoluteInstrAddr) << ".\n";
+ BC.errs() << "BOLT-WARNING: conditional tail call detected in "
+ << "function " << *this << " at 0x"
+ << Twine::utohexstr(AbsoluteInstrAddr) << ".\n";
}
}
IsCall = true;
@@ -1094,8 +1096,8 @@ MCSymbol *BinaryFunction::handleExternalReference(MCInst &Instruction,
// We actually see calls to address 0 in presence of weak
// symbols originating from libraries. This code is never meant
// to be executed.
- outs() << "BOLT-INFO: Function " << *this
- << " has a call to address zero.\n";
+ BC.outs() << "BOLT-INFO: Function " << *this
+ << " has a call to address zero.\n";
}
return BC.getOrCreateGlobalSymbol(TargetAddress, "FUNCat");
@@ -1164,7 +1166,7 @@ void BinaryFunction::handleAArch64IndirectCall(MCInst &Instruction,
}
}
-bool BinaryFunction::disassemble() {
+Error BinaryFunction::disassemble() {
NamedRegionTimer T("disassemble", "Disassemble function", "buildfuncs",
"Build Binary Functions", opts::TimeBuild);
ErrorOr<ArrayRef<uint8_t>> ErrorOrFunctionData = getData();
@@ -1208,10 +1210,11 @@ bool BinaryFunction::disassemble() {
if (isZeroPaddingAt(Offset))
break;
- errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
- << Twine::utohexstr(Offset) << " (address 0x"
- << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << *this
- << '\n';
+ BC.errs()
+ << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
+ << Twine::utohexstr(Offset) << " (address 0x"
+ << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << *this
+ << '\n';
// Some AVX-512 instructions could not be disassembled at all.
if (BC.HasRelocations && opts::TrapOnAVX512 && BC.isX86()) {
setTrapOnEntry();
@@ -1227,10 +1230,10 @@ bool BinaryFunction::disassemble() {
if (opts::CheckEncoding && !BC.MIB->isBranch(Instruction) &&
!BC.MIB->isCall(Instruction) && !BC.MIB->isNoop(Instruction)) {
if (!BC.validateInstructionEncoding(FunctionData.slice(Offset, Size))) {
- errs() << "BOLT-WARNING: mismatching LLVM encoding detected in "
- << "function " << *this << " for instruction :\n";
- BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
- errs() << '\n';
+ BC.errs() << "BOLT-WARNING: mismatching LLVM encoding detected in "
+ << "function " << *this << " for instruction :\n";
+ BC.printInstruction(BC.errs(), Instruction, AbsoluteInstrAddr);
+ BC.errs() << '\n';
}
}
@@ -1243,10 +1246,10 @@ bool BinaryFunction::disassemble() {
}
if (!BC.validateInstructionEncoding(FunctionData.slice(Offset, Size))) {
- errs() << "BOLT-WARNING: internal assembler/disassembler error "
- "detected for AVX512 instruction:\n";
- BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
- errs() << " in function " << *this << '\n';
+ BC.errs() << "BOLT-WARNING: internal assembler/disassembler error "
+ "detected for AVX512 instruction:\n";
+ BC.printInstruction(BC.errs(), Instruction, AbsoluteInstrAddr);
+ BC.errs() << " in function " << *this << '\n';
setIgnored();
break;
}
@@ -1282,9 +1285,9 @@ bool BinaryFunction::disassemble() {
// function, so preserve the function as is for now.
PreserveNops = true;
} else {
- errs() << "BOLT-WARNING: internal call detected at 0x"
- << Twine::utohexstr(AbsoluteInstrAddr) << " in function "
- << *this << ". Skipping.\n";
+ BC.errs() << "BOLT-WARNING: internal call detected at 0x"
+ << Twine::utohexstr(AbsoluteInstrAddr)
+ << " in function " << *this << ". Skipping.\n";
IsSimple = false;
}
}
@@ -1332,8 +1335,19 @@ bool BinaryFunction::disassemble() {
if (MIB->isIndirectBranch(Instruction))
handleIndirectBranch(Instruction, Size, Offset);
// Indirect call. We only need to fix it if the operand is RIP-relative.
- if (IsSimple && MIB->hasPCRelOperand(Instruction))
- handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size);
+ if (IsSimple && MIB->hasPCRelOperand(Instruction)) {
+ if (auto NewE = handleErrors(
+ handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size),
+ [&](const BOLTError &E) -> Error {
+ if (E.isFatal())
+ return Error(std::make_unique<BOLTError>(std::move(E)));
+ if (!E.getMessage().empty())
+ E.log(BC.errs());
+ return Error::success();
+ })) {
+ return Error(std::move(NewE));
+ }
+ }
if (BC.isAArch64())
handleAArch64IndirectCall(Instruction, Offset);
@@ -1372,8 +1386,18 @@ bool BinaryFunction::disassemble() {
UsedReloc = true;
}
- if (!BC.isRISCV() && MIB->hasPCRelOperand(Instruction) && !UsedReloc)
- handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size);
+ if (!BC.isRISCV() && MIB->hasPCRelOperand(Instruction) && !UsedReloc) {
+ if (auto NewE = handleErrors(
+ handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size),
+ [&](const BOLTError &E) -> Error {
+ if (E.isFatal())
+ return Error(std::make_unique<BOLTError>(std::move(E)));
+ if (!E.getMessage().empty())
+ E.log(BC.errs());
+ return Error::success();
+ }))
+ return Error(std::move(NewE));
+ }
}
add_instruction:
@@ -1413,12 +1437,12 @@ add_instruction:
if (!IsSimple) {
clearList(Instructions);
- return false;
+ return createNonFatalBOLTError("");
}
updateState(State::Disassembled);
- return true;
+ return Error::success();
}
bool BinaryFunction::scanExternalRefs() {
@@ -1467,10 +1491,11 @@ bool BinaryFunction::scanExternalRefs() {
FunctionData.slice(Offset),
AbsoluteInstrAddr, nulls())) {
if (opts::Verbosity >= 1 && !isZeroPaddingAt(Offset)) {
- errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
- << Twine::utohexstr(Offset) << " (address 0x"
- << Twine::utohexstr(AbsoluteInstrAddr) << ") in function "
- << *this << '\n';
+ BC.errs()
+ << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
+ << Twine::utohexstr(Offset) << " (address 0x"
+ << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << *this
+ << '\n';
}
Success = false;
DisassemblyFailed = true;
@@ -1598,7 +1623,7 @@ bool BinaryFunction::scanExternalRefs() {
HasExternalRefRelocations = true;
if (opts::Verbosity >= 1 && !Success)
- outs() << "BOLT-INFO: failed to scan refs for " << *this << '\n';
+ BC.outs() << "BOLT-INFO: failed to scan refs for " << *this << '\n';
return Success;
}
@@ -1631,9 +1656,9 @@ void BinaryFunction::postProcessEntryPoints() {
if (BC.isAArch64() && Offset == getSize())
continue;
- errs() << "BOLT-WARNING: reference in the middle of instruction "
- "detected in function "
- << *this << " at offset 0x" << Twine::utohexstr(Offset) << '\n';
+ BC.errs() << "BOLT-WARNING: reference in the middle of instruction "
+ "detected in function "
+ << *this << " at offset 0x" << Twine::utohexstr(Offset) << '\n';
if (BC.HasRelocations)
setIgnored();
setSimple(false);
@@ -1647,9 +1672,9 @@ void BinaryFunction::postProcessJumpTables() {
JumpTable &JT = *JTI.second;
if (JT.Type == JumpTable::JTT_PIC && opts::JumpTables == JTS_BASIC) {
opts::JumpTables = JTS_MOVE;
- outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was "
- "detected in function "
- << *this << '\n';
+ BC.outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was "
+ "detected in function "
+ << *this << '\n';
}
const uint64_t BDSize =
BC.getBinaryDataAtAddress(JT.getAddress())->getSize();
@@ -1764,15 +1789,15 @@ bool BinaryFunction::validateExternallyReferencedOffsets() {
continue;
if (opts::Verbosity >= 1) {
- errs() << "BOLT-WARNING: unclaimed data to code reference (possibly "
- << "an unrecognized jump table entry) to " << BB->getName()
- << " in " << *this << "\n";
+ BC.errs() << "BOLT-WARNING: unclaimed data to code reference (possibly "
+ << "an unrecognized jump table entry) to " << BB->getName()
+ << " in " << *this << "\n";
}
auto L = BC.scopeLock();
addEntryPoint(*BB);
} else {
- errs() << "BOLT-WARNING: unknown data to code reference to offset "
- << Twine::utohexstr(Destination) << " in " << *this << "\n";
+ BC.errs() << "BOLT-WARNING: unknown data to code reference to offset "
+ << Twine::utohexstr(Destination) << " in " << *this << "\n";
setIgnored();
}
HasUnclaimedReference = true;
@@ -1872,9 +1897,9 @@ bool BinaryFunction::postProcessIndirectBranches(
}
if (opts::Verbosity >= 2) {
- outs() << "BOLT-INFO: rejected potential indirect tail call in "
- << "function " << *this << " in basic block " << BB.getName()
- << ".\n";
+ BC.outs() << "BOLT-INFO: rejected potential indirect tail call in "
+ << "function " << *this << " in basic block " << BB.getName()
+ << ".\n";
LLVM_DEBUG(BC.printInstructions(dbgs(), BB.begin(), BB.end(),
BB.getOffset(), this, true));
}
@@ -1946,17 +1971,17 @@ void BinaryFunction::recomputeLandingPads() {
}
}
-bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
+Error BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
auto &MIB = BC.MIB;
if (!isSimple()) {
assert(!BC.HasRelocations &&
"cannot process file with non-simple function in relocs mode");
- return false;
+ return createNonFatalBOLTError("");
}
if (CurrentState != State::Disassembled)
- return false;
+ return createNonFatalBOLTError("");
assert(BasicBlocks.empty() && "basic block list should be empty");
assert((Labels.find(getFirstInstructionOffset()) != Labels.end()) &&
@@ -2093,7 +2118,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
if (BasicBlocks.empty()) {
setSimple(false);
- return false;
+ return createNonFatalBOLTError("");
}
// Intermediate dump.
@@ -2111,11 +2136,12 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second);
if (!FromBB || !ToBB) {
if (!FromBB)
- errs() << "BOLT-ERROR: cannot find BB containing the branch.\n";
+ BC.errs() << "BOLT-ERROR: cannot find BB containing the branch.\n";
if (!ToBB)
- errs() << "BOLT-ERROR: cannot find BB containing branch destination.\n";
- BC.exitWithBugReport("disassembly failed - inconsistent branch found.",
- *this);
+ BC.errs()
+ << "BOLT-ERROR: cannot find BB containing branch destination.\n";
+ return createFatalBOLTError(BC.generateBugReportMessage(
+ "disassembly failed - inconsistent branch found.", *this));
}
FromBB->addSuccessor(ToBB);
@@ -2193,8 +2219,8 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
// Make any necessary adjustments for indirect branches.
if (!postProcessIndirectBranches(AllocatorId)) {
if (opts::Verbosity) {
- errs() << "BOLT-WARNING: failed to post-process indirect branches for "
- << *this << '\n';
+ BC.errs() << "BOLT-WARNING: failed to post-process indirect branches for "
+ << *this << '\n';
}
// In relocation mode we want to keep processing the function but avoid
// optimizing it.
@@ -2204,7 +2230,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
clearList(ExternallyReferencedOffsets);
clearList(UnknownIndirectBranchOffsets);
- return true;
+ return Error::success();
}
void BinaryFunction::postProcessCFG() {
@@ -3034,10 +3060,6 @@ static std::string constructFilename(std::string Filename,
Annotation.insert(0, "-");
if (Filename.size() + Annotation.size() + Suffix.size() > MAX_PATH) {
assert(Suffix.size() + Annotation.size() <= MAX_PATH);
- if (opts::Verbosity >= 1) {
- errs() << "BOLT-WARNING: Filename \"" << Filename << Annotation << Suffix
- << "\" exceeds the " << MAX_PATH << " size limit, truncating.\n";
- }
Filename.resize(MAX_PATH - (Suffix.size() + Annotation.size()));
}
Filename += Annotation;
@@ -3162,16 +3184,17 @@ void BinaryFunction::viewGraph() const {
SmallString<MAX_PATH> Filename;
if (std::error_code EC =
sys::fs::createTemporaryFile("bolt-cfg", "dot", Filename)) {
- errs() << "BOLT-ERROR: " << EC.message() << ", unable to create "
- << " bolt-cfg-XXXXX.dot temporary file.\n";
+ BC.errs() << "BOLT-ERROR: " << EC.message() << ", unable to create "
+ << " bolt-cfg-XXXXX.dot temporary file.\n";
return;
}
dumpGraphToFile(std::string(Filename));
if (DisplayGraph(Filename))
- errs() << "BOLT-ERROR: Can't display " << Filename << " with graphviz.\n";
+ BC.errs() << "BOLT-ERROR: Can't display " << Filename
+ << " with graphviz.\n";
if (std::error_code EC = sys::fs::remove(Filename)) {
- errs() << "BOLT-WARNING: " << EC.message() << ", failed to remove "
- << Filename << "\n";
+ BC.errs() << "BOLT-WARNING: " << EC.message() << ", failed to remove "
+ << Filename << "\n";
}
}
@@ -3181,7 +3204,7 @@ void BinaryFunction::dumpGraphForPass(std::string Annotation) const {
std::string Filename = constructFilename(getPrintName(), Annotation, ".dot");
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: dumping CFG to " << Filename << "\n";
+ BC.outs() << "BOLT-INFO: dumping CFG to " << Filename << "\n";
dumpGraphToFile(Filename);
}
@@ -3190,8 +3213,8 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const {
raw_fd_ostream of(Filename, EC, sys::fs::OF_None);
if (EC) {
if (opts::Verbosity >= 1) {
- errs() << "BOLT-WARNING: " << EC.message() << ", unable to open "
- << Filename << " for output.\n";
+ BC.errs() << "BOLT-WARNING: " << EC.message() << ", unable to open "
+ << Filename << " for output.\n";
}
return;
}
@@ -3213,8 +3236,8 @@ bool BinaryFunction::validateCFG() const {
// Make sure all blocks in CFG are valid.
auto validateBlock = [this](const BinaryBasicBlock *BB, StringRef Desc) {
if (!BB->isValid()) {
- errs() << "BOLT-ERROR: deleted " << Desc << " " << BB->getName()
- << " detected in:\n";
+ BC.errs() << "BOLT-ERROR: deleted " << Desc << " " << BB->getName()
+ << " detected in:\n";
this->dump();
return false;
}
@@ -3241,8 +3264,8 @@ bool BinaryFunction::validateCFG() const {
std::unordered_set<const BinaryBasicBlock *> BBLandingPads;
for (const BinaryBasicBlock *LP : BB->landing_pads()) {
if (BBLandingPads.count(LP)) {
- errs() << "BOLT-ERROR: duplicate landing pad detected in"
- << BB->getName() << " in function " << *this << '\n';
+ BC.errs() << "BOLT-ERROR: duplicate landing pad detected in"
+ << BB->getName() << " in function " << *this << '\n';
return false;
}
BBLandingPads.insert(LP);
@@ -3251,8 +3274,8 @@ bool BinaryFunction::validateCFG() const {
std::unordered_set<const BinaryBasicBlock *> BBThrowers;
for (const BinaryBasicBlock *Thrower : BB->throwers()) {
if (BBThrowers.count(Thrower)) {
- errs() << "BOLT-ERROR: duplicate thrower detected in" << BB->getName()
- << " in function " << *this << '\n';
+ BC.errs() << "BOLT-ERROR: duplicate thrower detected in"
+ << BB->getName() << " in function " << *this << '\n';
return false;
}
BBThrowers.insert(Thrower);
@@ -3260,17 +3283,18 @@ bool BinaryFunction::validateCFG() const {
for (const BinaryBasicBlock *LPBlock : BB->landing_pads()) {
if (!llvm::is_contained(LPBlock->throwers(), BB)) {
- errs() << "BOLT-ERROR: inconsistent landing pad detected in " << *this
- << ": " << BB->getName() << " is in LandingPads but not in "
- << LPBlock->getName() << " Throwers\n";
+ BC.errs() << "BOLT-ERROR: inconsistent landing pad detected in "
+ << *this << ": " << BB->getName()
+ << " is in LandingPads but not in " << LPBlock->getName()
+ << " Throwers\n";
return false;
}
}
for (const BinaryBasicBlock *Thrower : BB->throwers()) {
if (!llvm::is_contained(Thrower->landing_pads(), BB)) {
- errs() << "BOLT-ERROR: inconsistent thrower detected in " << *this
- << ": " << BB->getName() << " is in Throwers list but not in "
- << Thrower->getName() << " LandingPads\n";
+ BC.errs() << "BOLT-ERROR: inconsistent thrower detected in " << *this
+ << ": " << BB->getName() << " is in Throwers list but not in "
+ << Thrower->getName() << " LandingPads\n";
return false;
}
}
diff --git a/bolt/lib/Core/BinarySection.cpp b/bolt/lib/Core/BinarySection.cpp
index 97bc25193547..564c63e81914 100644
--- a/bolt/lib/Core/BinarySection.cpp
+++ b/bolt/lib/Core/BinarySection.cpp
@@ -198,7 +198,7 @@ BinarySection::~BinarySection() {
if (!isAllocatable() && !hasValidSectionID() &&
(!hasSectionRef() ||
- OutputContents.data() != getContents(Section).data())) {
+ OutputContents.data() != getContentsOrQuit(Section).data())) {
delete[] getOutputData();
}
}
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index 762d3419edd3..3c72c745086b 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -126,8 +126,8 @@ uint32_t DIEBuilder::allocDIE(const DWARFUnit &DU, const DWARFDie &DDie,
void DIEBuilder::constructFromUnit(DWARFUnit &DU) {
std::optional<uint32_t> UnitId = getUnitId(DU);
if (!UnitId) {
- errs() << "BOLT-WARNING: [internal-dwarf-error]: "
- << "Skip Unit at " << Twine::utohexstr(DU.getOffset()) << "\n";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: "
+ << "Skip Unit at " << Twine::utohexstr(DU.getOffset()) << "\n";
return;
}
@@ -178,8 +178,9 @@ void DIEBuilder::constructFromUnit(DWARFUnit &DU) {
getState().CloneUnitCtxMap[*UnitId].IsConstructed = true;
}
-DIEBuilder::DIEBuilder(DWARFContext *DwarfContext, bool IsDWO)
- : DwarfContext(DwarfContext), IsDWO(IsDWO) {}
+DIEBuilder::DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext,
+ bool IsDWO)
+ : BC(BC), DwarfContext(DwarfContext), IsDWO(IsDWO) {}
static unsigned int getCUNum(DWARFContext *DwarfContext, bool IsDWO) {
unsigned int CUNum = IsDWO ? DwarfContext->getNumDWOCompileUnits()
@@ -475,19 +476,21 @@ DWARFDie DIEBuilder::resolveDIEReference(
allocDIE(*RefCU, RefDie, getState().DIEAlloc, *UnitId);
return RefDie;
}
- errs() << "BOLT-WARNING: [internal-dwarf-error]: invalid referenced DIE "
- "at offset: "
- << Twine::utohexstr(RefOffset) << ".\n";
+ BC.errs()
+ << "BOLT-WARNING: [internal-dwarf-error]: invalid referenced DIE "
+ "at offset: "
+ << Twine::utohexstr(RefOffset) << ".\n";
} else {
- errs() << "BOLT-WARNING: [internal-dwarf-error]: could not parse "
- "referenced DIE at offset: "
- << Twine::utohexstr(RefOffset) << ".\n";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: could not parse "
+ "referenced DIE at offset: "
+ << Twine::utohexstr(RefOffset) << ".\n";
}
} else {
- errs() << "BOLT-WARNING: [internal-dwarf-error]: could not find referenced "
- "CU. Referenced DIE offset: "
- << Twine::utohexstr(RefOffset) << ".\n";
+ BC.errs()
+ << "BOLT-WARNING: [internal-dwarf-error]: could not find referenced "
+ "CU. Referenced DIE offset: "
+ << Twine::utohexstr(RefOffset) << ".\n";
}
return DWARFDie();
}
@@ -516,8 +519,8 @@ void DIEBuilder::cloneDieReferenceAttribute(
if (!DieInfo.Die) {
assert(Ref > InputDIE.getOffset());
(void)Ref;
- errs() << "BOLT-WARNING: [internal-dwarf-error]: encounter unexpected "
- "unallocated DIE. Should be alloc!\n";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: encounter unexpected "
+ "unallocated DIE. Should be alloc!\n";
// We haven't cloned this DIE yet. Just create an empty one and
// store it. It'll get really cloned when we process it.
DieInfo.Die = DIE::get(getState().DIEAlloc, dwarf::Tag(RefDie.getTag()));
@@ -580,8 +583,8 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data,
(Description.Op.size() == 2 &&
Description.Op[1] == Encoding::BaseTypeRef &&
Description.Op[0] != Encoding::Size1))
- outs() << "BOLT-WARNING: [internal-dwarf-error]: unsupported DW_OP "
- "encoding.\n";
+ BC.outs() << "BOLT-WARNING: [internal-dwarf-error]: unsupported DW_OP "
+ "encoding.\n";
if ((Description.Op.size() == 1 &&
Description.Op[0] == Encoding::BaseTypeRef) ||
@@ -616,9 +619,9 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data,
Offset = Stage == CloneExpressionStage::INIT ? RefOffset
: Clone->getOffset();
else
- errs() << "BOLT-WARNING: [internal-dwarf-error]: base type ref "
- "doesn't point to "
- "DW_TAG_base_type.\n";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: base type ref "
+ "doesn't point to "
+ "DW_TAG_base_type.\n";
}
}
uint8_t ULEB[16];
@@ -652,8 +655,9 @@ void DIEBuilder::cloneBlockAttribute(
U.getVersion())) {
Block = new (getState().DIEAlloc) DIEBlock;
} else {
- errs() << "BOLT-WARNING: [internal-dwarf-error]: Unexpected Form value in "
- "cloneBlockAttribute\n";
+ BC.errs()
+ << "BOLT-WARNING: [internal-dwarf-error]: Unexpected Form value in "
+ "cloneBlockAttribute\n";
return;
}
Attr = Loc ? static_cast<DIEValueList *>(Loc)
@@ -720,9 +724,9 @@ void DIEBuilder::cloneScalarAttribute(
else if (auto OptionalValue = Val.getAsSectionOffset())
Value = *OptionalValue;
else {
- errs() << "BOLT-WARNING: [internal-dwarf-error]: Unsupported scalar "
- "attribute form. Dropping "
- "attribute.\n";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: Unsupported scalar "
+ "attribute form. Dropping "
+ "attribute.\n";
return;
}
@@ -743,9 +747,9 @@ void DIEBuilder::cloneLoclistAttrubute(
else if (auto OptionalValue = Val.getAsSectionOffset())
Value = OptionalValue;
else
- errs() << "BOLT-WARNING: [internal-dwarf-error]: Unsupported scalar "
- "attribute form. Dropping "
- "attribute.\n";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: Unsupported scalar "
+ "attribute form. Dropping "
+ "attribute.\n";
if (!Value.has_value())
return;
@@ -808,10 +812,10 @@ void DIEBuilder::cloneAttribute(
cloneRefsigAttribute(Die, AttrSpec, Val);
break;
default:
- errs() << "BOLT-WARNING: [internal-dwarf-error]: Unsupported attribute "
- "form " +
- dwarf::FormEncodingString(AttrSpec.Form).str() +
- " in cloneAttribute. Dropping.";
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: Unsupported attribute "
+ "form " +
+ dwarf::FormEncodingString(AttrSpec.Form).str() +
+ " in cloneAttribute. Dropping.";
}
}
void DIEBuilder::assignAbbrev(DIEAbbrev &Abbrev) {
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index 415b0310b6ba..8c3f6bd2052f 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -1185,7 +1185,7 @@ static void parseAndPopulateDebugLineStr(BinarySection &LineStrSection,
Error Err = Error::success();
const char *CStr = StrData.getCStr(&Offset, &Err);
if (Err) {
- errs() << "BOLT-ERROR: could not extract string from .debug_line_str";
+ BC.errs() << "BOLT-ERROR: could not extract string from .debug_line_str";
continue;
}
const size_t NewOffset = LineStr.addString(CStr);
diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp
index 10f076915e92..5de0f9e0d6b8 100644
--- a/bolt/lib/Core/DynoStats.cpp
+++ b/bolt/lib/Core/DynoStats.cpp
@@ -99,7 +99,7 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other,
printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
}
if (opts::PrintDynoOpcodeStat && Printer) {
- outs() << "\nProgram-wide opcode histogram:\n";
+ OS << "\nProgram-wide opcode histogram:\n";
OS << " Opcode, Execution Count, Max Exec Count, "
"Function Name:Offset ...\n";
std::vector<std::pair<uint64_t, unsigned>> SortedHistogram;
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index ab1885f6bb58..54618aeb95cc 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -98,12 +98,12 @@ namespace bolt {
// site table will be the same size as GCC uses uleb encodings for PC offsets.
//
// Note: some functions have LSDA entries with 0 call site entries.
-void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
- uint64_t LSDASectionAddress) {
+Error BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
+ uint64_t LSDASectionAddress) {
assert(CurrentState == State::Disassembled && "unexpected function state");
if (!getLSDAAddress())
- return;
+ return Error::success();
DWARFDataExtractor Data(
StringRef(reinterpret_cast<const char *>(LSDASectionData.data()),
@@ -119,9 +119,9 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
std::optional<uint64_t> MaybeLPStart = Data.getEncodedPointer(
&Offset, LPStartEncoding, Offset + LSDASectionAddress);
if (!MaybeLPStart) {
- errs() << "BOLT-ERROR: unsupported LPStartEncoding: "
- << (unsigned)LPStartEncoding << '\n';
- exit(1);
+ BC.errs() << "BOLT-ERROR: unsupported LPStartEncoding: "
+ << (unsigned)LPStartEncoding << '\n';
+ return createFatalBOLTError("");
}
LPStart = *MaybeLPStart;
}
@@ -136,13 +136,14 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
}
if (opts::PrintExceptions) {
- outs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress())
- << " for function " << *this << "]:\n";
- outs() << "LPStart Encoding = 0x" << Twine::utohexstr(LPStartEncoding)
- << '\n';
- outs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n';
- outs() << "TType Encoding = 0x" << Twine::utohexstr(TTypeEncoding) << '\n';
- outs() << "TType End = " << TTypeEnd << '\n';
+ BC.outs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress())
+ << " for function " << *this << "]:\n";
+ BC.outs() << "LPStart Encoding = 0x" << Twine::utohexstr(LPStartEncoding)
+ << '\n';
+ BC.outs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n';
+ BC.outs() << "TType Encoding = 0x" << Twine::utohexstr(TTypeEncoding)
+ << '\n';
+ BC.outs() << "TType End = " << TTypeEnd << '\n';
}
// Table to store list of indices in type table. Entries are uleb128 values.
@@ -166,9 +167,9 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
uint64_t ActionTableStart = CallSiteTableEnd;
if (opts::PrintExceptions) {
- outs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n';
- outs() << "CallSite table length = " << CallSiteTableLength << '\n';
- outs() << '\n';
+ BC.outs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n';
+ BC.outs() << "CallSite table length = " << CallSiteTableLength << '\n';
+ BC.outs() << '\n';
}
this->HasEHRanges = CallSitePtr < CallSiteTableEnd;
@@ -185,12 +186,13 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
LandingPad += LPStart;
if (opts::PrintExceptions) {
- outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start)
- << ", 0x" << Twine::utohexstr(RangeBase + Start + Length)
- << "); landing pad: 0x" << Twine::utohexstr(LandingPad)
- << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n";
- outs() << " current offset is " << (CallSitePtr - CallSiteTableStart)
- << '\n';
+ BC.outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start)
+ << ", 0x" << Twine::utohexstr(RangeBase + Start + Length)
+ << "); landing pad: 0x" << Twine::utohexstr(LandingPad)
+ << "; action entry: 0x" << Twine::utohexstr(ActionEntry)
+ << "\n";
+ BC.outs() << " current offset is " << (CallSitePtr - CallSiteTableStart)
+ << '\n';
}
// Create a handler entry if necessary.
@@ -209,15 +211,16 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
"BOLT-ERROR: cannot have landing pads in different functions");
setHasIndirectTargetToSplitFragment(true);
BC.addFragmentsToSkip(this);
- return;
+ return Error::success();
}
const uint64_t LPOffset = LandingPad - getAddress();
if (!getInstructionAtOffset(LPOffset)) {
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LPOffset)
- << " not pointing to an instruction in function " << *this
- << " - ignoring.\n";
+ BC.errs() << "BOLT-WARNING: landing pad "
+ << Twine::utohexstr(LPOffset)
+ << " not pointing to an instruction in function " << *this
+ << " - ignoring.\n";
} else {
auto Label = Labels.find(LPOffset);
if (Label != Labels.end()) {
@@ -271,7 +274,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
OS << "0x" << Twine::utohexstr(TypeAddress);
};
if (opts::PrintExceptions)
- outs() << " actions: ";
+ BC.outs() << " actions: ";
uint64_t ActionPtr = ActionTableStart + ActionEntry - 1;
int64_t ActionType;
int64_t ActionNext;
@@ -281,21 +284,21 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
const uint32_t Self = ActionPtr;
ActionNext = Data.getSLEB128(&ActionPtr);
if (opts::PrintExceptions)
- outs() << Sep << "(" << ActionType << ", " << ActionNext << ") ";
+ BC.outs() << Sep << "(" << ActionType << ", " << ActionNext << ") ";
if (ActionType == 0) {
if (opts::PrintExceptions)
- outs() << "cleanup";
+ BC.outs() << "cleanup";
} else if (ActionType > 0) {
// It's an index into a type table.
MaxTypeIndex =
std::max(MaxTypeIndex, static_cast<unsigned>(ActionType));
if (opts::PrintExceptions) {
- outs() << "catch type ";
- printType(ActionType, outs());
+ BC.outs() << "catch type ";
+ printType(ActionType, BC.outs());
}
} else { // ActionType < 0
if (opts::PrintExceptions)
- outs() << "filter exception types ";
+ BC.outs() << "filter exception types ";
const char *TSep = "";
// ActionType is a negative *byte* offset into *uleb128-encoded* table
// of indices with base 1.
@@ -305,8 +308,8 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
while (uint64_t Index = Data.getULEB128(&TypeIndexTablePtr)) {
MaxTypeIndex = std::max(MaxTypeIndex, static_cast<unsigned>(Index));
if (opts::PrintExceptions) {
- outs() << TSep;
- printType(Index, outs());
+ BC.outs() << TSep;
+ printType(Index, BC.outs());
TSep = ", ";
}
}
@@ -319,11 +322,11 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
ActionPtr = Self + ActionNext;
} while (ActionNext);
if (opts::PrintExceptions)
- outs() << '\n';
+ BC.outs() << '\n';
}
}
if (opts::PrintExceptions)
- outs() << '\n';
+ BC.outs() << '\n';
assert(TypeIndexTableStart + MaxTypeIndexTableOffset <=
Data.getData().size() &&
@@ -354,6 +357,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
LSDATypeIndexTable =
LSDASectionData.slice(TypeIndexTableStart, MaxTypeIndexTableOffset);
}
+ return Error::success();
}
void BinaryFunction::updateEHRanges() {
@@ -460,7 +464,9 @@ void BinaryFunction::updateEHRanges() {
const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
-CFIReaderWriter::CFIReaderWriter(const DWARFDebugFrame &EHFrame) {
+CFIReaderWriter::CFIReaderWriter(BinaryContext &BC,
+ const DWARFDebugFrame &EHFrame)
+ : BC(BC) {
// Prepare FDEs for fast lookup
for (const dwarf::FrameEntry &Entry : EHFrame.entries()) {
const auto *CurFDE = dyn_cast<dwarf::FDE>(&Entry);
@@ -475,10 +481,10 @@ CFIReaderWriter::CFIReaderWriter(const DWARFDebugFrame &EHFrame) {
if (FDEI->second->getAddressRange() == 0) {
FDEI->second = CurFDE;
} else if (opts::Verbosity > 0) {
- errs() << "BOLT-WARNING: different FDEs for function at 0x"
- << Twine::utohexstr(FDEI->first)
- << " detected; sizes: " << FDEI->second->getAddressRange()
- << " and " << CurFDE->getAddressRange() << '\n';
+ BC.errs() << "BOLT-WARNING: different FDEs for function at 0x"
+ << Twine::utohexstr(FDEI->first)
+ << " detected; sizes: " << FDEI->second->getAddressRange()
+ << " and " << CurFDE->getAddressRange() << '\n';
}
}
} else {
@@ -508,8 +514,8 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
*CurFDE.getLinkedCIE()->getPersonalityEncoding());
}
- auto decodeFrameInstruction = [&Function, &Offset, Address, CodeAlignment,
- DataAlignment](
+ auto decodeFrameInstruction = [this, &Function, &Offset, Address,
+ CodeAlignment, DataAlignment](
const CFIProgram::Instruction &Instr) {
uint8_t Opcode = Instr.Opcode;
if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
@@ -601,7 +607,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
case DW_CFA_val_offset_sf:
case DW_CFA_val_offset:
if (opts::Verbosity >= 1) {
- errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n";
+ BC.errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n";
}
return false;
case DW_CFA_def_cfa_expression:
@@ -622,7 +628,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
}
case DW_CFA_MIPS_advance_loc8:
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n";
+ BC.errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n";
return false;
case DW_CFA_GNU_window_save:
// DW_CFA_GNU_window_save and DW_CFA_GNU_NegateRAState just use the same
@@ -633,17 +639,17 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
break;
}
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: DW_CFA_GNU_window_save unimplemented\n";
+ BC.errs() << "BOLT-WARNING: DW_CFA_GNU_window_save unimplemented\n";
return false;
case DW_CFA_lo_user:
case DW_CFA_hi_user:
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: DW_CFA_*_user unimplemented\n";
+ BC.errs() << "BOLT-WARNING: DW_CFA_*_user unimplemented\n";
return false;
default:
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: Unrecognized CFI instruction: " << Instr.Opcode
- << '\n';
+ BC.errs() << "BOLT-WARNING: Unrecognized CFI instruction: "
+ << Instr.Opcode << '\n';
return false;
}
diff --git a/bolt/lib/Core/ParallelUtilities.cpp b/bolt/lib/Core/ParallelUtilities.cpp
index fb2b6dc7cd63..1a28bc4346ec 100644
--- a/bolt/lib/Core/ParallelUtilities.cpp
+++ b/bolt/lib/Core/ParallelUtilities.cpp
@@ -90,8 +90,9 @@ inline unsigned estimateTotalCost(const BinaryContext &BC,
// Switch to trivial scheduling if total estimated work is zero
if (TotalCost == 0) {
- outs() << "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
- "switch to trivial scheduling.\n";
+ BC.outs()
+ << "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
+ "switch to trivial scheduling.\n";
SchedPolicy = SP_TRIVIAL;
TotalCost = BC.getBinaryFunctions().size();
diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp
index 27a1377adef1..24fddbc764cb 100644
--- a/bolt/lib/Passes/ADRRelaxationPass.cpp
+++ b/bolt/lib/Passes/ADRRelaxationPass.cpp
@@ -86,9 +86,10 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
// invalidate this offset, so we have to rely on linker-inserted NOP to
// replace it with ADRP, and abort if it is not present.
auto L = BC.scopeLock();
- errs() << formatv("BOLT-ERROR: Cannot relax adr in non-simple function "
- "{0}. Use --strict option to override\n",
- BF.getOneName());
+ BC.errs() << formatv(
+ "BOLT-ERROR: Cannot relax adr in non-simple function "
+ "{0}. Use --strict option to override\n",
+ BF.getOneName());
PassFailed = true;
return;
}
@@ -97,9 +98,9 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
}
}
-void ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
+Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
if (!opts::AdrPassOpt || !BC.HasRelocations)
- return;
+ return Error::success();
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
@@ -110,7 +111,8 @@ void ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
"ADRRelaxationPass");
if (PassFailed)
- exit(1);
+ return createFatalBOLTError("");
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp
index 7c387525434b..555f82a5a817 100644
--- a/bolt/lib/Passes/Aligner.cpp
+++ b/bolt/lib/Passes/Aligner.cpp
@@ -147,9 +147,9 @@ void AlignerPass::alignBlocks(BinaryFunction &Function,
}
}
-void AlignerPass::runOnFunctions(BinaryContext &BC) {
+Error AlignerPass::runOnFunctions(BinaryContext &BC) {
if (!BC.HasRelocations)
- return;
+ return Error::success();
AlignHistogram.resize(opts::BlockAlignment);
@@ -179,6 +179,7 @@ void AlignerPass::runOnFunctions(BinaryContext &BC) {
dbgs() << "BOLT-DEBUG: total execution count of aligned blocks: "
<< AlignedBlocksCount << '\n';
);
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/AllocCombiner.cpp b/bolt/lib/Passes/AllocCombiner.cpp
index 6d3f2a56424c..38ef7d02a47d 100644
--- a/bolt/lib/Passes/AllocCombiner.cpp
+++ b/bolt/lib/Passes/AllocCombiner.cpp
@@ -103,17 +103,18 @@ void AllocCombinerPass::combineAdjustments(BinaryFunction &BF) {
}
}
-void AllocCombinerPass::runOnFunctions(BinaryContext &BC) {
+Error AllocCombinerPass::runOnFunctions(BinaryContext &BC) {
if (opts::FrameOptimization == FOP_NONE)
- return;
+ return Error::success();
runForAllWeCare(BC.getBinaryFunctions(), [&](BinaryFunction &Function) {
combineAdjustments(Function);
});
- outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
- << " empty spaces coalesced (dyn count: " << DynamicCountCombined
- << ").\n";
+ BC.outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
+ << " empty spaces coalesced (dyn count: " << DynamicCountCombined
+ << ").\n";
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/AsmDump.cpp b/bolt/lib/Passes/AsmDump.cpp
index 18d0395cbc4a..0a12eae8b5f7 100644
--- a/bolt/lib/Passes/AsmDump.cpp
+++ b/bolt/lib/Passes/AsmDump.cpp
@@ -43,7 +43,7 @@ void dumpCFI(const BinaryFunction &BF, const MCInst &Instr, AsmPrinter &MAP) {
case MCCFIInstruction::OpRememberState:
case MCCFIInstruction::OpRestoreState:
if (opts::Verbosity >= 2)
- errs()
+ BF.getBinaryContext().errs()
<< "BOLT-WARNING: AsmDump: skipping unsupported CFI instruction in "
<< BF << ".\n";
@@ -102,9 +102,9 @@ void dumpFunction(const BinaryFunction &BF) {
// Make sure the new directory exists, creating it if necessary.
if (!opts::AsmDump.empty()) {
if (std::error_code EC = sys::fs::create_directories(opts::AsmDump)) {
- errs() << "BOLT-ERROR: could not create directory '" << opts::AsmDump
- << "': " << EC.message() << '\n';
- exit(1);
+ BC.errs() << "BOLT-ERROR: could not create directory '" << opts::AsmDump
+ << "': " << EC.message() << '\n';
+ return;
}
}
@@ -115,14 +115,14 @@ void dumpFunction(const BinaryFunction &BF) {
? (PrintName + ".s")
: (opts::AsmDump + sys::path::get_separator() + PrintName + ".s")
.str();
- outs() << "BOLT-INFO: Dumping function assembly to " << Filename << "\n";
+ BC.outs() << "BOLT-INFO: Dumping function assembly to " << Filename << "\n";
std::error_code EC;
raw_fd_ostream OS(Filename, EC, sys::fs::OF_None);
if (EC) {
- errs() << "BOLT-ERROR: " << EC.message() << ", unable to open " << Filename
- << " for output.\n";
- exit(1);
+ BC.errs() << "BOLT-ERROR: " << EC.message() << ", unable to open "
+ << Filename << " for output.\n";
+ return;
}
OS.SetUnbuffered();
@@ -237,9 +237,10 @@ void dumpFunction(const BinaryFunction &BF) {
dumpBinaryDataSymbols(OS, BD, LastSection);
}
-void AsmDumpPass::runOnFunctions(BinaryContext &BC) {
+Error AsmDumpPass::runOnFunctions(BinaryContext &BC) {
for (const auto &BFIt : BC.getBinaryFunctions())
dumpFunction(BFIt.second);
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp b/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
index 28621a4640d3..2373710c9edd 100644
--- a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
@@ -278,13 +278,13 @@ buildCallGraph(BinaryContext &BC, CgFilterFunction Filter, bool CgFromPerfData,
bool PrintInfo = false;
#endif
if (PrintInfo || opts::Verbosity > 0)
- outs() << format("BOLT-INFO: buildCallGraph: %u nodes, %u callsites "
- "(%u recursive), density = %.6lf, %u callsites not "
- "processed, %u callsites with invalid profile, "
- "used perf data for %u stale functions.\n",
- Cg.numNodes(), TotalCallsites, RecursiveCallsites,
- Cg.density(), NotProcessed, NoProfileCallsites,
- NumFallbacks);
+ BC.outs() << format("BOLT-INFO: buildCallGraph: %u nodes, %u callsites "
+ "(%u recursive), density = %.6lf, %u callsites not "
+ "processed, %u callsites with invalid profile, "
+ "used perf data for %u stale functions.\n",
+ Cg.numNodes(), TotalCallsites, RecursiveCallsites,
+ Cg.density(), NotProcessed, NoProfileCallsites,
+ NumFallbacks);
if (opts::DumpCGDot.getNumOccurrences()) {
Cg.printDot(opts::DumpCGDot, [&](CallGraph::NodeId Id) {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index bcb12272d1b4..d2850b03e2e1 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -301,19 +301,20 @@ void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
NumBlocksRemoved += NumRemoved;
}
-void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
+Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
[&](BinaryFunction &BF) { runOnFunction(BF); },
[&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
"NormalizeCFG");
if (NumBlocksRemoved)
- outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
- << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
+ BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
+ << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
if (NumDuplicateEdgesMerged)
- outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
- << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
- << '\n';
+ BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
+ << " duplicate CFG edge"
+ << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
+ return Error::success();
}
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
@@ -339,13 +340,13 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
auto L = BC.scopeLock();
Modified.insert(&Function);
if (opts::Verbosity > 0)
- outs() << "BOLT-INFO: removed " << Count
- << " dead basic block(s) accounting for " << Bytes
- << " bytes in function " << Function << '\n';
+ BC.outs() << "BOLT-INFO: removed " << Count
+ << " dead basic block(s) accounting for " << Bytes
+ << " bytes in function " << Function << '\n';
}
}
-void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
+Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
};
@@ -359,8 +360,9 @@ void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
SkipPredicate, "elimininate-unreachable");
if (DeletedBlocks)
- outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
- << DeletedBytes << " bytes of code\n";
+ BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
+ << DeletedBytes << " bytes of code\n";
+ return Error::success();
}
bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
@@ -376,9 +378,9 @@ bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
return BinaryFunctionPass::shouldOptimize(BF);
}
-void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
+Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
- return;
+ return Error::success();
std::atomic_uint64_t ModifiedFuncCount(0);
std::mutex FunctionEditDistanceMutex;
@@ -411,8 +413,9 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
const size_t NumAllProfiledFunctions =
BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
- outs() << "BOLT-INFO: basic block reordering modified layout of "
- << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
+ BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
+ << format(
+ "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
ModifiedFuncCount.load(std::memory_order_relaxed),
100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
NumAllProfiledFunctions,
@@ -420,7 +423,7 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
BC.getBinaryFunctions().size());
if (opts::PrintFuncStat > 0) {
- raw_ostream &OS = outs();
+ raw_ostream &OS = BC.outs();
// Copy all the values into vector in order to sort them
std::map<uint64_t, BinaryFunction &> ScoreMap;
auto &BFs = BC.getBinaryFunctions();
@@ -452,6 +455,7 @@ void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
<< FunctionEditDistance.lookup(&Function) << "\n\n";
}
}
+ return Error::success();
}
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
@@ -513,7 +517,7 @@ bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
return BF.getLayout().update(NewLayout);
}
-void FixupBranches::runOnFunctions(BinaryContext &BC) {
+Error FixupBranches::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
if (!BC.shouldEmit(Function) || !Function.isSimple())
@@ -521,15 +525,18 @@ void FixupBranches::runOnFunctions(BinaryContext &BC) {
Function.fixBranches();
}
+ return Error::success();
}
-void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
+Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
+ std::atomic<bool> HasFatal{false};
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
if (!BF.finalizeCFIState()) {
if (BC.HasRelocations) {
- errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
- << ". Exiting.\n";
- exit(1);
+ BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
+ << ". Exiting.\n";
+ HasFatal = true;
+ return;
}
BF.setSimple(false);
return;
@@ -548,17 +555,17 @@ void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
SkipPredicate, "FinalizeFunctions");
+ if (HasFatal)
+ return createFatalBOLTError("finalize CFI state failure");
+ return Error::success();
}
-void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
+Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
if (BC.HasRelocations)
- return;
-
- if (!opts::UpdateDebugSections)
- return;
+ return Error::success();
// If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
- // incorrect debug info.
+ // incorrect meta data.
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
uint64_t HotSize, ColdSize;
std::tie(HotSize, ColdSize) =
@@ -574,6 +581,8 @@ void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipFunc, "CheckLargeFunctions");
+
+ return Error::success();
}
bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
@@ -581,7 +590,7 @@ bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
return BF.isSimple() && !BF.isIgnored();
}
-void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
+Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
// Convert GnuArgsSize annotations into CFIs.
for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
for (FunctionFragment &FF : BF->getLayout().fragments()) {
@@ -607,13 +616,14 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
}
}
}
+ return Error::success();
}
// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
-void CleanMCState::runOnFunctions(BinaryContext &BC) {
+Error CleanMCState::runOnFunctions(BinaryContext &BC) {
MCContext &Ctx = *BC.Ctx;
for (const auto &SymMapEntry : Ctx.getSymbols()) {
const MCSymbol *S = SymMapEntry.second;
@@ -631,6 +641,7 @@ void CleanMCState::runOnFunctions(BinaryContext &BC) {
dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
});
}
+ return Error::success();
}
// This peephole fixes jump instructions that jump to another basic
@@ -966,9 +977,9 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
return NumLocalCTCs > 0;
}
-void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
+Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
- return;
+ return Error::success();
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
@@ -983,16 +994,17 @@ void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
}
if (NumTailCallsPatched)
- outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
- << " tail calls (" << NumOrigForwardBranches << " forward)"
- << " tail calls (" << NumOrigBackwardBranches << " backward)"
- << " from a total of " << NumCandidateTailCalls << " while removing "
- << NumDoubleJumps << " double jumps"
- << " and removing " << DeletedBlocks << " basic blocks"
- << " totalling " << DeletedBytes
- << " bytes of code. CTCs total execution count is " << CTCExecCount
- << " and the number of times CTCs are taken is " << CTCTakenCount
- << "\n";
+ BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
+ << " tail calls (" << NumOrigForwardBranches << " forward)"
+ << " tail calls (" << NumOrigBackwardBranches << " backward)"
+ << " from a total of " << NumCandidateTailCalls
+ << " while removing " << NumDoubleJumps << " double jumps"
+ << " and removing " << DeletedBlocks << " basic blocks"
+ << " totalling " << DeletedBytes
+ << " bytes of code. CTCs total execution count is "
+ << CTCExecCount << " and the number of times CTCs are taken is "
+ << CTCTakenCount << "\n";
+ return Error::success();
}
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
@@ -1009,10 +1021,10 @@ uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
if (opts::Verbosity > 2) {
BC.scopeLock();
- outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
- BC.printInstruction(outs(), OriginalInst, 0, &Function);
- outs() << "BOLT-INFO: to:";
- BC.printInstruction(outs(), Inst, 0, &Function);
+ BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
+ BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
+ BC.outs() << "BOLT-INFO: to:";
+ BC.printInstruction(BC.outs(), Inst, 0, &Function);
}
++Count;
@@ -1022,10 +1034,10 @@ uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
return Count;
}
-void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
+Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
std::atomic<uint64_t> NumShortened{0};
if (!BC.isX86())
- return;
+ return Error::success();
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
@@ -1033,7 +1045,9 @@ void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
nullptr, "ShortenInstructions");
if (NumShortened)
- outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
+ BC.outs() << "BOLT-INFO: " << NumShortened
+ << " instructions were shortened\n";
+ return Error::success();
}
void Peepholes::addTailcallTraps(BinaryFunction &Function) {
@@ -1076,12 +1090,12 @@ void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
}
}
-void Peepholes::runOnFunctions(BinaryContext &BC) {
+Error Peepholes::runOnFunctions(BinaryContext &BC) {
const char Opts =
std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
[](const char A, const PeepholeOpts B) { return A | B; });
if (Opts == PEEP_NONE)
- return;
+ return Error::success();
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
@@ -1095,12 +1109,13 @@ void Peepholes::runOnFunctions(BinaryContext &BC) {
assert(Function.validateCFG());
}
}
- outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
- << " double jumps patched.\n"
- << "BOLT-INFO: Peephole: " << TailCallTraps
- << " tail call traps inserted.\n"
- << "BOLT-INFO: Peephole: " << NumUselessCondBranches
- << " useless conditional branches removed.\n";
+ BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
+ << " double jumps patched.\n"
+ << "BOLT-INFO: Peephole: " << TailCallTraps
+ << " tail call traps inserted.\n"
+ << "BOLT-INFO: Peephole: " << NumUselessCondBranches
+ << " useless conditional branches removed.\n";
+ return Error::success();
}
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
@@ -1185,21 +1200,23 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
return NumLocalLoadsSimplified > 0;
}
-void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
+Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
if (shouldOptimize(Function) && simplifyRODataLoads(Function))
Modified.insert(&Function);
}
- outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
- << NumLoadsFound << " loads from a statically computed address.\n"
- << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
- << "\n"
- << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
+ BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
+ << NumLoadsFound << " loads from a statically computed address.\n"
+ << "BOLT-INFO: dynamic loads simplified: "
+ << NumDynamicLoadsSimplified << "\n"
+ << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
+ << "\n";
+ return Error::success();
}
-void AssignSections::runOnFunctions(BinaryContext &BC) {
+Error AssignSections::runOnFunctions(BinaryContext &BC) {
for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
Function->setCodeSectionName(BC.getInjectedCodeSectionName());
Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
@@ -1207,7 +1224,7 @@ void AssignSections::runOnFunctions(BinaryContext &BC) {
// In non-relocation mode functions have pre-assigned section names.
if (!BC.HasRelocations)
- return;
+ return Error::success();
const bool UseColdSection =
BC.NumProfiledFuncs > 0 ||
@@ -1228,9 +1245,10 @@ void AssignSections::runOnFunctions(BinaryContext &BC) {
if (Function.isSplit())
Function.setColdCodeSectionName(BC.getColdCodeSectionName());
}
+ return Error::success();
}
-void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
+Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
double FlowImbalanceMean = 0.0;
size_t NumBlocksConsidered = 0;
double WorstBias = 0.0;
@@ -1314,16 +1332,17 @@ void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
}
// Report to user
- outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
- (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
+ BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
+ (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
if (WorstBiasFunc && opts::Verbosity >= 1) {
- outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
- << "\n";
+ BC.outs() << "Worst average bias observed in "
+ << WorstBiasFunc->getPrintName() << "\n";
LLVM_DEBUG(WorstBiasFunc->dump());
}
+ return Error::success();
}
-void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
+Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
uint64_t NumRegularFunctions = 0;
uint64_t NumStaleProfileFunctions = 0;
uint64_t NumAllStaleFunctions = 0;
@@ -1354,7 +1373,7 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
if (opts::PrintUnknownCFG)
Function.dump();
else if (opts::PrintUnknown)
- errs() << "function with unknown control flow: " << Function << '\n';
+ BC.errs() << "function with unknown control flow: " << Function << '\n';
++NumUnknownControlFlowFunctions;
}
@@ -1374,9 +1393,9 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
}
} else {
if (opts::ReportStaleFuncs) {
- outs() << StaleFuncsHeader;
+ BC.outs() << StaleFuncsHeader;
StaleFuncsHeader = "";
- outs() << " " << Function << '\n';
+ BC.outs() << " " << Function << '\n';
}
++NumStaleProfileFunctions;
StaleSampleCount += SampleCount;
@@ -1388,15 +1407,15 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
const size_t NumAllProfiledFunctions =
ProfiledFunctions.size() + NumStaleProfileFunctions;
- outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
- << NumRegularFunctions << " functions in the binary ("
- << format("%.1f", NumAllProfiledFunctions /
- (float)NumRegularFunctions * 100.0f)
- << "%) have non-empty execution profile\n";
+ BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
+ << NumRegularFunctions << " functions in the binary ("
+ << format("%.1f", NumAllProfiledFunctions /
+ (float)NumRegularFunctions * 100.0f)
+ << "%) have non-empty execution profile\n";
if (NumNonSimpleProfiledFunctions) {
- outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
- << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
- << " with profile could not be optimized\n";
+ BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
+ << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
+ << " with profile could not be optimized\n";
}
if (NumAllStaleFunctions) {
const float PctStale =
@@ -1409,52 +1428,54 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
BC.Stats.NumStaleBlocks * 100.0f;
auto printErrorOrWarning = [&]() {
if (PctStale > opts::StaleThreshold)
- errs() << "BOLT-ERROR: ";
+ BC.errs() << "BOLT-ERROR: ";
else
- errs() << "BOLT-WARNING: ";
+ BC.errs() << "BOLT-WARNING: ";
};
printErrorOrWarning();
- errs() << NumAllStaleFunctions
- << format(" (%.1f%% of all profiled)", PctStale) << " function"
- << (NumAllStaleFunctions == 1 ? "" : "s")
- << " have invalid (possibly stale) profile."
- " Use -report-stale to see the list.\n";
+ BC.errs() << NumAllStaleFunctions
+ << format(" (%.1f%% of all profiled)", PctStale) << " function"
+ << (NumAllStaleFunctions == 1 ? "" : "s")
+ << " have invalid (possibly stale) profile."
+ " Use -report-stale to see the list.\n";
if (TotalSampleCount > 0) {
printErrorOrWarning();
- errs() << (StaleSampleCount + InferredSampleCount) << " out of "
- << TotalSampleCount << " samples in the binary ("
- << format("%.1f",
- ((100.0f * (StaleSampleCount + InferredSampleCount)) /
- TotalSampleCount))
- << "%) belong to functions with invalid"
- " (possibly stale) profile.\n";
+ BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
+ << TotalSampleCount << " samples in the binary ("
+ << format("%.1f",
+ ((100.0f * (StaleSampleCount + InferredSampleCount)) /
+ TotalSampleCount))
+ << "%) belong to functions with invalid"
+ " (possibly stale) profile.\n";
}
- outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
- << " stale function"
- << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
- << format(" (%.1f%% of all stale)", PctStaleFuncsWithEqualBlockCount)
- << " have matching block count.\n";
- outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
- << " stale block"
- << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
- << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
- << " have matching icount.\n";
+ BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
+ << " stale function"
+ << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
+ << format(" (%.1f%% of all stale)",
+ PctStaleFuncsWithEqualBlockCount)
+ << " have matching block count.\n";
+ BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
+ << " stale block"
+ << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
+ << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
+ << " have matching icount.\n";
if (PctStale > opts::StaleThreshold) {
- errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
- << opts::StaleThreshold << "%. Exiting.\n";
- exit(1);
+ return createFatalBOLTError(
+ Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
+ Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
}
}
if (NumInferredFunctions) {
- outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
- "%.2f%% of stale) functions responsible for %.2f%% samples"
- " (%zu out of %zu)\n",
- NumInferredFunctions,
- 100.0 * NumInferredFunctions / NumAllProfiledFunctions,
- 100.0 * NumInferredFunctions / NumAllStaleFunctions,
- 100.0 * InferredSampleCount / TotalSampleCount,
- InferredSampleCount, TotalSampleCount);
- outs() << format(
+ BC.outs() << format(
+ "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
+ "%.2f%% of stale) functions responsible for %.2f%% samples"
+ " (%zu out of %zu)\n",
+ NumInferredFunctions,
+ 100.0 * NumInferredFunctions / NumAllProfiledFunctions,
+ 100.0 * NumInferredFunctions / NumAllStaleFunctions,
+ 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
+ TotalSampleCount);
+ BC.outs() << format(
"BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
" (%zu out of %zu stale) responsible for %.2f%% samples"
" (%zu out of %zu stale)\n",
@@ -1465,13 +1486,13 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
}
if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
- outs() << "BOLT-INFO: profile for " << NumUnusedObjects
- << " objects was ignored\n";
+ BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
+ << " objects was ignored\n";
}
if (ProfiledFunctions.size() > 10) {
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: top called functions are:\n";
+ BC.outs() << "BOLT-INFO: top called functions are:\n";
llvm::sort(ProfiledFunctions,
[](const BinaryFunction *A, const BinaryFunction *B) {
return B->getExecutionCount() < A->getExecutionCount();
@@ -1480,7 +1501,8 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
auto SFIend = ProfiledFunctions.end();
for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
++SFI, ++I)
- outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
+ BC.outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount()
+ << '\n';
}
}
@@ -1520,70 +1542,70 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
});
}
- outs() << "BOLT-INFO: top functions sorted by ";
+ BC.outs() << "BOLT-INFO: top functions sorted by ";
if (SortAll) {
- outs() << "dyno stats";
+ BC.outs() << "dyno stats";
} else {
- outs() << "(";
+ BC.outs() << "(";
bool PrintComma = false;
for (const DynoStats::Category Category : opts::PrintSortedBy) {
if (PrintComma)
- outs() << ", ";
- outs() << DynoStats::Description(Category);
+ BC.outs() << ", ";
+ BC.outs() << DynoStats::Description(Category);
PrintComma = true;
}
- outs() << ")";
+ BC.outs() << ")";
}
- outs() << " are:\n";
+ BC.outs() << " are:\n";
auto SFI = Functions.begin();
for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
const DynoStats Stats = getDynoStats(**SFI);
- outs() << " " << **SFI;
+ BC.outs() << " " << **SFI;
if (!SortAll) {
- outs() << " (";
+ BC.outs() << " (";
bool PrintComma = false;
for (const DynoStats::Category Category : opts::PrintSortedBy) {
if (PrintComma)
- outs() << ", ";
- outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
+ BC.outs() << ", ";
+ BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
PrintComma = true;
}
- outs() << ")";
+ BC.outs() << ")";
}
- outs() << "\n";
+ BC.outs() << "\n";
}
}
if (!BC.TrappedFunctions.empty()) {
- errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
- << (BC.TrappedFunctions.size() > 1 ? "s" : "")
- << " will trap on entry. Use -trap-avx512=0 to disable"
- " traps.";
+ BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
+ << (BC.TrappedFunctions.size() > 1 ? "s" : "")
+ << " will trap on entry. Use -trap-avx512=0 to disable"
+ " traps.";
if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
- errs() << '\n';
+ BC.errs() << '\n';
for (const BinaryFunction *Function : BC.TrappedFunctions)
- errs() << " " << *Function << '\n';
+ BC.errs() << " " << *Function << '\n';
} else {
- errs() << " Use -v=1 to see the list.\n";
+ BC.errs() << " Use -v=1 to see the list.\n";
}
}
// Print information on missed macro-fusion opportunities seen on input.
if (BC.Stats.MissedMacroFusionPairs) {
- outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)"
- " opportunities for macro-fusion optimization",
- BC.Stats.MissedMacroFusionPairs,
- BC.Stats.MissedMacroFusionExecCount);
+ BC.outs() << format(
+ "BOLT-INFO: the input contains %zu (dynamic count : %zu)"
+ " opportunities for macro-fusion optimization",
+ BC.Stats.MissedMacroFusionPairs, BC.Stats.MissedMacroFusionExecCount);
switch (opts::AlignMacroOpFusion) {
case MFT_NONE:
- outs() << ". Use -align-macro-fusion to fix.\n";
+ BC.outs() << ". Use -align-macro-fusion to fix.\n";
break;
case MFT_HOT:
- outs() << ". Will fix instances on a hot path.\n";
+ BC.outs() << ". Will fix instances on a hot path.\n";
break;
case MFT_ALL:
- outs() << " that are going to be fixed\n";
+ BC.outs() << " that are going to be fixed\n";
break;
}
}
@@ -1618,36 +1640,38 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
B->getKnownExecutionCount() / B->getSize();
});
- outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
- << " functions have "
- "cold code in the middle of hot code. Top functions are:\n";
+ BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
+ << " functions have "
+ "cold code in the middle of hot code. Top functions are:\n";
for (unsigned I = 0;
I < std::min(static_cast<size_t>(opts::ReportBadLayout),
SuboptimalFuncs.size());
++I)
- SuboptimalFuncs[I]->print(outs());
+ SuboptimalFuncs[I]->print(BC.outs());
}
}
if (NumUnknownControlFlowFunctions) {
- outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
- << " functions have instructions with unknown control flow";
+ BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
+ << " functions have instructions with unknown control flow";
if (!opts::PrintUnknown)
- outs() << ". Use -print-unknown to see the list.";
- outs() << '\n';
+ BC.outs() << ". Use -print-unknown to see the list.";
+ BC.outs() << '\n';
}
+ return Error::success();
}
-void InstructionLowering::runOnFunctions(BinaryContext &BC) {
+Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
for (auto &BFI : BC.getBinaryFunctions())
for (BinaryBasicBlock &BB : BFI.second)
for (MCInst &Instruction : BB)
BC.MIB->lowerTailCall(Instruction);
+ return Error::success();
}
-void StripRepRet::runOnFunctions(BinaryContext &BC) {
+Error StripRepRet::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
- return;
+ return Error::success();
uint64_t NumPrefixesRemoved = 0;
uint64_t NumBytesSaved = 0;
@@ -1664,15 +1688,16 @@ void StripRepRet::runOnFunctions(BinaryContext &BC) {
}
if (NumBytesSaved)
- outs() << "BOLT-INFO: removed " << NumBytesSaved
- << " 'repz' prefixes"
- " with estimated execution count of "
- << NumPrefixesRemoved << " times.\n";
+ BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
+ << " 'repz' prefixes"
+ " with estimated execution count of "
+ << NumPrefixesRemoved << " times.\n";
+ return Error::success();
}
-void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
+Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
- return;
+ return Error::success();
uint64_t NumInlined = 0;
uint64_t NumInlinedDyno = 0;
@@ -1711,12 +1736,13 @@ void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
}
if (NumInlined) {
- outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
+ BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
if (NumInlinedDyno)
- outs() << ". The calls were executed " << NumInlinedDyno
- << " times based on profile.";
- outs() << '\n';
+ BC.outs() << ". The calls were executed " << NumInlinedDyno
+ << " times based on profile.";
+ BC.outs() << '\n';
}
+ return Error::success();
}
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
@@ -1757,9 +1783,9 @@ std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
return Sites;
}
-void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
+Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
- return;
+ return Error::success();
uint64_t NumSpecialized = 0;
uint64_t NumSpecializedDyno = 0;
@@ -1844,13 +1870,14 @@ void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
}
if (NumSpecialized) {
- outs() << "BOLT-INFO: specialized " << NumSpecialized
- << " memcpy() call sites for size 1";
+ BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
+ << " memcpy() call sites for size 1";
if (NumSpecializedDyno)
- outs() << ". The calls were executed " << NumSpecializedDyno
- << " times based on profile.";
- outs() << '\n';
+ BC.outs() << ". The calls were executed " << NumSpecializedDyno
+ << " times based on profile.";
+ BC.outs() << '\n';
}
+ return Error::success();
}
void RemoveNops::runOnFunction(BinaryFunction &BF) {
@@ -1864,7 +1891,7 @@ void RemoveNops::runOnFunction(BinaryFunction &BF) {
}
}
-void RemoveNops::runOnFunctions(BinaryContext &BC) {
+Error RemoveNops::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
};
@@ -1876,6 +1903,7 @@ void RemoveNops::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipFunc, "RemoveNops");
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/CMOVConversion.cpp b/bolt/lib/Passes/CMOVConversion.cpp
index 6213479a5090..2492ff217946 100644
--- a/bolt/lib/Passes/CMOVConversion.cpp
+++ b/bolt/lib/Passes/CMOVConversion.cpp
@@ -168,14 +168,14 @@ int calculateConditionBias(const BinaryBasicBlock &BB,
return -1;
}
-void CMOVConversion::Stats::dump() {
- outs() << "converted static " << StaticPerformed << "/" << StaticPossible
- << formatv(" ({0:P}) ", getStaticRatio())
- << "hammock(s) into CMOV sequences, with dynamic execution count "
- << DynamicPerformed << "/" << DynamicPossible
- << formatv(" ({0:P}), ", getDynamicRatio()) << "saving " << RemovedMP
- << "/" << PossibleMP << formatv(" ({0:P}) ", getMPRatio())
- << "mispredictions\n";
+void CMOVConversion::Stats::dumpTo(raw_ostream &OS) {
+ OS << "converted static " << StaticPerformed << "/" << StaticPossible
+ << formatv(" ({0:P}) ", getStaticRatio())
+ << "hammock(s) into CMOV sequences, with dynamic execution count "
+ << DynamicPerformed << "/" << DynamicPossible
+ << formatv(" ({0:P}), ", getDynamicRatio()) << "saving " << RemovedMP
+ << "/" << PossibleMP << formatv(" ({0:P}) ", getMPRatio())
+ << "mispredictions\n";
}
void CMOVConversion::runOnFunction(BinaryFunction &Function) {
@@ -265,13 +265,13 @@ void CMOVConversion::runOnFunction(BinaryFunction &Function) {
if (Modified)
Function.eraseInvalidBBs();
if (opts::Verbosity > 1) {
- outs() << "BOLT-INFO: CMOVConversion: " << Function << ", ";
- Local.dump();
+ BC.outs() << "BOLT-INFO: CMOVConversion: " << Function << ", ";
+ Local.dumpTo(BC.outs());
}
Global = Global + Local;
}
-void CMOVConversion::runOnFunctions(BinaryContext &BC) {
+Error CMOVConversion::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
if (!shouldOptimize(Function))
@@ -279,8 +279,9 @@ void CMOVConversion::runOnFunctions(BinaryContext &BC) {
runOnFunction(Function);
}
- outs() << "BOLT-INFO: CMOVConversion total: ";
- Global.dump();
+ BC.outs() << "BOLT-INFO: CMOVConversion total: ";
+ Global.dumpTo(BC.outs());
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/CacheMetrics.cpp b/bolt/lib/Passes/CacheMetrics.cpp
index f6708997a433..b02d4303110b 100644
--- a/bolt/lib/Passes/CacheMetrics.cpp
+++ b/bolt/lib/Passes/CacheMetrics.cpp
@@ -189,7 +189,8 @@ double expectedCacheHitRatio(
} // namespace
-void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
+void CacheMetrics::printAll(raw_ostream &OS,
+ const std::vector<BinaryFunction *> &BFs) {
// Stats related to hot-cold code splitting
size_t NumFunctions = 0;
size_t NumProfiledFunctions = 0;
@@ -222,36 +223,36 @@ void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
}
}
- outs() << format(" There are %zu functions;", NumFunctions)
- << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
- 100.0 * NumHotFunctions / NumFunctions)
- << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
- 100.0 * NumProfiledFunctions / NumFunctions);
- outs() << format(" There are %zu basic blocks;", NumBlocks)
- << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
- 100.0 * NumHotBlocks / NumBlocks);
+ OS << format(" There are %zu functions;", NumFunctions)
+ << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
+ 100.0 * NumHotFunctions / NumFunctions)
+ << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
+ 100.0 * NumProfiledFunctions / NumFunctions);
+ OS << format(" There are %zu basic blocks;", NumBlocks)
+ << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
+ 100.0 * NumHotBlocks / NumBlocks);
assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
size_t HugePage2MB = 2 << 20;
- outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
- "%.2lf huge pages)\n",
- 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
- TotalCodeSize, double(HotCodeSize) / HugePage2MB);
+ OS << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
+ "%.2lf huge pages)\n",
+ 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize,
+ double(HotCodeSize) / HugePage2MB);
// Stats related to expected cache performance
std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
extractBasicBlockInfo(BFs, BBAddr, BBSize);
- outs() << " Expected i-TLB cache hit ratio: "
- << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
+ OS << " Expected i-TLB cache hit ratio: "
+ << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
- outs() << " TSP score: "
- << format("%.2lf%% (%zu out of %zu)\n",
- 100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
- Stats.first, Stats.second);
+ OS << " TSP score: "
+ << format("%.2lf%% (%zu out of %zu)\n",
+ 100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
+ Stats.first, Stats.second);
}
diff --git a/bolt/lib/Passes/FixRISCVCallsPass.cpp b/bolt/lib/Passes/FixRISCVCallsPass.cpp
index e2984deda16d..83c745facb29 100644
--- a/bolt/lib/Passes/FixRISCVCallsPass.cpp
+++ b/bolt/lib/Passes/FixRISCVCallsPass.cpp
@@ -68,9 +68,9 @@ void FixRISCVCallsPass::runOnFunction(BinaryFunction &BF) {
}
}
-void FixRISCVCallsPass::runOnFunctions(BinaryContext &BC) {
+Error FixRISCVCallsPass::runOnFunctions(BinaryContext &BC) {
if (!BC.isRISCV() || !BC.HasRelocations)
- return;
+ return Error::success();
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
@@ -79,6 +79,8 @@ void FixRISCVCallsPass::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, nullptr,
"FixRISCVCalls");
+
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/FixRelaxationPass.cpp b/bolt/lib/Passes/FixRelaxationPass.cpp
index 3dd19b6b43b7..a49fb9894e80 100644
--- a/bolt/lib/Passes/FixRelaxationPass.cpp
+++ b/bolt/lib/Passes/FixRelaxationPass.cpp
@@ -47,9 +47,9 @@ void FixRelaxations::runOnFunction(BinaryFunction &BF) {
}
}
-void FixRelaxations::runOnFunctions(BinaryContext &BC) {
+Error FixRelaxations::runOnFunctions(BinaryContext &BC) {
if (!BC.isAArch64() || !BC.HasRelocations)
- return;
+ return Error::success();
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
runOnFunction(BF);
@@ -58,6 +58,7 @@ void FixRelaxations::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, nullptr,
"FixRelaxations");
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp
index 1e6be498a479..7f1245e39f56 100644
--- a/bolt/lib/Passes/FrameAnalysis.cpp
+++ b/bolt/lib/Passes/FrameAnalysis.cpp
@@ -124,7 +124,7 @@ class FrameAccessAnalysis {
if (IsIndexed || (!FIE.Size && (FIE.IsLoad || FIE.IsStore))) {
LLVM_DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
- LLVM_DEBUG(BC.printInstruction(outs(), Inst, 0, &BF, true, false, false));
+ LLVM_DEBUG(BC.printInstruction(dbgs(), Inst, 0, &BF, true, false, false));
LLVM_DEBUG(Inst.dump());
return false;
}
@@ -570,13 +570,14 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
}
void FrameAnalysis::printStats() {
- outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized
- << " function(s) were not optimized.\n"
- << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI
- << " function(s) "
- << format("(%.1lf%% dyn cov)",
+ BC.outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized
+ << " function(s) were not optimized.\n"
+ << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI
+ << " function(s) "
+ << format(
+ "(%.1lf%% dyn cov)",
(100.0 * CountFunctionsFailedRestoreFI / CountDenominator))
- << " could not have its frame indices restored.\n";
+ << " could not have its frame indices restored.\n";
}
void FrameAnalysis::clearSPTMap() {
diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp
index 6f6dea08950a..fb5f8eafa5cf 100644
--- a/bolt/lib/Passes/FrameOptimizer.cpp
+++ b/bolt/lib/Passes/FrameOptimizer.cpp
@@ -221,9 +221,9 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
-void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
+Error FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
if (opts::FrameOptimization == FOP_NONE)
- return;
+ return Error::success();
std::unique_ptr<BinaryFunctionCallGraph> CG;
std::unique_ptr<FrameAnalysis> FA;
@@ -285,29 +285,31 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
{
NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
"FOP breakdown", opts::TimeOpts);
- performShrinkWrapping(*RA, *FA, BC);
+ if (Error E = performShrinkWrapping(*RA, *FA, BC))
+ return Error(std::move(E));
}
- outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
- << " redundant load(s) and " << NumRedundantStores
- << " unused store(s)\n";
- outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
- << " and frequency of unused stores is " << FreqRedundantStores
- << "\n";
- outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
- << FreqLoadsChangedToReg
- << " and frequency of loads changed to use an immediate is "
- << FreqLoadsChangedToImm << "\n";
- outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
- << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
- << NumRedundantStores << " store(s)\n";
+ BC.outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
+ << " redundant load(s) and " << NumRedundantStores
+ << " unused store(s)\n";
+ BC.outs() << "BOLT-INFO: Frequency of redundant loads is "
+ << FreqRedundantLoads << " and frequency of unused stores is "
+ << FreqRedundantStores << "\n";
+ BC.outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
+ << FreqLoadsChangedToReg
+ << " and frequency of loads changed to use an immediate is "
+ << FreqLoadsChangedToImm << "\n";
+ BC.outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
+ << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
+ << NumRedundantStores << " store(s)\n";
FA->printStats();
- ShrinkWrapping::printStats();
+ ShrinkWrapping::printStats(BC);
+ return Error::success();
}
-void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
- const FrameAnalysis &FA,
- BinaryContext &BC) {
+Error FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
+ const FrameAnalysis &FA,
+ BinaryContext &BC) {
// Initialize necessary annotations to allow safe parallel accesses to
// annotation index in MIB
BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
@@ -357,12 +359,21 @@ void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
const bool HotOnly = opts::FrameOptimization == FOP_HOT;
+ Error SWError = Error::success();
+
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
ShrinkWrapping SW(FA, BF, Info, AllocatorId);
- if (SW.perform(HotOnly)) {
+ auto ChangedOrErr = SW.perform(HotOnly);
+ if (auto E = ChangedOrErr.takeError()) {
+ std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
+ SWError = joinErrors(std::move(SWError), Error(std::move(E)));
+ return;
+ }
+ const bool Changed = *ChangedOrErr;
+ if (Changed) {
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
FuncsChanged.insert(&BF);
LLVM_DEBUG(LogFunc(BF));
@@ -374,10 +385,11 @@ void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
SkipPredicate, "shrink-wrapping");
if (!Top10Funcs.empty()) {
- outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
+ BC.outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
for (const auto &Elmt : Top10Funcs)
- outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
+ BC.outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
}
+ return SWError;
}
} // namespace bolt
diff --git a/bolt/lib/Passes/Hugify.cpp b/bolt/lib/Passes/Hugify.cpp
index d2a64fb97c19..b77356153bfd 100644
--- a/bolt/lib/Passes/Hugify.cpp
+++ b/bolt/lib/Passes/Hugify.cpp
@@ -16,10 +16,10 @@ using namespace llvm;
namespace llvm {
namespace bolt {
-void HugePage::runOnFunctions(BinaryContext &BC) {
+Error HugePage::runOnFunctions(BinaryContext &BC) {
auto *RtLibrary = BC.getRuntimeLibrary();
if (!RtLibrary || !BC.isELF() || !BC.StartFunctionAddress) {
- return;
+ return Error::success();
}
auto createSimpleFunction =
@@ -45,6 +45,7 @@ void HugePage::runOnFunctions(BinaryContext &BC) {
const MCSymbol *StartSym = Start->getSymbol();
createSimpleFunction("__bolt_hugify_start_program",
BC.MIB->createSymbolTrampoline(StartSym, BC.Ctx.get()));
+ return Error::success();
}
} // namespace bolt
} // namespace llvm
diff --git a/bolt/lib/Passes/IdenticalCodeFolding.cpp b/bolt/lib/Passes/IdenticalCodeFolding.cpp
index dfbc72e48e5d..9f8d82b05ccf 100644
--- a/bolt/lib/Passes/IdenticalCodeFolding.cpp
+++ b/bolt/lib/Passes/IdenticalCodeFolding.cpp
@@ -341,7 +341,7 @@ typedef std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>,
namespace llvm {
namespace bolt {
-void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
+Error IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
const size_t OriginalFunctionCount = BC.getBinaryFunctions().size();
uint64_t NumFunctionsFolded = 0;
std::atomic<uint64_t> NumJTFunctionsFolded{0};
@@ -508,14 +508,16 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
});
if (NumFunctionsFolded)
- outs() << "BOLT-INFO: ICF folded " << NumFunctionsFolded << " out of "
- << OriginalFunctionCount << " functions in " << Iteration
- << " passes. " << NumJTFunctionsFolded
- << " functions had jump tables.\n"
- << "BOLT-INFO: Removing all identical functions will save "
- << format("%.2lf", (double)BytesSavedEstimate / 1024)
- << " KB of code space. Folded functions were called " << NumCalled
- << " times based on profile.\n";
+ BC.outs() << "BOLT-INFO: ICF folded " << NumFunctionsFolded << " out of "
+ << OriginalFunctionCount << " functions in " << Iteration
+ << " passes. " << NumJTFunctionsFolded
+ << " functions had jump tables.\n"
+ << "BOLT-INFO: Removing all identical functions will save "
+ << format("%.2lf", (double)BytesSavedEstimate / 1024)
+ << " KB of code space. Folded functions were called " << NumCalled
+ << " times based on profile.\n";
+
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index 451758161ef5..55eede641fd2 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -171,9 +171,10 @@ static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
if (BB.getKnownExecutionCount() == 0 ||
SuccBB->getKnownExecutionCount() == 0) {
- errs() << "BOLT-WARNING: profile verification failed after ICP for "
- "function "
- << BF << '\n';
+ BF.getBinaryContext().errs()
+ << "BOLT-WARNING: profile verification failed after ICP for "
+ "function "
+ << BF << '\n';
IsValid = false;
}
}
@@ -526,6 +527,7 @@ IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
size_t &N, BinaryBasicBlock &BB,
MCInst &CallInst,
MCInst *&TargetFetchInst) const {
+ const BinaryContext &BC = BB.getFunction()->getBinaryContext();
const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
SymTargetsType SymTargets;
@@ -556,8 +558,9 @@ IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
if (!HotTargets.empty()) {
if (opts::Verbosity >= 1)
for (size_t I = 0; I < HotTargets.size(); ++I)
- outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
- << ", " << HotTargets[I].second << ")\n";
+ BC.outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
+ << HotTargets[I].first << ", " << HotTargets[I].second
+ << ")\n";
// Recompute hottest targets, now discriminating which index is hot
// NOTE: This is a tradeoff. On one hand, we get index information. On the
@@ -611,9 +614,9 @@ IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
N = I;
if (N == 0 && opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
- << BB.getName() << ": failed to meet thresholds after memory "
- << "profile data was loaded.\n";
+ BC.outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
+ << BB.getName() << ": failed to meet thresholds after memory "
+ << "profile data was loaded.\n";
return SymTargets;
}
}
@@ -974,9 +977,9 @@ size_t IndirectCallPromotion::canPromoteCallsite(
if (Targets.empty() || !NumCalls) {
if (opts::Verbosity >= 1) {
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
- outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in "
- << BB.getName() << ", calls = " << NumCalls
- << ", targets empty or NumCalls == 0.\n";
+ BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
+ << " in " << BB.getName() << ", calls = " << NumCalls
+ << ", targets empty or NumCalls == 0.\n";
}
return 0;
}
@@ -1015,10 +1018,10 @@ size_t IndirectCallPromotion::canPromoteCallsite(
if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
if (opts::Verbosity >= 1) {
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
- outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
- << " in " << BB.getName() << ", calls = " << NumCalls
- << ", top N mis. frequency " << format("%.1f", TopNFrequency)
- << "% < " << opts::ICPMispredictThreshold << "%\n";
+ BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
+ << " in " << BB.getName() << ", calls = " << NumCalls
+ << ", top N mis. frequency " << format("%.1f", TopNFrequency)
+ << "% < " << opts::ICPMispredictThreshold << "%\n";
}
return 0;
}
@@ -1061,11 +1064,11 @@ size_t IndirectCallPromotion::canPromoteCallsite(
if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
if (opts::Verbosity >= 1) {
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
- outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
- << " in " << BB.getName() << ", calls = " << NumCalls
- << ", top N mispredict frequency "
- << format("%.1f", TopNMispredictFrequency) << "% < "
- << opts::ICPMispredictThreshold << "%\n";
+ BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
+ << " in " << BB.getName() << ", calls = " << NumCalls
+ << ", top N mispredict frequency "
+ << format("%.1f", TopNMispredictFrequency) << "% < "
+ << opts::ICPMispredictThreshold << "%\n";
}
return 0;
}
@@ -1106,29 +1109,29 @@ void IndirectCallPromotion::printCallsiteInfo(
const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
- outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
- << " @ " << InstIdx << " in " << BB.getName()
- << " -> calls = " << NumCalls
- << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
- << "\n";
+ BC.outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
+ << " @ " << InstIdx << " in " << BB.getName()
+ << " -> calls = " << NumCalls
+ << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
+ << "\n";
for (size_t I = 0; I < N; I++) {
const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
- outs() << "BOLT-INFO: ";
+ BC.outs() << "BOLT-INFO: ";
if (Targets[I].To.Sym)
- outs() << Targets[I].To.Sym->getName();
+ BC.outs() << Targets[I].To.Sym->getName();
else
- outs() << Targets[I].To.Addr;
- outs() << ", calls = " << Targets[I].Branches
- << ", mispreds = " << Targets[I].Mispreds
- << ", taken freq = " << format("%.1f", Frequency) << "%"
- << ", mis. freq = " << format("%.1f", MisFrequency) << "%";
+ BC.outs() << Targets[I].To.Addr;
+ BC.outs() << ", calls = " << Targets[I].Branches
+ << ", mispreds = " << Targets[I].Mispreds
+ << ", taken freq = " << format("%.1f", Frequency) << "%"
+ << ", mis. freq = " << format("%.1f", MisFrequency) << "%";
bool First = true;
for (uint64_t JTIndex : Targets[I].JTIndices) {
- outs() << (First ? ", indices = " : ", ") << JTIndex;
+ BC.outs() << (First ? ", indices = " : ", ") << JTIndex;
First = false;
}
- outs() << "\n";
+ BC.outs() << "\n";
}
LLVM_DEBUG({
@@ -1137,9 +1140,9 @@ void IndirectCallPromotion::printCallsiteInfo(
});
}
-void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
+Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
if (opts::ICP == ICP_NONE)
- return;
+ return Error::success();
auto &BFs = BC.getBinaryFunctions();
@@ -1222,9 +1225,9 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
Functions.insert(std::get<2>(IC));
++Num;
}
- outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
- << ", " << Num << " callsites cover " << opts::ICPTopCallsites
- << "% of all indirect calls\n";
+ BC.outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
+ << ", " << Num << " callsites cover " << opts::ICPTopCallsites
+ << "% of all indirect calls\n";
}
for (BinaryFunction *FuncPtr : Functions) {
@@ -1301,11 +1304,11 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
Info.getLivenessAnalysis().getStateBefore(Inst);
if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
- << InstIdx << " in " << BB->getName()
- << ", calls = " << NumCalls
- << (State ? ", cannot clobber flags reg.\n"
- : ", no liveness data available.\n");
+ BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
+ << InstIdx << " in " << BB->getName()
+ << ", calls = " << NumCalls
+ << (State ? ", cannot clobber flags reg.\n"
+ : ", no liveness data available.\n");
continue;
}
}
@@ -1341,11 +1344,11 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
if (SymTargets.size() < N) {
const size_t LastTarget = SymTargets.size();
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
- << InstIdx << " in " << BB->getName()
- << ", calls = " << NumCalls
- << ", ICP failed to find target symbol for "
- << Targets[LastTarget].To.Sym->getName() << "\n";
+ BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
+ << InstIdx << " in " << BB->getName()
+ << ", calls = " << NumCalls
+ << ", ICP failed to find target symbol for "
+ << Targets[LastTarget].To.Sym->getName() << "\n";
continue;
}
@@ -1374,10 +1377,10 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
if (ICPcode.empty()) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
- << InstIdx << " in " << BB->getName()
- << ", calls = " << NumCalls
- << ", unable to generate promoted call code.\n";
+ BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
+ << InstIdx << " in " << BB->getName()
+ << ", calls = " << NumCalls
+ << ", unable to generate promoted call code.\n";
continue;
}
@@ -1410,9 +1413,9 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
BBs.push_back(MergeBlock);
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
- << InstIdx << " in " << BB->getName()
- << " -> calls = " << NumCalls << "\n";
+ BC.outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
+ << InstIdx << " in " << BB->getName()
+ << " -> calls = " << NumCalls << "\n";
if (IsJumpTable)
++TotalOptimizedJumpTableCallsites;
@@ -1426,52 +1429,54 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
TotalIndirectJmps += FuncTotalIndirectJmps;
}
- outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
- << TotalIndirectCallsites << "\n"
- << "BOLT-INFO: ICP total jump table callsites = "
- << TotalJumpTableCallsites << "\n"
- << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
- << "BOLT-INFO: ICP percentage of calls that are indirect = "
- << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
- << "BOLT-INFO: ICP percentage of indirect calls that can be "
- "optimized = "
- << format("%.1f", (100.0 * TotalNumFrequentCalls) /
- std::max<size_t>(TotalIndirectCalls, 1))
- << "%\n"
- << "BOLT-INFO: ICP percentage of indirect callsites that are "
- "optimized = "
- << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
- std::max<uint64_t>(TotalIndirectCallsites, 1))
- << "%\n"
- << "BOLT-INFO: ICP number of method load elimination candidates = "
- << TotalMethodLoadEliminationCandidates << "\n"
- << "BOLT-INFO: ICP percentage of method calls candidates that have "
- "loads eliminated = "
- << format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
- std::max<uint64_t>(
- TotalMethodLoadEliminationCandidates, 1))
- << "%\n"
- << "BOLT-INFO: ICP percentage of indirect branches that are "
- "optimized = "
- << format("%.1f", (100.0 * TotalNumFrequentJmps) /
- std::max<uint64_t>(TotalIndirectJmps, 1))
- << "%\n"
- << "BOLT-INFO: ICP percentage of jump table callsites that are "
- << "optimized = "
- << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
- std::max<uint64_t>(TotalJumpTableCallsites, 1))
- << "%\n"
- << "BOLT-INFO: ICP number of jump table callsites that can use hot "
- << "indices = " << TotalIndexBasedCandidates << "\n"
- << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
- "indices = "
- << format("%.1f", (100.0 * TotalIndexBasedJumps) /
- std::max<uint64_t>(TotalIndexBasedCandidates, 1))
- << "%\n";
+ BC.outs()
+ << "BOLT-INFO: ICP total indirect callsites with profile = "
+ << TotalIndirectCallsites << "\n"
+ << "BOLT-INFO: ICP total jump table callsites = "
+ << TotalJumpTableCallsites << "\n"
+ << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
+ << "BOLT-INFO: ICP percentage of calls that are indirect = "
+ << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
+ << "BOLT-INFO: ICP percentage of indirect calls that can be "
+ "optimized = "
+ << format("%.1f", (100.0 * TotalNumFrequentCalls) /
+ std::max<size_t>(TotalIndirectCalls, 1))
+ << "%\n"
+ << "BOLT-INFO: ICP percentage of indirect callsites that are "
+ "optimized = "
+ << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
+ std::max<uint64_t>(TotalIndirectCallsites, 1))
+ << "%\n"
+ << "BOLT-INFO: ICP number of method load elimination candidates = "
+ << TotalMethodLoadEliminationCandidates << "\n"
+ << "BOLT-INFO: ICP percentage of method calls candidates that have "
+ "loads eliminated = "
+ << format("%.1f",
+ (100.0 * TotalMethodLoadsEliminated) /
+ std::max<uint64_t>(TotalMethodLoadEliminationCandidates, 1))
+ << "%\n"
+ << "BOLT-INFO: ICP percentage of indirect branches that are "
+ "optimized = "
+ << format("%.1f", (100.0 * TotalNumFrequentJmps) /
+ std::max<uint64_t>(TotalIndirectJmps, 1))
+ << "%\n"
+ << "BOLT-INFO: ICP percentage of jump table callsites that are "
+ << "optimized = "
+ << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
+ std::max<uint64_t>(TotalJumpTableCallsites, 1))
+ << "%\n"
+ << "BOLT-INFO: ICP number of jump table callsites that can use hot "
+ << "indices = " << TotalIndexBasedCandidates << "\n"
+ << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
+ "indices = "
+ << format("%.1f", (100.0 * TotalIndexBasedJumps) /
+ std::max<uint64_t>(TotalIndexBasedCandidates, 1))
+ << "%\n";
#ifndef NDEBUG
verifyProfile(BFs);
#endif
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp
index 8dcb8934f2d2..a3b2017aa32a 100644
--- a/bolt/lib/Passes/Inliner.cpp
+++ b/bolt/lib/Passes/Inliner.cpp
@@ -496,11 +496,11 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
return DidInlining;
}
-void Inliner::runOnFunctions(BinaryContext &BC) {
+Error Inliner::runOnFunctions(BinaryContext &BC) {
opts::syncOptions();
if (!opts::inliningEnabled())
- return;
+ return Error::success();
bool InlinedOnce;
unsigned NumIters = 0;
@@ -540,10 +540,11 @@ void Inliner::runOnFunctions(BinaryContext &BC) {
} while (InlinedOnce && NumIters < opts::InlineMaxIters);
if (NumInlinedCallSites)
- outs() << "BOLT-INFO: inlined " << NumInlinedDynamicCalls << " calls at "
- << NumInlinedCallSites << " call sites in " << NumIters
- << " iteration(s). Change in binary size: " << TotalInlinedBytes
- << " bytes.\n";
+ BC.outs() << "BOLT-INFO: inlined " << NumInlinedDynamicCalls << " calls at "
+ << NumInlinedCallSites << " call sites in " << NumIters
+ << " iteration(s). Change in binary size: " << TotalInlinedBytes
+ << " bytes.\n";
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
index e54b0cacc4ca..760ca84b4ef1 100644
--- a/bolt/lib/Passes/Instrumentation.cpp
+++ b/bolt/lib/Passes/Instrumentation.cpp
@@ -96,8 +96,8 @@ static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) {
for (const MCInst &Inst : BB)
if (BC.MIB->isAArch64Exclusive(Inst)) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INSTRUMENTER: Function " << Function
- << " has exclusive instructions, skip instrumentation\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Function " << Function
+ << " has exclusive instructions, skip instrumentation\n";
return true;
}
@@ -526,7 +526,7 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
FuncDesc->EdgesSet.clear();
}
-void Instrumentation::runOnFunctions(BinaryContext &BC) {
+Error Instrumentation::runOnFunctions(BinaryContext &BC) {
const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false,
/*IsText=*/false,
/*IsAllocatable=*/true);
@@ -567,17 +567,16 @@ void Instrumentation::runOnFunctions(BinaryContext &BC) {
ErrorOr<BinarySection &> SetupSection =
BC.getUniqueSectionByName("I__setup");
- if (!SetupSection) {
- llvm::errs() << "Cannot find I__setup section\n";
- exit(1);
- }
+ if (!SetupSection)
+ return createFatalBOLTError("Cannot find I__setup section\n");
+
MCSymbol *Target = BC.registerNameAtAddress(
"__bolt_instr_setup", SetupSection->getAddress(), 0, 0);
MCInst NewInst;
BC.MIB->createCall(NewInst, Target, BC.Ctx.get());
BB.insertInstruction(BB.begin(), std::move(NewInst));
} else {
- llvm::errs() << "BOLT-WARNING: Entry point not found\n";
+ BC.errs() << "BOLT-WARNING: Entry point not found\n";
}
if (BinaryData *BD = BC.getBinaryDataByName("___GLOBAL_init_65535/1")) {
@@ -586,10 +585,9 @@ void Instrumentation::runOnFunctions(BinaryContext &BC) {
BinaryBasicBlock &BB = Ctor->front();
ErrorOr<BinarySection &> FiniSection =
BC.getUniqueSectionByName("I__fini");
- if (!FiniSection) {
- llvm::errs() << "Cannot find I__fini section\n";
- exit(1);
- }
+ if (!FiniSection)
+ return createFatalBOLTError("Cannot find I__fini section");
+
MCSymbol *Target = BC.registerNameAtAddress(
"__bolt_instr_fini", FiniSection->getAddress(), 0, 0);
auto IsLEA = [&BC](const MCInst &Inst) { return BC.MIB->isLEA64r(Inst); };
@@ -598,11 +596,12 @@ void Instrumentation::runOnFunctions(BinaryContext &BC) {
LEA->getOperand(4).setExpr(
MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *BC.Ctx));
} else {
- llvm::errs() << "BOLT-WARNING: ___GLOBAL_init_65535 not found\n";
+ BC.errs() << "BOLT-WARNING: ___GLOBAL_init_65535 not found\n";
}
}
setupRuntimeLibrary(BC);
+ return Error::success();
}
void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
@@ -688,32 +687,34 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
void Instrumentation::setupRuntimeLibrary(BinaryContext &BC) {
uint32_t FuncDescSize = Summary->getFDSize();
- outs() << "BOLT-INSTRUMENTER: Number of indirect call site descriptors: "
- << Summary->IndCallDescriptions.size() << "\n";
- outs() << "BOLT-INSTRUMENTER: Number of indirect call target descriptors: "
- << Summary->IndCallTargetDescriptions.size() << "\n";
- outs() << "BOLT-INSTRUMENTER: Number of function descriptors: "
- << Summary->FunctionDescriptions.size() << "\n";
- outs() << "BOLT-INSTRUMENTER: Number of branch counters: " << BranchCounters
- << "\n";
- outs() << "BOLT-INSTRUMENTER: Number of ST leaf node counters: "
- << LeafNodeCounters << "\n";
- outs() << "BOLT-INSTRUMENTER: Number of direct call counters: "
- << DirectCallCounters << "\n";
- outs() << "BOLT-INSTRUMENTER: Total number of counters: "
- << Summary->Counters.size() << "\n";
- outs() << "BOLT-INSTRUMENTER: Total size of counters: "
- << (Summary->Counters.size() * 8) << " bytes (static alloc memory)\n";
- outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
- << Summary->StringTable.size() << " bytes in file\n";
- outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
- << (FuncDescSize +
- Summary->IndCallDescriptions.size() * sizeof(IndCallDescription) +
- Summary->IndCallTargetDescriptions.size() *
- sizeof(IndCallTargetDescription))
- << " bytes in file\n";
- outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
- << opts::InstrumentationFilename << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Number of indirect call site descriptors: "
+ << Summary->IndCallDescriptions.size() << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Number of indirect call target descriptors: "
+ << Summary->IndCallTargetDescriptions.size() << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Number of function descriptors: "
+ << Summary->FunctionDescriptions.size() << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Number of branch counters: "
+ << BranchCounters << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Number of ST leaf node counters: "
+ << LeafNodeCounters << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Number of direct call counters: "
+ << DirectCallCounters << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Total number of counters: "
+ << Summary->Counters.size() << "\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Total size of counters: "
+ << (Summary->Counters.size() * 8)
+ << " bytes (static alloc memory)\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
+ << Summary->StringTable.size() << " bytes in file\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
+ << (FuncDescSize +
+ Summary->IndCallDescriptions.size() *
+ sizeof(IndCallDescription) +
+ Summary->IndCallTargetDescriptions.size() *
+ sizeof(IndCallTargetDescription))
+ << " bytes in file\n";
+ BC.outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
+ << opts::InstrumentationFilename << "\n";
InstrumentationRuntimeLibrary *RtLibrary =
static_cast<InstrumentationRuntimeLibrary *>(BC.getRuntimeLibrary());
diff --git a/bolt/lib/Passes/JTFootprintReduction.cpp b/bolt/lib/Passes/JTFootprintReduction.cpp
index d690e4d0c003..fd291f96004d 100644
--- a/bolt/lib/Passes/JTFootprintReduction.cpp
+++ b/bolt/lib/Passes/JTFootprintReduction.cpp
@@ -246,9 +246,9 @@ void JTFootprintReduction::optimizeFunction(BinaryFunction &Function,
++I;
}
-void JTFootprintReduction::runOnFunctions(BinaryContext &BC) {
+Error JTFootprintReduction::runOnFunctions(BinaryContext &BC) {
if (opts::JumpTables == JTS_BASIC && BC.HasRelocations)
- return;
+ return Error::success();
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
@@ -272,23 +272,24 @@ void JTFootprintReduction::runOnFunctions(BinaryContext &BC) {
}
if (TotalJTs == TotalJTsDenied) {
- outs() << "BOLT-INFO: JT Footprint reduction: no changes were made.\n";
- return;
+ BC.outs() << "BOLT-INFO: JT Footprint reduction: no changes were made.\n";
+ return Error::success();
}
- outs() << "BOLT-INFO: JT Footprint reduction stats (simple funcs only):\n";
+ BC.outs() << "BOLT-INFO: JT Footprint reduction stats (simple funcs only):\n";
if (OptimizedScore)
- outs() << format("\t %.2lf%%", (OptimizedScore * 100.0 / TotalJTScore))
- << " of dynamic JT entries were reduced.\n";
- outs() << "\t " << TotalJTs - TotalJTsDenied << " of " << TotalJTs
- << " jump tables affected.\n";
- outs() << "\t " << IndJmps - IndJmpsDenied << " of " << IndJmps
- << " indirect jumps to JTs affected.\n";
- outs() << "\t " << NumJTsBadMatch
- << " JTs discarded due to unsupported jump pattern.\n";
- outs() << "\t " << NumJTsNoReg
- << " JTs discarded due to register unavailability.\n";
- outs() << "\t " << BytesSaved << " bytes saved.\n";
+ BC.outs() << format("\t %.2lf%%", (OptimizedScore * 100.0 / TotalJTScore))
+ << " of dynamic JT entries were reduced.\n";
+ BC.outs() << "\t " << TotalJTs - TotalJTsDenied << " of " << TotalJTs
+ << " jump tables affected.\n";
+ BC.outs() << "\t " << IndJmps - IndJmpsDenied << " of " << IndJmps
+ << " indirect jumps to JTs affected.\n";
+ BC.outs() << "\t " << NumJTsBadMatch
+ << " JTs discarded due to unsupported jump pattern.\n";
+ BC.outs() << "\t " << NumJTsNoReg
+ << " JTs discarded due to register unavailability.\n";
+ BC.outs() << "\t " << BytesSaved << " bytes saved.\n";
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index ded0db2cd30b..c483f70a836e 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -459,13 +459,13 @@ uint64_t LongJmpPass::getSymbolAddress(const BinaryContext &BC,
return Iter->second;
}
-bool LongJmpPass::relaxStub(BinaryBasicBlock &StubBB) {
+Error LongJmpPass::relaxStub(BinaryBasicBlock &StubBB, bool &Modified) {
const BinaryFunction &Func = *StubBB.getFunction();
const BinaryContext &BC = Func.getBinaryContext();
const int Bits = StubBits[&StubBB];
// Already working with the largest range?
if (Bits == static_cast<int>(BC.AsmInfo->getCodePointerSize() * 8))
- return false;
+ return Error::success();
const static int RangeShortJmp = BC.MIB->getShortJmpEncodingSize();
const static int RangeSingleInstr = BC.MIB->getUncondBranchEncodingSize();
@@ -481,12 +481,12 @@ bool LongJmpPass::relaxStub(BinaryBasicBlock &StubBB) {
: TgtAddress - DotAddress;
// If it fits in one instruction, do not relax
if (!(PCRelTgtAddress & SingleInstrMask))
- return false;
+ return Error::success();
// Fits short jmp
if (!(PCRelTgtAddress & ShortJmpMask)) {
if (Bits >= RangeShortJmp)
- return false;
+ return Error::success();
LLVM_DEBUG(dbgs() << "Relaxing stub to short jump. PCRelTgtAddress = "
<< Twine::utohexstr(PCRelTgtAddress)
@@ -494,22 +494,23 @@ bool LongJmpPass::relaxStub(BinaryBasicBlock &StubBB) {
<< "\n");
relaxStubToShortJmp(StubBB, RealTargetSym);
StubBits[&StubBB] = RangeShortJmp;
- return true;
+ Modified = true;
+ return Error::success();
}
// The long jmp uses absolute address on AArch64
// So we could not use it for PIC binaries
- if (BC.isAArch64() && !BC.HasFixedLoadAddress) {
- errs() << "BOLT-ERROR: Unable to relax stub for PIC binary\n";
- exit(1);
- }
+ if (BC.isAArch64() && !BC.HasFixedLoadAddress)
+ return createFatalBOLTError(
+ "BOLT-ERROR: Unable to relax stub for PIC binary\n");
LLVM_DEBUG(dbgs() << "Relaxing stub to long jump. PCRelTgtAddress = "
<< Twine::utohexstr(PCRelTgtAddress)
<< " RealTargetSym = " << RealTargetSym->getName() << "\n");
relaxStubToLongJmp(StubBB, RealTargetSym);
StubBits[&StubBB] = static_cast<int>(BC.AsmInfo->getCodePointerSize() * 8);
- return true;
+ Modified = true;
+ return Error::success();
}
bool LongJmpPass::needsStub(const BinaryBasicBlock &BB, const MCInst &Inst,
@@ -539,9 +540,8 @@ bool LongJmpPass::needsStub(const BinaryBasicBlock &BB, const MCInst &Inst,
return PCOffset < MinVal || PCOffset > MaxVal;
}
-bool LongJmpPass::relax(BinaryFunction &Func) {
+Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
const BinaryContext &BC = Func.getBinaryContext();
- bool Modified = false;
assert(BC.isAArch64() && "Unsupported arch");
constexpr int InsnSize = 4; // AArch64
@@ -613,7 +613,8 @@ bool LongJmpPass::relax(BinaryFunction &Func) {
if (!Stubs[&Func].count(&BB) || !BB.isValid())
continue;
- Modified |= relaxStub(BB);
+ if (auto E = relaxStub(BB, Modified))
+ return Error(std::move(E));
}
for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Elmt :
@@ -625,11 +626,11 @@ bool LongJmpPass::relax(BinaryFunction &Func) {
Func.insertBasicBlocks(Elmt.first, std::move(NewBBs), true);
}
- return Modified;
+ return Error::success();
}
-void LongJmpPass::runOnFunctions(BinaryContext &BC) {
- outs() << "BOLT-INFO: Starting stub-insertion pass\n";
+Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+ BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
bool Modified;
uint32_t Iterations = 0;
@@ -639,19 +640,19 @@ void LongJmpPass::runOnFunctions(BinaryContext &BC) {
tentativeLayout(BC, Sorted);
updateStubGroups();
for (BinaryFunction *Func : Sorted) {
- if (relax(*Func)) {
- // Don't ruin non-simple functions, they can't afford to have the layout
- // changed.
- if (Func->isSimple())
- Func->fixBranches();
- Modified = true;
- }
+ if (auto E = relax(*Func, Modified))
+ return Error(std::move(E));
+ // Don't ruin non-simple functions, they can't afford to have the layout
+ // changed.
+ if (Modified && Func->isSimple())
+ Func->fixBranches();
}
} while (Modified);
- outs() << "BOLT-INFO: Inserted " << NumHotStubs
- << " stubs in the hot area and " << NumColdStubs
- << " stubs in the cold area. Shared " << NumSharedStubs
- << " times, iterated " << Iterations << " times.\n";
+ BC.outs() << "BOLT-INFO: Inserted " << NumHotStubs
+ << " stubs in the hot area and " << NumColdStubs
+ << " stubs in the cold area. Shared " << NumSharedStubs
+ << " times, iterated " << Iterations << " times.\n";
+ return Error::success();
}
} // namespace bolt
} // namespace llvm
diff --git a/bolt/lib/Passes/LoopInversionPass.cpp b/bolt/lib/Passes/LoopInversionPass.cpp
index f30e1a8f7450..250a971d204c 100644
--- a/bolt/lib/Passes/LoopInversionPass.cpp
+++ b/bolt/lib/Passes/LoopInversionPass.cpp
@@ -84,11 +84,11 @@ bool LoopInversionPass::runOnFunction(BinaryFunction &BF) {
return IsChanged;
}
-void LoopInversionPass::runOnFunctions(BinaryContext &BC) {
+Error LoopInversionPass::runOnFunctions(BinaryContext &BC) {
std::atomic<uint64_t> ModifiedFuncCount{0};
if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE ||
opts::LoopReorder == false)
- return;
+ return Error::success();
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
if (runOnFunction(BF))
@@ -103,8 +103,9 @@ void LoopInversionPass::runOnFunctions(BinaryContext &BC) {
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, SkipFunc,
"LoopInversionPass");
- outs() << "BOLT-INFO: " << ModifiedFuncCount
- << " Functions were reordered by LoopInversionPass\n";
+ BC.outs() << "BOLT-INFO: " << ModifiedFuncCount
+ << " Functions were reordered by LoopInversionPass\n";
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/PLTCall.cpp b/bolt/lib/Passes/PLTCall.cpp
index aec75be84bfa..d0276f22e14e 100644
--- a/bolt/lib/Passes/PLTCall.cpp
+++ b/bolt/lib/Passes/PLTCall.cpp
@@ -43,9 +43,9 @@ PLT("plt",
namespace llvm {
namespace bolt {
-void PLTCall::runOnFunctions(BinaryContext &BC) {
+Error PLTCall::runOnFunctions(BinaryContext &BC) {
if (opts::PLT == OT_NONE)
- return;
+ return Error::success();
uint64_t NumCallsOptimized = 0;
for (auto &It : BC.getBinaryFunctions()) {
@@ -80,9 +80,10 @@ void PLTCall::runOnFunctions(BinaryContext &BC) {
if (NumCallsOptimized) {
BC.RequiresZNow = true;
- outs() << "BOLT-INFO: " << NumCallsOptimized
- << " PLT calls in the binary were optimized.\n";
+ BC.outs() << "BOLT-INFO: " << NumCallsOptimized
+ << " PLT calls in the binary were optimized.\n";
}
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/PatchEntries.cpp b/bolt/lib/Passes/PatchEntries.cpp
index ee7512d89962..981d1b70af90 100644
--- a/bolt/lib/Passes/PatchEntries.cpp
+++ b/bolt/lib/Passes/PatchEntries.cpp
@@ -31,7 +31,7 @@ llvm::cl::opt<bool>
namespace llvm {
namespace bolt {
-void PatchEntries::runOnFunctions(BinaryContext &BC) {
+Error PatchEntries::runOnFunctions(BinaryContext &BC) {
if (!opts::ForcePatch) {
// Mark the binary for patching if we did not create external references
// for original code in any of functions we are not going to emit.
@@ -42,11 +42,11 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
});
if (!NeedsPatching)
- return;
+ return Error::success();
}
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: patching entries in original code\n";
+ BC.outs() << "BOLT-INFO: patching entries in original code\n";
// Calculate the size of the patch.
static size_t PatchSize = 0;
@@ -78,8 +78,8 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
const MCSymbol *Symbol) {
if (Offset < NextValidByte) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: unable to patch entry point in " << Function
- << " at offset 0x" << Twine::utohexstr(Offset) << '\n';
+ BC.outs() << "BOLT-INFO: unable to patch entry point in " << Function
+ << " at offset 0x" << Twine::utohexstr(Offset) << '\n';
return false;
}
@@ -89,8 +89,8 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
NextValidByte = Offset + PatchSize;
if (NextValidByte > Function.getMaxSize()) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: function " << Function
- << " too small to patch its entry point\n";
+ BC.outs() << "BOLT-INFO: function " << Function
+ << " too small to patch its entry point\n";
return false;
}
@@ -101,9 +101,9 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
// We can't change output layout for AArch64 due to LongJmp pass
if (BC.isAArch64()) {
if (opts::ForcePatch) {
- errs() << "BOLT-ERROR: unable to patch entries in " << Function
- << "\n";
- exit(1);
+ BC.errs() << "BOLT-ERROR: unable to patch entries in " << Function
+ << "\n";
+ return createFatalBOLTError("");
}
continue;
@@ -111,8 +111,8 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
// If the original function entries cannot be patched, then we cannot
// safely emit new function body.
- errs() << "BOLT-WARNING: failed to patch entries in " << Function
- << ". The function will not be optimized.\n";
+ BC.errs() << "BOLT-WARNING: failed to patch entries in " << Function
+ << ". The function will not be optimized.\n";
Function.setIgnored();
continue;
}
@@ -138,6 +138,7 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
Function.setIsPatched(true);
}
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/RegAnalysis.cpp b/bolt/lib/Passes/RegAnalysis.cpp
index eab16cb09032..9054385c20b5 100644
--- a/bolt/lib/Passes/RegAnalysis.cpp
+++ b/bolt/lib/Passes/RegAnalysis.cpp
@@ -232,11 +232,11 @@ BitVector RegAnalysis::getFunctionClobberList(const BinaryFunction *Func) {
}
void RegAnalysis::printStats() {
- outs() << "BOLT-INFO REG ANALYSIS: Number of functions conservatively "
- "treated as clobbering all registers: "
- << NumFunctionsAllClobber
- << format(" (%.1lf%% dyn cov)\n",
- (100.0 * CountFunctionsAllClobber / CountDenominator));
+ BC.outs() << "BOLT-INFO REG ANALYSIS: Number of functions conservatively "
+ "treated as clobbering all registers: "
+ << NumFunctionsAllClobber
+ << format(" (%.1lf%% dyn cov)\n",
+ (100.0 * CountFunctionsAllClobber / CountDenominator));
}
} // namespace bolt
diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp
index 8b9dc9c1fdd5..0becfb4a06a3 100644
--- a/bolt/lib/Passes/RegReAssign.cpp
+++ b/bolt/lib/Passes/RegReAssign.cpp
@@ -452,7 +452,7 @@ void RegReAssign::setupConservativePass(
});
}
-void RegReAssign::runOnFunctions(BinaryContext &BC) {
+Error RegReAssign::runOnFunctions(BinaryContext &BC) {
RegScore = std::vector<int64_t>(BC.MRI->getNumRegs(), 0);
RankedRegs = std::vector<size_t>(BC.MRI->getNumRegs(), 0);
@@ -480,18 +480,20 @@ void RegReAssign::runOnFunctions(BinaryContext &BC) {
}
if (FuncsChanged.empty()) {
- outs() << "BOLT-INFO: Reg Reassignment Pass: no changes were made.\n";
- return;
+ BC.outs() << "BOLT-INFO: Reg Reassignment Pass: no changes were made.\n";
+ return Error::success();
}
if (opts::UpdateDebugSections)
- outs() << "BOLT-WARNING: You used -reg-reassign and -update-debug-sections."
- << " Some registers were changed but associated AT_LOCATION for "
- << "impacted variables were NOT updated! This operation is "
- << "currently unsupported by BOLT.\n";
- outs() << "BOLT-INFO: Reg Reassignment Pass Stats:\n";
- outs() << "\t " << FuncsChanged.size() << " functions affected.\n";
- outs() << "\t " << StaticBytesSaved << " static bytes saved.\n";
- outs() << "\t " << DynBytesSaved << " dynamic bytes saved.\n";
+ BC.outs()
+ << "BOLT-WARNING: You used -reg-reassign and -update-debug-sections."
+ << " Some registers were changed but associated AT_LOCATION for "
+ << "impacted variables were NOT updated! This operation is "
+ << "currently unsupported by BOLT.\n";
+ BC.outs() << "BOLT-INFO: Reg Reassignment Pass Stats:\n";
+ BC.outs() << "\t " << FuncsChanged.size() << " functions affected.\n";
+ BC.outs() << "\t " << StaticBytesSaved << " static bytes saved.\n";
+ BC.outs() << "\t " << DynBytesSaved << " dynamic bytes saved.\n";
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/ReorderData.cpp b/bolt/lib/Passes/ReorderData.cpp
index 3a6654cf1e0b..2b04361c9463 100644
--- a/bolt/lib/Passes/ReorderData.cpp
+++ b/bolt/lib/Passes/ReorderData.cpp
@@ -133,7 +133,7 @@ bool filterSymbol(const BinaryData *BD) {
using DataOrder = ReorderData::DataOrder;
-void ReorderData::printOrder(const BinarySection &Section,
+void ReorderData::printOrder(BinaryContext &BC, const BinarySection &Section,
DataOrder::const_iterator Begin,
DataOrder::const_iterator End) const {
uint64_t TotalSize = 0;
@@ -142,19 +142,20 @@ void ReorderData::printOrder(const BinarySection &Section,
const BinaryData *BD = Begin->first;
if (!PrintHeader) {
- outs() << "BOLT-INFO: Hot global symbols for " << Section.getName()
- << ":\n";
+ BC.outs() << "BOLT-INFO: Hot global symbols for " << Section.getName()
+ << ":\n";
PrintHeader = true;
}
- outs() << "BOLT-INFO: " << *BD << ", moveable=" << BD->isMoveable()
- << format(", weight=%.5f\n", double(Begin->second) / BD->getSize());
+ BC.outs() << "BOLT-INFO: " << *BD << ", moveable=" << BD->isMoveable()
+ << format(", weight=%.5f\n",
+ double(Begin->second) / BD->getSize());
TotalSize += BD->getSize();
++Begin;
}
if (TotalSize)
- outs() << "BOLT-INFO: Total hot symbol size = " << TotalSize << "\n";
+ BC.outs() << "BOLT-INFO: Total hot symbol size = " << TotalSize << "\n";
}
DataOrder ReorderData::baseOrder(BinaryContext &BC,
@@ -208,19 +209,19 @@ void ReorderData::assignMemData(BinaryContext &BC) {
}
if (!Counts.empty()) {
- outs() << "BOLT-INFO: Memory stats breakdown:\n";
+ BC.outs() << "BOLT-INFO: Memory stats breakdown:\n";
for (const auto &KV : Counts) {
StringRef Section = KV.first;
const uint64_t Count = KV.second;
- outs() << "BOLT-INFO: " << Section << " = " << Count
- << format(" (%.1f%%)\n", 100.0 * Count / TotalCount);
+ BC.outs() << "BOLT-INFO: " << Section << " = " << Count
+ << format(" (%.1f%%)\n", 100.0 * Count / TotalCount);
if (JumpTableCounts.count(Section) != 0) {
const uint64_t JTCount = JumpTableCounts[Section];
- outs() << "BOLT-INFO: jump tables = " << JTCount
- << format(" (%.1f%%)\n", 100.0 * JTCount / Count);
+ BC.outs() << "BOLT-INFO: jump tables = " << JTCount
+ << format(" (%.1f%%)\n", 100.0 * JTCount / Count);
}
}
- outs() << "BOLT-INFO: Total memory events: " << TotalCount << "\n";
+ BC.outs() << "BOLT-INFO: Total memory events: " << TotalCount << "\n";
}
}
@@ -395,9 +396,9 @@ void ReorderData::setSectionOrder(BinaryContext &BC,
OutputSection.reorderContents(NewOrder, opts::ReorderInplace);
- outs() << "BOLT-INFO: reorder-data: " << Count << "/" << TotalCount
- << format(" (%.1f%%)", 100.0 * Count / TotalCount) << " events, "
- << Offset << " hot bytes\n";
+ BC.outs() << "BOLT-INFO: reorder-data: " << Count << "/" << TotalCount
+ << format(" (%.1f%%)", 100.0 * Count / TotalCount) << " events, "
+ << Offset << " hot bytes\n";
}
bool ReorderData::markUnmoveableSymbols(BinaryContext &BC,
@@ -435,17 +436,17 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC,
return FoundUnmoveable;
}
-void ReorderData::runOnFunctions(BinaryContext &BC) {
+Error ReorderData::runOnFunctions(BinaryContext &BC) {
static const char *DefaultSections[] = {".rodata", ".data", ".bss", nullptr};
if (!BC.HasRelocations || opts::ReorderData.empty())
- return;
+ return Error::success();
// For now
if (opts::JumpTables > JTS_BASIC) {
- outs() << "BOLT-WARNING: jump table support must be basic for "
- << "data reordering to work.\n";
- return;
+ BC.outs() << "BOLT-WARNING: jump table support must be basic for "
+ << "data reordering to work.\n";
+ return Error::success();
}
assignMemData(BC);
@@ -463,14 +464,14 @@ void ReorderData::runOnFunctions(BinaryContext &BC) {
ErrorOr<BinarySection &> Section = BC.getUniqueSectionByName(SectionName);
if (!Section) {
- outs() << "BOLT-WARNING: Section " << SectionName
- << " not found, skipping.\n";
+ BC.outs() << "BOLT-WARNING: Section " << SectionName
+ << " not found, skipping.\n";
continue;
}
if (!isSupported(*Section)) {
- outs() << "BOLT-ERROR: Section " << SectionName << " not supported.\n";
- exit(1);
+ BC.errs() << "BOLT-ERROR: Section " << SectionName << " not supported.\n";
+ return createFatalBOLTError("");
}
Sections.push_back(&*Section);
@@ -483,23 +484,23 @@ void ReorderData::runOnFunctions(BinaryContext &BC) {
unsigned SplitPointIdx;
if (opts::ReorderAlgorithm == opts::ReorderAlgo::REORDER_COUNT) {
- outs() << "BOLT-INFO: reorder-sections: ordering data by count\n";
+ BC.outs() << "BOLT-INFO: reorder-sections: ordering data by count\n";
std::tie(Order, SplitPointIdx) = sortedByCount(BC, *Section);
} else {
- outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n";
+ BC.outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n";
std::tie(Order, SplitPointIdx) =
sortedByFunc(BC, *Section, BC.getBinaryFunctions());
}
auto SplitPoint = Order.begin() + SplitPointIdx;
if (opts::PrintReorderedData)
- printOrder(*Section, Order.begin(), SplitPoint);
+ printOrder(BC, *Section, Order.begin(), SplitPoint);
if (!opts::ReorderInplace || FoundUnmoveable) {
if (opts::ReorderInplace && FoundUnmoveable)
- outs() << "BOLT-INFO: Found unmoveable symbols in "
- << Section->getName() << " falling back to splitting "
- << "instead of in-place reordering.\n";
+ BC.outs() << "BOLT-INFO: Found unmoveable symbols in "
+ << Section->getName() << " falling back to splitting "
+ << "instead of in-place reordering.\n";
// Rename sections.
BinarySection &Hot =
@@ -519,10 +520,12 @@ void ReorderData::runOnFunctions(BinaryContext &BC) {
}
}
} else {
- outs() << "BOLT-WARNING: Inplace section reordering not supported yet.\n";
+ BC.outs()
+ << "BOLT-WARNING: Inplace section reordering not supported yet.\n";
setSectionOrder(BC, *Section, Order.begin(), Order.end());
}
}
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp
index 2446524c1ab4..c2d540135bda 100644
--- a/bolt/lib/Passes/ReorderFunctions.cpp
+++ b/bolt/lib/Passes/ReorderFunctions.cpp
@@ -114,7 +114,8 @@ using NodeId = CallGraph::NodeId;
using Arc = CallGraph::Arc;
using Node = CallGraph::Node;
-void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
+void ReorderFunctions::reorder(BinaryContext &BC,
+ std::vector<Cluster> &&Clusters,
std::map<uint64_t, BinaryFunction> &BFs) {
std::vector<uint64_t> FuncAddr(Cg.numNodes()); // Just for computing stats
uint64_t TotalSize = 0;
@@ -139,10 +140,11 @@ void ReorderFunctions::reorder(std::vector<Cluster> &&Clusters,
if (opts::ReorderFunctions == RT_NONE)
return;
- printStats(Clusters, FuncAddr);
+ printStats(BC, Clusters, FuncAddr);
}
-void ReorderFunctions::printStats(const std::vector<Cluster> &Clusters,
+void ReorderFunctions::printStats(BinaryContext &BC,
+ const std::vector<Cluster> &Clusters,
const std::vector<uint64_t> &FuncAddr) {
if (opts::Verbosity == 0) {
#ifndef NDEBUG
@@ -167,11 +169,11 @@ void ReorderFunctions::printStats(const std::vector<Cluster> &Clusters,
double TotalCalls4KB = 0;
double TotalCalls2MB = 0;
if (PrintDetailed)
- outs() << "BOLT-INFO: Function reordering page layout\n"
- << "BOLT-INFO: ============== page 0 ==============\n";
+ BC.outs() << "BOLT-INFO: Function reordering page layout\n"
+ << "BOLT-INFO: ============== page 0 ==============\n";
for (const Cluster &Cluster : Clusters) {
if (PrintDetailed)
- outs() << format(
+ BC.outs() << format(
"BOLT-INFO: -------- density = %.3lf (%u / %u) --------\n",
Cluster.density(), Cluster.samples(), Cluster.size());
@@ -180,8 +182,8 @@ void ReorderFunctions::printStats(const std::vector<Cluster> &Clusters,
Hotfuncs++;
if (PrintDetailed)
- outs() << "BOLT-INFO: hot func " << *Cg.nodeIdToFunc(FuncId) << " ("
- << Cg.size(FuncId) << ")\n";
+ BC.outs() << "BOLT-INFO: hot func " << *Cg.nodeIdToFunc(FuncId)
+ << " (" << Cg.size(FuncId) << ")\n";
uint64_t Dist = 0;
uint64_t Calls = 0;
@@ -193,12 +195,13 @@ void ReorderFunctions::printStats(const std::vector<Cluster> &Clusters,
(FuncAddr[FuncId] + Arc.avgCallOffset()));
const double W = Arc.weight();
if (D < 64 && PrintDetailed && opts::Verbosity > 2)
- outs() << "BOLT-INFO: short (" << D << "B) call:\n"
- << "BOLT-INFO: Src: " << *Cg.nodeIdToFunc(FuncId) << "\n"
- << "BOLT-INFO: Dst: " << *Cg.nodeIdToFunc(Dst) << "\n"
- << "BOLT-INFO: Weight = " << W << "\n"
- << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset()
- << "\n";
+ BC.outs() << "BOLT-INFO: short (" << D << "B) call:\n"
+ << "BOLT-INFO: Src: " << *Cg.nodeIdToFunc(FuncId)
+ << "\n"
+ << "BOLT-INFO: Dst: " << *Cg.nodeIdToFunc(Dst) << "\n"
+ << "BOLT-INFO: Weight = " << W << "\n"
+ << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset()
+ << "\n";
Calls += W;
if (D < 64)
TotalCalls64B += W;
@@ -208,63 +211,64 @@ void ReorderFunctions::printStats(const std::vector<Cluster> &Clusters,
TotalCalls2MB += W;
Dist += Arc.weight() * D;
if (PrintDetailed)
- outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
- "weight = %.0lf, callDist = %f\n",
- Arc.src(), FuncAddr[Arc.src()],
- Arc.avgCallOffset(), Arc.dst(),
- FuncAddr[Arc.dst()], Arc.weight(), D);
+ BC.outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: "
+ "weight = %.0lf, callDist = %f\n",
+ Arc.src(), FuncAddr[Arc.src()],
+ Arc.avgCallOffset(), Arc.dst(),
+ FuncAddr[Arc.dst()], Arc.weight(), D);
}
TotalCalls += Calls;
TotalDistance += Dist;
TotalSize += Cg.size(FuncId);
if (PrintDetailed) {
- outs() << format("BOLT-INFO: start = %6u : avgCallDist = %lu : ",
- TotalSize, Calls ? Dist / Calls : 0)
- << Cg.nodeIdToFunc(FuncId)->getPrintName() << '\n';
+ BC.outs() << format("BOLT-INFO: start = %6u : avgCallDist = %lu : ",
+ TotalSize, Calls ? Dist / Calls : 0)
+ << Cg.nodeIdToFunc(FuncId)->getPrintName() << '\n';
const uint64_t NewPage = TotalSize / HugePageSize;
if (NewPage != CurPage) {
CurPage = NewPage;
- outs() << format(
+ BC.outs() << format(
"BOLT-INFO: ============== page %u ==============\n", CurPage);
}
}
}
}
}
- outs() << "BOLT-INFO: Function reordering stats\n"
- << format("BOLT-INFO: Number of hot functions: %u\n"
- "BOLT-INFO: Number of clusters: %lu\n",
- Hotfuncs, Clusters.size())
- << format("BOLT-INFO: Final average call distance = %.1lf "
- "(%.0lf / %.0lf)\n",
- TotalCalls ? TotalDistance / TotalCalls : 0, TotalDistance,
- TotalCalls)
- << format("BOLT-INFO: Total Calls = %.0lf\n", TotalCalls);
+ BC.outs() << "BOLT-INFO: Function reordering stats\n"
+ << format("BOLT-INFO: Number of hot functions: %u\n"
+ "BOLT-INFO: Number of clusters: %lu\n",
+ Hotfuncs, Clusters.size())
+ << format("BOLT-INFO: Final average call distance = %.1lf "
+ "(%.0lf / %.0lf)\n",
+ TotalCalls ? TotalDistance / TotalCalls : 0,
+ TotalDistance, TotalCalls)
+ << format("BOLT-INFO: Total Calls = %.0lf\n", TotalCalls);
if (TotalCalls)
- outs() << format("BOLT-INFO: Total Calls within 64B = %.0lf (%.2lf%%)\n",
- TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
- << format("BOLT-INFO: Total Calls within 4KB = %.0lf (%.2lf%%)\n",
- TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
- << format("BOLT-INFO: Total Calls within 2MB = %.0lf (%.2lf%%)\n",
- TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
+ BC.outs()
+ << format("BOLT-INFO: Total Calls within 64B = %.0lf (%.2lf%%)\n",
+ TotalCalls64B, 100 * TotalCalls64B / TotalCalls)
+ << format("BOLT-INFO: Total Calls within 4KB = %.0lf (%.2lf%%)\n",
+ TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls)
+ << format("BOLT-INFO: Total Calls within 2MB = %.0lf (%.2lf%%)\n",
+ TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls);
}
-std::vector<std::string> ReorderFunctions::readFunctionOrderFile() {
- std::vector<std::string> FunctionNames;
+Error ReorderFunctions::readFunctionOrderFile(
+ std::vector<std::string> &FunctionNames) {
std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in);
- if (!FuncsFile) {
- errs() << "Ordered functions file \"" << opts::FunctionOrderFile
- << "\" can't be opened.\n";
- exit(1);
- }
+ if (!FuncsFile)
+ return createFatalBOLTError(Twine("Ordered functions file \"") +
+ Twine(opts::FunctionOrderFile) +
+ Twine("\" can't be opened."));
+
std::string FuncName;
while (std::getline(FuncsFile, FuncName))
FunctionNames.push_back(FuncName);
- return FunctionNames;
+ return Error::success();
}
-void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
+Error ReorderFunctions::runOnFunctions(BinaryContext &BC) {
auto &BFs = BC.getBinaryFunctions();
if (opts::ReorderFunctions != RT_NONE &&
opts::ReorderFunctions != RT_EXEC_COUNT &&
@@ -373,7 +377,11 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
uint32_t Index = 0;
uint32_t InvalidEntries = 0;
- for (const std::string &Function : readFunctionOrderFile()) {
+ std::vector<std::string> FunctionNames;
+ if (Error E = readFunctionOrderFile(FunctionNames))
+ return Error(std::move(E));
+
+ for (const std::string &Function : FunctionNames) {
std::vector<uint64_t> FuncAddrs;
BinaryData *BD = BC.getBinaryDataByName(Function);
@@ -399,8 +407,8 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
if (FuncAddrs.empty()) {
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: Reorder functions: can't find function "
- << "for " << Function << "\n";
+ BC.errs() << "BOLT-WARNING: Reorder functions: can't find function "
+ << "for " << Function << "\n";
++InvalidEntries;
continue;
}
@@ -412,28 +420,28 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol());
if (!BF) {
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: Reorder functions: can't find function "
- << "for " << Function << "\n";
+ BC.errs() << "BOLT-WARNING: Reorder functions: can't find function "
+ << "for " << Function << "\n";
++InvalidEntries;
break;
}
if (!BF->hasValidIndex())
BF->setIndex(Index++);
else if (opts::Verbosity > 0)
- errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function
- << "\n";
+ BC.errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function
+ << "\n";
}
}
if (InvalidEntries)
- errs() << "BOLT-WARNING: Reorder functions: can't find functions for "
- << InvalidEntries << " entries in -function-order list\n";
+ BC.errs() << "BOLT-WARNING: Reorder functions: can't find functions for "
+ << InvalidEntries << " entries in -function-order list\n";
} break;
default:
llvm_unreachable("unexpected layout type");
}
- reorder(std::move(Clusters), BFs);
+ reorder(BC, std::move(Clusters), BFs);
BC.HasFinalizedFunctionOrder = true;
@@ -442,9 +450,9 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
FuncsFile = std::make_unique<std::ofstream>(opts::GenerateFunctionOrderFile,
std::ios::out);
if (!FuncsFile) {
- errs() << "BOLT-ERROR: ordered functions file "
- << opts::GenerateFunctionOrderFile << " cannot be opened\n";
- exit(1);
+ BC.errs() << "BOLT-ERROR: ordered functions file "
+ << opts::GenerateFunctionOrderFile << " cannot be opened\n";
+ return createFatalBOLTError("");
}
}
@@ -453,9 +461,9 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
LinkSectionsFile =
std::make_unique<std::ofstream>(opts::LinkSectionsFile, std::ios::out);
if (!LinkSectionsFile) {
- errs() << "BOLT-ERROR: link sections file " << opts::LinkSectionsFile
- << " cannot be opened\n";
- exit(1);
+ BC.errs() << "BOLT-ERROR: link sections file " << opts::LinkSectionsFile
+ << " cannot be opened\n";
+ return createFatalBOLTError("");
}
}
@@ -505,16 +513,17 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
if (FuncsFile) {
FuncsFile->close();
- outs() << "BOLT-INFO: dumped function order to "
- << opts::GenerateFunctionOrderFile << '\n';
+ BC.outs() << "BOLT-INFO: dumped function order to "
+ << opts::GenerateFunctionOrderFile << '\n';
}
if (LinkSectionsFile) {
LinkSectionsFile->close();
- outs() << "BOLT-INFO: dumped linker section order to "
- << opts::LinkSectionsFile << '\n';
+ BC.outs() << "BOLT-INFO: dumped linker section order to "
+ << opts::LinkSectionsFile << '\n';
}
}
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/RetpolineInsertion.cpp b/bolt/lib/Passes/RetpolineInsertion.cpp
index 97eedb882f55..2808575cc6bb 100644
--- a/bolt/lib/Passes/RetpolineInsertion.cpp
+++ b/bolt/lib/Passes/RetpolineInsertion.cpp
@@ -271,9 +271,9 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) {
}
}
-void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
+Error RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
if (!opts::InsertRetpolines)
- return;
+ return Error::success();
assert(BC.isX86() &&
"retpoline insertion not supported for target architecture");
@@ -327,10 +327,11 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC) {
}
}
}
- outs() << "BOLT-INFO: The number of created retpoline functions is : "
- << CreatedRetpolines.size()
- << "\nBOLT-INFO: The number of retpolined branches is : "
- << RetpolinedBranches << "\n";
+ BC.outs() << "BOLT-INFO: The number of created retpoline functions is : "
+ << CreatedRetpolines.size()
+ << "\nBOLT-INFO: The number of retpolined branches is : "
+ << RetpolinedBranches << "\n";
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp
index d7b25c1279dc..9a1f9d72623a 100644
--- a/bolt/lib/Passes/ShrinkWrapping.cpp
+++ b/bolt/lib/Passes/ShrinkWrapping.cpp
@@ -1646,9 +1646,9 @@ void ShrinkWrapping::rebuildCFIForSP() {
++I;
}
-MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
- const FrameIndexEntry &FIE,
- bool CreatePushOrPop) {
+Expected<MCInst> ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
+ const FrameIndexEntry &FIE,
+ bool CreatePushOrPop) {
MCInst NewInst;
if (SPVal != StackPointerTracking::SUPERPOSITION &&
SPVal != StackPointerTracking::EMPTY) {
@@ -1656,15 +1656,15 @@ MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
if (!BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getStackPointer(),
FIE.StackOffset - SPVal, FIE.RegOrImm,
FIE.Size)) {
- errs() << "createRestoreFromStack: not supported on this platform\n";
- abort();
+ return createFatalBOLTError(
+ "createRestoreFromStack: not supported on this platform\n");
}
} else {
if (!BC.MIB->createSaveToStack(NewInst, BC.MIB->getStackPointer(),
FIE.StackOffset - SPVal, FIE.RegOrImm,
FIE.Size)) {
- errs() << "createSaveToStack: not supported on this platform\n";
- abort();
+ return createFatalBOLTError(
+ "createSaveToStack: not supported on this platform\n");
}
}
if (CreatePushOrPop)
@@ -1678,15 +1678,15 @@ MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
if (!BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getFramePointer(),
FIE.StackOffset - FPVal, FIE.RegOrImm,
FIE.Size)) {
- errs() << "createRestoreFromStack: not supported on this platform\n";
- abort();
+ return createFatalBOLTError(
+ "createRestoreFromStack: not supported on this platform\n");
}
} else {
if (!BC.MIB->createSaveToStack(NewInst, BC.MIB->getFramePointer(),
FIE.StackOffset - FPVal, FIE.RegOrImm,
FIE.Size)) {
- errs() << "createSaveToStack: not supported on this platform\n";
- abort();
+ return createFatalBOLTError(
+ "createSaveToStack: not supported on this platform\n");
}
}
return NewInst;
@@ -1743,10 +1743,11 @@ BBIterTy ShrinkWrapping::insertCFIsForPushOrPop(BinaryBasicBlock &BB,
return Pos;
}
-BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
- BinaryBasicBlock *CurBB,
- const WorklistItem &Item,
- int64_t SPVal, int64_t FPVal) {
+Expected<BBIterTy> ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
+ BinaryBasicBlock *CurBB,
+ const WorklistItem &Item,
+ int64_t SPVal,
+ int64_t FPVal) {
// Trigger CFI reconstruction for this CSR if necessary - writing to
// PushOffsetByReg/PopOffsetByReg *will* trigger CFI update
if ((Item.FIEToInsert.IsStore &&
@@ -1772,9 +1773,12 @@ BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
<< " Is push = " << (Item.Action == WorklistItem::InsertPushOrPop)
<< "\n";
});
- MCInst NewInst =
+ Expected<MCInst> NewInstOrErr =
createStackAccess(SPVal, FPVal, Item.FIEToInsert,
Item.Action == WorklistItem::InsertPushOrPop);
+ if (auto E = NewInstOrErr.takeError())
+ return Error(std::move(E));
+ MCInst &NewInst = *NewInstOrErr;
if (InsertionPoint != CurBB->end()) {
LLVM_DEBUG({
dbgs() << "Adding before Inst: ";
@@ -1791,7 +1795,7 @@ BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
return CurBB->end();
}
-BBIterTy ShrinkWrapping::processInsertionsList(
+Expected<BBIterTy> ShrinkWrapping::processInsertionsList(
BBIterTy InsertionPoint, BinaryBasicBlock *CurBB,
std::vector<WorklistItem> &TodoList, int64_t SPVal, int64_t FPVal) {
bool HasInsertions = llvm::any_of(TodoList, [&](WorklistItem &Item) {
@@ -1840,8 +1844,11 @@ BBIterTy ShrinkWrapping::processInsertionsList(
Item.Action == WorklistItem::ChangeToAdjustment)
continue;
- InsertionPoint =
+ auto InsertionPointOrErr =
processInsertion(InsertionPoint, CurBB, Item, SPVal, FPVal);
+ if (auto E = InsertionPointOrErr.takeError())
+ return Error(std::move(E));
+ InsertionPoint = *InsertionPointOrErr;
if (Item.Action == WorklistItem::InsertPushOrPop &&
Item.FIEToInsert.IsStore)
SPVal -= Item.FIEToInsert.Size;
@@ -1852,7 +1859,7 @@ BBIterTy ShrinkWrapping::processInsertionsList(
return InsertionPoint;
}
-bool ShrinkWrapping::processInsertions() {
+Expected<bool> ShrinkWrapping::processInsertions() {
PredictiveStackPointerTracking PSPT(BF, Todo, Info, AllocatorId);
PSPT.run();
@@ -1875,14 +1882,20 @@ bool ShrinkWrapping::processInsertions() {
auto Iter = I;
std::pair<int, int> SPTState =
*PSPT.getStateAt(Iter == BB.begin() ? (ProgramPoint)&BB : &*(--Iter));
- I = processInsertionsList(I, &BB, List, SPTState.first, SPTState.second);
+ auto IterOrErr =
+ processInsertionsList(I, &BB, List, SPTState.first, SPTState.second);
+ if (auto E = IterOrErr.takeError())
+ return Error(std::move(E));
+ I = *IterOrErr;
}
// Process insertions at the end of bb
auto WRI = Todo.find(&BB);
if (WRI != Todo.end()) {
std::pair<int, int> SPTState = *PSPT.getStateAt(*BB.rbegin());
- processInsertionsList(BB.end(), &BB, WRI->second, SPTState.first,
- SPTState.second);
+ if (auto E = processInsertionsList(BB.end(), &BB, WRI->second,
+ SPTState.first, SPTState.second)
+ .takeError())
+ return Error(std::move(E));
Changes = true;
}
}
@@ -1945,7 +1958,7 @@ void ShrinkWrapping::rebuildCFI() {
}
}
-bool ShrinkWrapping::perform(bool HotOnly) {
+Expected<bool> ShrinkWrapping::perform(bool HotOnly) {
HasDeletedOffsetCFIs = BitVector(BC.MRI->getNumRegs(), false);
PushOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
@@ -1998,7 +2011,11 @@ bool ShrinkWrapping::perform(bool HotOnly) {
});
SLM.performChanges();
// Early exit if processInsertions doesn't detect any todo items
- if (!processInsertions())
+ auto ModifiedOrErr = processInsertions();
+ if (auto E = ModifiedOrErr.takeError())
+ return Error(std::move(E));
+ const bool Modified = *ModifiedOrErr;
+ if (!Modified)
return false;
processDeletions();
if (foldIdenticalSplitEdges()) {
@@ -2018,28 +2035,28 @@ bool ShrinkWrapping::perform(bool HotOnly) {
return true;
}
-void ShrinkWrapping::printStats() {
- outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
- << " spills inserting load/stores and " << SpillsMovedPushPopMode
- << " spills inserting push/pops\n";
+void ShrinkWrapping::printStats(BinaryContext &BC) {
+ BC.outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
+ << " spills inserting load/stores and " << SpillsMovedPushPopMode
+ << " spills inserting push/pops\n";
if (!InstrDynamicCount || !StoreDynamicCount)
return;
- outs() << "BOLT-INFO: Shrink wrapping reduced " << SpillsMovedDynamicCount
- << " store executions ("
- << format("%.1lf%%",
- (100.0 * SpillsMovedDynamicCount / InstrDynamicCount))
- << " total instructions executed, "
- << format("%.1lf%%",
- (100.0 * SpillsMovedDynamicCount / StoreDynamicCount))
- << " store instructions)\n";
- outs() << "BOLT-INFO: Shrink wrapping failed at reducing "
- << SpillsFailedDynamicCount << " store executions ("
- << format("%.1lf%%",
- (100.0 * SpillsFailedDynamicCount / InstrDynamicCount))
- << " total instructions executed, "
- << format("%.1lf%%",
- (100.0 * SpillsFailedDynamicCount / StoreDynamicCount))
- << " store instructions)\n";
+ BC.outs() << "BOLT-INFO: Shrink wrapping reduced " << SpillsMovedDynamicCount
+ << " store executions ("
+ << format("%.1lf%%",
+ (100.0 * SpillsMovedDynamicCount / InstrDynamicCount))
+ << " total instructions executed, "
+ << format("%.1lf%%",
+ (100.0 * SpillsMovedDynamicCount / StoreDynamicCount))
+ << " store instructions)\n";
+ BC.outs() << "BOLT-INFO: Shrink wrapping failed at reducing "
+ << SpillsFailedDynamicCount << " store executions ("
+ << format("%.1lf%%",
+ (100.0 * SpillsFailedDynamicCount / InstrDynamicCount))
+ << " total instructions executed, "
+ << format("%.1lf%%",
+ (100.0 * SpillsFailedDynamicCount / StoreDynamicCount))
+ << " store instructions)\n";
}
// Operators necessary as a result of using MCAnnotation
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 5de075973004..cdbb2a15f667 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -712,15 +712,15 @@ bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
return BinaryFunctionPass::shouldOptimize(BF);
}
-void SplitFunctions::runOnFunctions(BinaryContext &BC) {
+Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
if (!opts::SplitFunctions)
- return;
+ return Error::success();
// If split strategy is not CDSplit, then a second run of the pass is not
// needed after function reordering.
if (BC.HasFinalizedFunctionOrder &&
opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
- return;
+ return Error::success();
std::unique_ptr<SplitStrategy> Strategy;
bool ForceSequential = false;
@@ -766,10 +766,12 @@ void SplitFunctions::runOnFunctions(BinaryContext &BC) {
"SplitFunctions", ForceSequential);
if (SplitBytesHot + SplitBytesCold > 0)
- outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
- << " hot bytes from " << SplitBytesCold << " cold bytes "
- << format("(%.2lf%% of split functions is hot).\n",
- 100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
+ BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
+ << " hot bytes from " << SplitBytesCold << " cold bytes "
+ << format("(%.2lf%% of split functions is hot).\n",
+ 100.0 * SplitBytesHot /
+ (SplitBytesHot + SplitBytesCold));
+ return Error::success();
}
void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
@@ -899,9 +901,9 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
if (opts::Verbosity >= 2) {
- outs() << "BOLT-INFO: Reversing splitting of function "
- << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
- ColdSize, OriginalHotSize);
+ BC.outs() << "BOLT-INFO: Reversing splitting of function "
+ << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
+ ColdSize, OriginalHotSize);
}
// Reverse the action of createEHTrampolines(). The trampolines will be
diff --git a/bolt/lib/Passes/StokeInfo.cpp b/bolt/lib/Passes/StokeInfo.cpp
index 419ba236e134..499cac421709 100644
--- a/bolt/lib/Passes/StokeInfo.cpp
+++ b/bolt/lib/Passes/StokeInfo.cpp
@@ -97,7 +97,8 @@ bool StokeInfo::checkFunction(BinaryFunction &BF, DataflowInfoManager &DInfo,
if (!BF.isSimple() || BF.isMultiEntry() || BF.empty())
return false;
- outs() << " STOKE-INFO: analyzing function " << Name << "\n";
+ BF.getBinaryContext().outs()
+ << " STOKE-INFO: analyzing function " << Name << "\n";
FuncInfo.FuncName = Name;
FuncInfo.Offset = BF.getFileOffset();
@@ -140,19 +141,19 @@ bool StokeInfo::checkFunction(BinaryFunction &BF, DataflowInfoManager &DInfo,
LiveOutBV &= DefaultLiveOutMask;
getRegNameFromBitVec(BF.getBinaryContext(), LiveOutBV, &FuncInfo.LiveOut);
- outs() << " STOKE-INFO: end function \n";
+ BF.getBinaryContext().outs() << " STOKE-INFO: end function \n";
return true;
}
-void StokeInfo::runOnFunctions(BinaryContext &BC) {
- outs() << "STOKE-INFO: begin of stoke pass\n";
+Error StokeInfo::runOnFunctions(BinaryContext &BC) {
+ BC.outs() << "STOKE-INFO: begin of stoke pass\n";
std::ofstream Outfile;
if (!opts::StokeOutputDataFilename.empty()) {
Outfile.open(opts::StokeOutputDataFilename);
} else {
- errs() << "STOKE-INFO: output file is required\n";
- return;
+ BC.errs() << "STOKE-INFO: output file is required\n";
+ return Error::success();
}
// check some context meta data
@@ -185,7 +186,8 @@ void StokeInfo::runOnFunctions(BinaryContext &BC) {
FuncInfo.printData(Outfile);
}
- outs() << "STOKE-INFO: end of stoke pass\n";
+ BC.outs() << "STOKE-INFO: end of stoke pass\n";
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp
index e63d4be1392f..2163e3a6a008 100644
--- a/bolt/lib/Passes/TailDuplication.cpp
+++ b/bolt/lib/Passes/TailDuplication.cpp
@@ -633,9 +633,9 @@ void TailDuplication::runOnFunction(BinaryFunction &Function) {
ModifiedFunctions++;
}
-void TailDuplication::runOnFunctions(BinaryContext &BC) {
+Error TailDuplication::runOnFunctions(BinaryContext &BC) {
if (opts::TailDuplicationMode == TailDuplication::TD_NONE)
- return;
+ return Error::success();
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
@@ -644,23 +644,26 @@ void TailDuplication::runOnFunctions(BinaryContext &BC) {
runOnFunction(Function);
}
- outs() << "BOLT-INFO: tail duplication"
- << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
- 100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
- << format(" duplicated %zu blocks (%zu bytes) responsible for",
- DuplicatedBlockCount, DuplicatedByteCount)
- << format(" %zu dynamic executions (%.2f%% of all block executions)",
- DuplicationsDynamicCount,
- 100.0 * DuplicationsDynamicCount / AllDynamicCount)
- << "\n";
+ BC.outs()
+ << "BOLT-INFO: tail duplication"
+ << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
+ 100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
+ << format(" duplicated %zu blocks (%zu bytes) responsible for",
+ DuplicatedBlockCount, DuplicatedByteCount)
+ << format(" %zu dynamic executions (%.2f%% of all block executions)",
+ DuplicationsDynamicCount,
+ 100.0 * DuplicationsDynamicCount / AllDynamicCount)
+ << "\n";
if (opts::TailDuplicationConstCopyPropagation) {
- outs() << "BOLT-INFO: tail duplication "
- << format("applied %zu static and %zu dynamic propagation deletions",
+ BC.outs() << "BOLT-INFO: tail duplication "
+ << format(
+ "applied %zu static and %zu dynamic propagation deletions",
StaticInstructionDeletionCount,
DynamicInstructionDeletionCount)
- << "\n";
+ << "\n";
}
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/ThreeWayBranch.cpp b/bolt/lib/Passes/ThreeWayBranch.cpp
index dc320d53fb68..c69eac5614b9 100644
--- a/bolt/lib/Passes/ThreeWayBranch.cpp
+++ b/bolt/lib/Passes/ThreeWayBranch.cpp
@@ -147,7 +147,7 @@ void ThreeWayBranch::runOnFunction(BinaryFunction &Function) {
}
}
-void ThreeWayBranch::runOnFunctions(BinaryContext &BC) {
+Error ThreeWayBranch::runOnFunctions(BinaryContext &BC) {
for (auto &It : BC.getBinaryFunctions()) {
BinaryFunction &Function = It.second;
if (!shouldRunOnFunction(Function))
@@ -155,8 +155,9 @@ void ThreeWayBranch::runOnFunctions(BinaryContext &BC) {
runOnFunction(Function);
}
- outs() << "BOLT-INFO: number of three way branches order changed: "
- << BranchesAltered << "\n";
+ BC.outs() << "BOLT-INFO: number of three way branches order changed: "
+ << BranchesAltered << "\n";
+ return Error::success();
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/ValidateInternalCalls.cpp b/bolt/lib/Passes/ValidateInternalCalls.cpp
index 516f91acb508..54ae621159cf 100644
--- a/bolt/lib/Passes/ValidateInternalCalls.cpp
+++ b/bolt/lib/Passes/ValidateInternalCalls.cpp
@@ -302,9 +302,9 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const {
return true;
}
-void ValidateInternalCalls::runOnFunctions(BinaryContext &BC) {
+Error ValidateInternalCalls::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
- return;
+ return Error::success();
// Look for functions that need validation. This should be pretty rare.
std::set<BinaryFunction *> NeedsValidation;
@@ -323,7 +323,7 @@ void ValidateInternalCalls::runOnFunctions(BinaryContext &BC) {
// Skip validation for non-relocation mode
if (!BC.HasRelocations)
- return;
+ return Error::success();
// Since few functions need validation, we can work with our most expensive
// algorithms here. Fix the CFG treating internal calls as unconditional
@@ -339,13 +339,15 @@ void ValidateInternalCalls::runOnFunctions(BinaryContext &BC) {
}
if (!Invalid.empty()) {
- errs() << "BOLT-WARNING: will skip the following function(s) as unsupported"
- " internal calls were detected:\n";
+ BC.errs()
+ << "BOLT-WARNING: will skip the following function(s) as unsupported"
+ " internal calls were detected:\n";
for (BinaryFunction *Function : Invalid) {
- errs() << " " << *Function << "\n";
+ BC.errs() << " " << *Function << "\n";
Function->setIgnored();
}
}
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Passes/ValidateMemRefs.cpp b/bolt/lib/Passes/ValidateMemRefs.cpp
index 3324776830d1..f29a97c43f49 100644
--- a/bolt/lib/Passes/ValidateMemRefs.cpp
+++ b/bolt/lib/Passes/ValidateMemRefs.cpp
@@ -72,13 +72,13 @@ void ValidateMemRefs::runOnFunction(BinaryFunction &BF) {
}
}
-void ValidateMemRefs::runOnFunctions(BinaryContext &BC) {
+Error ValidateMemRefs::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
- return;
+ return Error::success();
// Skip validation if not moving JT
if (opts::JumpTables == JTS_NONE || opts::JumpTables == JTS_BASIC)
- return;
+ return Error::success();
ParallelUtilities::WorkFuncWithAllocTy ProcessFunction =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
@@ -94,10 +94,11 @@ void ValidateMemRefs::runOnFunctions(BinaryContext &BC) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: memrefs validation is concluded\n");
if (!ReplacedReferences)
- return;
+ return Error::success();
- outs() << "BOLT-INFO: validate-mem-refs updated " << ReplacedReferences
- << " object references\n";
+ BC.outs() << "BOLT-INFO: validate-mem-refs updated " << ReplacedReferences
+ << " object references\n";
+ return Error::success();
}
} // namespace llvm::bolt
diff --git a/bolt/lib/Passes/VeneerElimination.cpp b/bolt/lib/Passes/VeneerElimination.cpp
index 929c7360b7ff..0bec11128c7c 100644
--- a/bolt/lib/Passes/VeneerElimination.cpp
+++ b/bolt/lib/Passes/VeneerElimination.cpp
@@ -29,9 +29,9 @@ static llvm::cl::opt<bool>
namespace llvm {
namespace bolt {
-void VeneerElimination::runOnFunctions(BinaryContext &BC) {
+Error VeneerElimination::runOnFunctions(BinaryContext &BC) {
if (!opts::EliminateVeneers || !BC.isAArch64())
- return;
+ return Error::success();
std::map<uint64_t, BinaryFunction> &BFs = BC.getBinaryFunctions();
std::unordered_map<const MCSymbol *, const MCSymbol *> VeneerDestinations;
@@ -51,8 +51,8 @@ void VeneerElimination::runOnFunctions(BinaryContext &BC) {
VeneerDestinations[Symbol] = VeneerTargetSymbol;
}
- outs() << "BOLT-INFO: number of removed linker-inserted veneers: "
- << VeneersCount << "\n";
+ BC.outs() << "BOLT-INFO: number of removed linker-inserted veneers: "
+ << VeneersCount << "\n";
// Handle veneers to veneers in case they occur
for (auto &Entry : VeneerDestinations) {
@@ -79,8 +79,8 @@ void VeneerElimination::runOnFunctions(BinaryContext &BC) {
VeneerCallers++;
if (!BC.MIB->replaceBranchTarget(
Instr, VeneerDestinations[TargetSymbol], BC.Ctx.get())) {
- errs() << "BOLT-ERROR: updating veneer call destination failed\n";
- exit(1);
+ return createFatalBOLTError(
+ "BOLT-ERROR: updating veneer call destination failed\n");
}
}
}
@@ -90,6 +90,7 @@ void VeneerElimination::runOnFunctions(BinaryContext &BC) {
dbgs() << "BOLT-INFO: number of linker-inserted veneers call sites: "
<< VeneerCallers << "\n");
(void)VeneerCallers;
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 6901e482ddfd..8e18b21f696a 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -108,7 +108,7 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
writeMaps</*Cold=*/false>(Maps, PrevAddress, OS);
writeMaps</*Cold=*/true>(Maps, PrevAddress, OS);
- outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
+ BC.outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
}
APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
@@ -201,7 +201,7 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
}
}
-std::error_code BoltAddressTranslation::parse(StringRef Buf) {
+std::error_code BoltAddressTranslation::parse(raw_ostream &OS, StringRef Buf) {
DataExtractor DE = DataExtractor(Buf, true, 8);
uint64_t Offset = 0;
if (Buf.size() < 12)
@@ -225,7 +225,7 @@ std::error_code BoltAddressTranslation::parse(StringRef Buf) {
uint64_t PrevAddress = 0;
parseMaps</*Cold=*/false>(HotFuncs, PrevAddress, DE, Offset, Err);
parseMaps</*Cold=*/true>(HotFuncs, PrevAddress, DE, Offset, Err);
- outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
+ OS << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
return errorToErrorCode(std::move(Err));
}
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 9946608c96d8..489b33fe1c7c 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -268,7 +268,7 @@ const char BinaryFunctionPassManager::TimerGroupName[] = "passman";
const char BinaryFunctionPassManager::TimerGroupDesc[] =
"Binary Function Pass Manager";
-void BinaryFunctionPassManager::runPasses() {
+Error BinaryFunctionPassManager::runPasses() {
auto &BFs = BC.getBinaryFunctions();
for (size_t PassIdx = 0; PassIdx < Passes.size(); PassIdx++) {
const std::pair<const bool, std::unique_ptr<BinaryFunctionPass>>
@@ -281,13 +281,20 @@ void BinaryFunctionPassManager::runPasses() {
formatv("{0:2}_{1}", PassIdx, Pass->getName()).str();
if (opts::Verbosity > 0)
- outs() << "BOLT-INFO: Starting pass: " << Pass->getName() << "\n";
+ BC.outs() << "BOLT-INFO: Starting pass: " << Pass->getName() << "\n";
NamedRegionTimer T(Pass->getName(), Pass->getName(), TimerGroupName,
TimerGroupDesc, TimeOpts);
- callWithDynoStats([this, &Pass] { Pass->runOnFunctions(BC); }, BFs,
- Pass->getName(), opts::DynoStatsAll, BC.isAArch64());
+ Error E = Error::success();
+ callWithDynoStats(
+ BC.outs(),
+ [this, &E, &Pass] {
+ E = joinErrors(std::move(E), Pass->runOnFunctions(BC));
+ },
+ BFs, Pass->getName(), opts::DynoStatsAll, BC.isAArch64());
+ if (E)
+ return Error(std::move(E));
if (opts::VerifyCFG &&
!std::accumulate(
@@ -296,13 +303,13 @@ void BinaryFunctionPassManager::runPasses() {
const std::pair<const uint64_t, BinaryFunction> &It) {
return Valid && It.second.validateCFG();
})) {
- errs() << "BOLT-ERROR: Invalid CFG detected after pass "
- << Pass->getName() << "\n";
- exit(1);
+ return createFatalBOLTError(
+ Twine("BOLT-ERROR: Invalid CFG detected after pass ") +
+ Twine(Pass->getName()) + Twine("\n"));
}
if (opts::Verbosity > 0)
- outs() << "BOLT-INFO: Finished pass: " << Pass->getName() << "\n";
+ BC.outs() << "BOLT-INFO: Finished pass: " << Pass->getName() << "\n";
if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass())
continue;
@@ -315,15 +322,16 @@ void BinaryFunctionPassManager::runPasses() {
if (!Pass->shouldPrint(Function))
continue;
- Function.print(outs(), Message);
+ Function.print(BC.outs(), Message);
if (opts::DumpDotAll)
Function.dumpGraphForPass(PassIdName);
}
}
+ return Error::success();
}
-void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
+Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
BinaryFunctionPassManager Manager(BC);
const DynoStats InitialDynoStats =
@@ -516,7 +524,7 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
// in parallel and restore them
Manager.registerPass(std::make_unique<CleanMCState>(NeverPrint));
- Manager.runPasses();
+ return Manager.runPasses();
}
} // namespace bolt
diff --git a/bolt/lib/Rewrite/BoltDiff.cpp b/bolt/lib/Rewrite/BoltDiff.cpp
index 16a90510962e..fa43b7a2f92c 100644
--- a/bolt/lib/Rewrite/BoltDiff.cpp
+++ b/bolt/lib/Rewrite/BoltDiff.cpp
@@ -294,9 +294,9 @@ class RewriteInstanceDiff {
}
PrintProgramStats PPS(opts::NeverPrint);
outs() << "* BOLT-DIFF: Starting print program stats pass for binary 1\n";
- PPS.runOnFunctions(*RI1.BC);
+ RI1.BC->logBOLTErrorsAndQuitOnFatal(PPS.runOnFunctions(*RI1.BC));
outs() << "* BOLT-DIFF: Starting print program stats pass for binary 2\n";
- PPS.runOnFunctions(*RI2.BC);
+ RI1.BC->logBOLTErrorsAndQuitOnFatal(PPS.runOnFunctions(*RI2.BC));
outs() << "=====\n";
outs() << "Inputs share " << BothHaveProfile
<< " functions with valid profile.\n";
@@ -700,9 +700,9 @@ void RewriteInstance::compare(RewriteInstance &RI2) {
if (opts::ICF) {
IdenticalCodeFolding ICF(opts::NeverPrint);
outs() << "BOLT-DIFF: Starting ICF pass for binary 1";
- ICF.runOnFunctions(*BC);
+ BC->logBOLTErrorsAndQuitOnFatal(ICF.runOnFunctions(*BC));
outs() << "BOLT-DIFF: Starting ICF pass for binary 2";
- ICF.runOnFunctions(*RI2.BC);
+ BC->logBOLTErrorsAndQuitOnFatal(ICF.runOnFunctions(*RI2.BC));
}
RewriteInstanceDiff RID(*this, RI2);
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index cefccbdfa2f0..27fa937c7508 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -709,7 +709,7 @@ void DWARFRewriter::updateDebugInfo() {
: LegacyRangesSectionWriter.get();
// Skipping CUs that failed to load.
if (SplitCU) {
- DIEBuilder DWODIEBuilder(&(*SplitCU)->getContext(), true);
+ DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), true);
DWODIEBuilder.buildDWOUnit(**SplitCU);
std::string DWOName = updateDWONameCompDir(
*Unit, *DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
@@ -754,7 +754,7 @@ void DWARFRewriter::updateDebugInfo() {
AddrWriter->update(*DIEBlder, *Unit);
};
- DIEBuilder DIEBlder(BC.DwCtx.get());
+ DIEBuilder DIEBlder(BC, BC.DwCtx.get());
DIEBlder.buildTypeUnits(StrOffstsWriter.get());
SmallVector<char, 20> OutBuffer;
std::unique_ptr<raw_svector_ostream> ObjOS =
@@ -1655,7 +1655,8 @@ createDwarfOnlyBC(const object::ObjectFile &File) {
&File, false,
DWARFContext::create(File, DWARFContext::ProcessDebugRelocations::Ignore,
nullptr, "", WithColor::defaultErrorHandler,
- WithColor::defaultWarningHandler)));
+ WithColor::defaultWarningHandler),
+ {llvm::outs(), llvm::errs()}));
}
StringMap<DWARFRewriter::KnownSectionsEntry>
diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp
index 8be8257f15c1..0970a0507ebe 100644
--- a/bolt/lib/Rewrite/MachORewriteInstance.cpp
+++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp
@@ -103,7 +103,8 @@ MachORewriteInstance::MachORewriteInstance(object::MachOObjectFile *InputFile,
: InputFile(InputFile), ToolPath(ToolPath) {
ErrorAsOutParameter EAO(&Err);
auto BCOrErr = BinaryContext::createBinaryContext(
- InputFile, /* IsPIC */ true, DWARFContext::create(*InputFile));
+ InputFile, /* IsPIC */ true, DWARFContext::create(*InputFile),
+ {llvm::outs(), llvm::errs()});
if (Error E = BCOrErr.takeError()) {
Err = std::move(E);
return;
@@ -337,7 +338,7 @@ void MachORewriteInstance::disassembleFunctions() {
BinaryFunction &Function = BFI.second;
if (!Function.isSimple())
continue;
- Function.disassemble();
+ BC->logBOLTErrorsAndQuitOnFatal(Function.disassemble());
if (opts::PrintDisasm)
Function.print(outs(), "after disassembly");
}
@@ -348,10 +349,7 @@ void MachORewriteInstance::buildFunctionsCFG() {
BinaryFunction &Function = BFI.second;
if (!Function.isSimple())
continue;
- if (!Function.buildCFG(/*AllocId*/ 0)) {
- errs() << "BOLT-WARNING: failed to build CFG for the function "
- << Function << "\n";
- }
+ BC->logBOLTErrorsAndQuitOnFatal(Function.buildCFG(/*AllocId*/ 0));
}
}
@@ -387,7 +385,7 @@ void MachORewriteInstance::runOptimizationPasses() {
Manager.registerPass(
std::make_unique<FinalizeFunctions>(opts::PrintFinalized));
- Manager.runPasses();
+ BC->logBOLTErrorsAndQuitOnFatal(Manager.runPasses());
}
void MachORewriteInstance::mapInstrumentationSection(
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 9a242d94dd77..89ca13c427dc 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -309,9 +309,11 @@ bool refersToReorderedSection(ErrorOr<BinarySection &> Section) {
Expected<std::unique_ptr<RewriteInstance>>
RewriteInstance::create(ELFObjectFileBase *File, const int Argc,
- const char *const *Argv, StringRef ToolPath) {
+ const char *const *Argv, StringRef ToolPath,
+ raw_ostream &Stdout, raw_ostream &Stderr) {
Error Err = Error::success();
- auto RI = std::make_unique<RewriteInstance>(File, Argc, Argv, ToolPath, Err);
+ auto RI = std::make_unique<RewriteInstance>(File, Argc, Argv, ToolPath,
+ Stdout, Stderr, Err);
if (Err)
return std::move(Err);
return std::move(RI);
@@ -319,6 +321,7 @@ RewriteInstance::create(ELFObjectFileBase *File, const int Argc,
RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const int Argc,
const char *const *Argv, StringRef ToolPath,
+ raw_ostream &Stdout, raw_ostream &Stderr,
Error &Err)
: InputFile(File), Argc(Argc), Argv(Argv), ToolPath(ToolPath),
SHStrTab(StringTableBuilder::ELF) {
@@ -333,17 +336,23 @@ RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const int Argc,
bool IsPIC = false;
const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
if (Obj.getHeader().e_type != ELF::ET_EXEC) {
- outs() << "BOLT-INFO: shared object or position-independent executable "
+ Stdout << "BOLT-INFO: shared object or position-independent executable "
"detected\n";
IsPIC = true;
}
+ // Make sure we don't miss any output on core dumps.
+ Stdout.SetUnbuffered();
+ Stderr.SetUnbuffered();
+ LLVM_DEBUG(dbgs().SetUnbuffered());
+
auto BCOrErr = BinaryContext::createBinaryContext(
File, IsPIC,
DWARFContext::create(*File, DWARFContext::ProcessDebugRelocations::Ignore,
nullptr, opts::DWPPathName,
WithColor::defaultErrorHandler,
- WithColor::defaultWarningHandler));
+ WithColor::defaultWarningHandler),
+ JournalingStreams{Stdout, Stderr});
if (Error E = BCOrErr.takeError()) {
Err = std::move(E);
return;
@@ -457,22 +466,23 @@ void RewriteInstance::markGnuRelroSections() {
bool VMAContains = checkVMA<ELFT>(Phdr, *Sec, VMAOverlap);
if (ImageOverlap) {
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: GNU_RELRO segment has partial file offset "
- << "overlap with section " << BinarySection->getName() << '\n';
+ BC->errs() << "BOLT-WARNING: GNU_RELRO segment has partial file offset "
+ << "overlap with section " << BinarySection->getName()
+ << '\n';
return;
}
if (VMAOverlap) {
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: GNU_RELRO segment has partial VMA overlap "
- << "with section " << BinarySection->getName() << '\n';
+ BC->errs() << "BOLT-WARNING: GNU_RELRO segment has partial VMA overlap "
+ << "with section " << BinarySection->getName() << '\n';
return;
}
if (!ImageContains || !VMAContains)
return;
BinarySection->setRelro();
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: marking " << BinarySection->getName()
- << " as GNU_RELRO\n";
+ BC->outs() << "BOLT-INFO: marking " << BinarySection->getName()
+ << " as GNU_RELRO\n";
};
for (const ELFT::Phdr &Phdr : cantFail(Obj.program_headers()))
@@ -523,7 +533,7 @@ Error RewriteInstance::discoverStorage() {
}
if (BC->IsLinuxKernel)
- outs() << "BOLT-INFO: Linux kernel binary detected\n";
+ BC->outs() << "BOLT-INFO: Linux kernel binary detected\n";
for (const SectionRef &Section : InputFile->sections()) {
Expected<StringRef> SectionNameOrErr = Section.getName();
@@ -555,8 +565,8 @@ Error RewriteInstance::discoverStorage() {
return createStringError(errc::executable_format_error,
"no PT_LOAD pheader seen");
- outs() << "BOLT-INFO: first alloc address is 0x"
- << Twine::utohexstr(BC->FirstAllocAddress) << '\n';
+ BC->outs() << "BOLT-INFO: first alloc address is 0x"
+ << Twine::utohexstr(BC->FirstAllocAddress) << '\n';
FirstNonAllocatableOffset = NextAvailableOffset;
@@ -588,9 +598,9 @@ Error RewriteInstance::discoverStorage() {
NextAvailableAddress - BC->FirstAllocAddress &&
"PHDR table address calculation error");
- outs() << "BOLT-INFO: creating new program header table at address 0x"
- << Twine::utohexstr(NextAvailableAddress) << ", offset 0x"
- << Twine::utohexstr(NextAvailableOffset) << '\n';
+ BC->outs() << "BOLT-INFO: creating new program header table at address 0x"
+ << Twine::utohexstr(NextAvailableAddress) << ", offset 0x"
+ << Twine::utohexstr(NextAvailableOffset) << '\n';
PHDRTableAddress = NextAvailableAddress;
PHDRTableOffset = NextAvailableOffset;
@@ -685,7 +695,8 @@ void RewriteInstance::patchBuildID() {
uint64_t FileOffset = getFileOffsetForAddress(BuildIDSection->getAddress());
if (!FileOffset) {
- errs() << "BOLT-WARNING: Non-allocatable build-id will not be updated.\n";
+ BC->errs()
+ << "BOLT-WARNING: Non-allocatable build-id will not be updated.\n";
return;
}
@@ -693,17 +704,17 @@ void RewriteInstance::patchBuildID() {
LastIDByte ^= 1;
OS.pwrite(&LastIDByte, 1, FileOffset + IDOffset + BuildID.size() - 1);
- outs() << "BOLT-INFO: patched build-id (flipped last bit)\n";
+ BC->outs() << "BOLT-INFO: patched build-id (flipped last bit)\n";
}
Error RewriteInstance::run() {
assert(BC && "failed to create a binary context");
- outs() << "BOLT-INFO: Target architecture: "
- << Triple::getArchTypeName(
- (llvm::Triple::ArchType)InputFile->getArch())
- << "\n";
- outs() << "BOLT-INFO: BOLT version: " << BoltRevision << "\n";
+ BC->outs() << "BOLT-INFO: Target architecture: "
+ << Triple::getArchTypeName(
+ (llvm::Triple::ArchType)InputFile->getArch())
+ << "\n";
+ BC->outs() << "BOLT-INFO: BOLT version: " << BoltRevision << "\n";
if (Error E = discoverStorage())
return E;
@@ -758,10 +769,10 @@ Error RewriteInstance::run() {
updateRtFiniReloc();
if (opts::OutputFilename == "/dev/null") {
- outs() << "BOLT-INFO: skipping writing final binary to disk\n";
+ BC->outs() << "BOLT-INFO: skipping writing final binary to disk\n";
return Error::success();
} else if (BC->IsLinuxKernel) {
- errs() << "BOLT-WARNING: Linux kernel support is experimental\n";
+ BC->errs() << "BOLT-WARNING: Linux kernel support is experimental\n";
}
// Rewrite allocatable contents and copy non-allocatable parts with mods.
@@ -786,13 +797,15 @@ void RewriteInstance::discoverFileObjects() {
for (const ELFSymbolRef &Symbol : InputFile->symbols()) {
Expected<StringRef> NameOrError = Symbol.getName();
if (NameOrError && NameOrError->starts_with("__asan_init")) {
- errs() << "BOLT-ERROR: input file was compiled or linked with sanitizer "
- "support. Cannot optimize.\n";
+ BC->errs()
+ << "BOLT-ERROR: input file was compiled or linked with sanitizer "
+ "support. Cannot optimize.\n";
exit(1);
}
if (NameOrError && NameOrError->starts_with("__llvm_coverage_mapping")) {
- errs() << "BOLT-ERROR: input file was compiled or linked with coverage "
- "support. Cannot optimize.\n";
+ BC->errs()
+ << "BOLT-ERROR: input file was compiled or linked with coverage "
+ "support. Cannot optimize.\n";
exit(1);
}
@@ -924,7 +937,7 @@ void RewriteInstance::discoverFileObjects() {
StringRef SymName = cantFail(Symbol.getName(), "cannot get symbol name");
if (SymbolAddress == 0) {
if (opts::Verbosity >= 1 && SymbolType == SymbolRef::ST_Function)
- errs() << "BOLT-WARNING: function with 0 address seen\n";
+ BC->errs() << "BOLT-WARNING: function with 0 address seen\n";
continue;
}
@@ -965,13 +978,13 @@ void RewriteInstance::discoverFileObjects() {
if (BD->getSize() == ELFSymbolRef(Symbol).getSize() &&
BD->getAddress() == SymbolAddress) {
if (opts::Verbosity > 1)
- errs() << "BOLT-WARNING: ignoring duplicate global symbol " << Name
- << "\n";
+ BC->errs() << "BOLT-WARNING: ignoring duplicate global symbol "
+ << Name << "\n";
// Ignore duplicate entry - possibly a bug in the linker
continue;
}
- errs() << "BOLT-ERROR: bad input binary, global symbol \"" << Name
- << "\" is not unique\n";
+ BC->errs() << "BOLT-ERROR: bad input binary, global symbol \"" << Name
+ << "\" is not unique\n";
exit(1);
}
UniqueName = Name;
@@ -1059,9 +1072,9 @@ void RewriteInstance::discoverFileObjects() {
!SymbolSize) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ignoring symbol as a marker\n");
} else if (opts::Verbosity > 1) {
- errs() << "BOLT-WARNING: symbol " << UniqueName
- << " seen in the middle of function " << *PreviousFunction
- << ". Could be a new entry.\n";
+ BC->errs() << "BOLT-WARNING: symbol " << UniqueName
+ << " seen in the middle of function " << *PreviousFunction
+ << ". Could be a new entry.\n";
}
registerName(SymbolSize);
continue;
@@ -1077,12 +1090,14 @@ void RewriteInstance::discoverFileObjects() {
PreviousFunction->getAddress() != SymbolAddress) {
if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: skipping possibly another entry for function "
- << *PreviousFunction << " : " << UniqueName << '\n';
+ BC->outs()
+ << "BOLT-INFO: skipping possibly another entry for function "
+ << *PreviousFunction << " : " << UniqueName << '\n';
registerName(SymbolSize);
} else {
- outs() << "BOLT-INFO: using " << UniqueName << " as another entry to "
- << "function " << *PreviousFunction << '\n';
+ BC->outs() << "BOLT-INFO: using " << UniqueName
+ << " as another entry to "
+ << "function " << *PreviousFunction << '\n';
registerName(0);
@@ -1114,20 +1129,21 @@ void RewriteInstance::discoverFileObjects() {
uint64_t PrevLength = PrevFDE.getAddressRange();
if (SymbolAddress > PrevStart &&
SymbolAddress < PrevStart + PrevLength) {
- errs() << "BOLT-ERROR: function " << UniqueName
- << " is in conflict with FDE ["
- << Twine::utohexstr(PrevStart) << ", "
- << Twine::utohexstr(PrevStart + PrevLength)
- << "). Skipping.\n";
+ BC->errs() << "BOLT-ERROR: function " << UniqueName
+ << " is in conflict with FDE ["
+ << Twine::utohexstr(PrevStart) << ", "
+ << Twine::utohexstr(PrevStart + PrevLength)
+ << "). Skipping.\n";
IsSimple = false;
}
}
} else if (FDE.getAddressRange() != SymbolSize) {
if (SymbolSize) {
// Function addresses match but sizes differ.
- errs() << "BOLT-WARNING: sizes differ for function " << UniqueName
- << ". FDE : " << FDE.getAddressRange()
- << "; symbol table : " << SymbolSize << ". Using max size.\n";
+ BC->errs() << "BOLT-WARNING: sizes differ for function " << UniqueName
+ << ". FDE : " << FDE.getAddressRange()
+ << "; symbol table : " << SymbolSize
+ << ". Using max size.\n";
}
SymbolSize = std::max(SymbolSize, FDE.getAddressRange());
if (BC->getBinaryDataAtAddress(SymbolAddress)) {
@@ -1151,10 +1167,11 @@ void RewriteInstance::discoverFileObjects() {
if (SymbolSize != BF->getSize()) {
if (opts::Verbosity >= 1) {
if (SymbolSize && BF->getSize())
- errs() << "BOLT-WARNING: size mismatch for duplicate entries "
- << *BF << " and " << UniqueName << '\n';
- outs() << "BOLT-INFO: adjusting size of function " << *BF << " old "
- << BF->getSize() << " new " << SymbolSize << "\n";
+ BC->errs() << "BOLT-WARNING: size mismatch for duplicate entries "
+ << *BF << " and " << UniqueName << '\n';
+ BC->outs() << "BOLT-INFO: adjusting size of function " << *BF
+ << " old " << BF->getSize() << " new " << SymbolSize
+ << "\n";
}
BF->setSize(std::max(SymbolSize, BF->getSize()));
BC->setBinaryDataSize(SymbolAddress, BF->getSize());
@@ -1165,9 +1182,9 @@ void RewriteInstance::discoverFileObjects() {
BC->getSectionForAddress(SymbolAddress);
// Skip symbols from invalid sections
if (!Section) {
- errs() << "BOLT-WARNING: " << UniqueName << " (0x"
- << Twine::utohexstr(SymbolAddress)
- << ") does not have any section\n";
+ BC->errs() << "BOLT-WARNING: " << UniqueName << " (0x"
+ << Twine::utohexstr(SymbolAddress)
+ << ") does not have any section\n";
continue;
}
@@ -1186,12 +1203,12 @@ void RewriteInstance::discoverFileObjects() {
static bool PrintedWarning = false;
if (!PrintedWarning) {
PrintedWarning = true;
- errs() << "BOLT-WARNING: split function detected on input : "
- << SymName;
+ BC->errs() << "BOLT-WARNING: split function detected on input : "
+ << SymName;
if (BC->HasRelocations)
- errs() << ". The support is limited in relocation mode\n";
+ BC->errs() << ". The support is limited in relocation mode\n";
else
- errs() << '\n';
+ BC->errs() << '\n';
}
BC->HasSplitFunctions = true;
BF->IsFragment = true;
@@ -1222,16 +1239,16 @@ void RewriteInstance::discoverFileObjects() {
BF = BC->getBinaryFunctionContainingAddress(Address);
if (BF) {
- errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x"
- << Twine::utohexstr(Address + FDE->getAddressRange())
- << ") conflicts with function " << *BF << '\n';
+ BC->errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address)
+ << ", 0x" << Twine::utohexstr(Address + FDE->getAddressRange())
+ << ") conflicts with function " << *BF << '\n';
continue;
}
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x"
- << Twine::utohexstr(Address + FDE->getAddressRange())
- << ") has no corresponding symbol table entry\n";
+ BC->errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address)
+ << ", 0x" << Twine::utohexstr(Address + FDE->getAddressRange())
+ << ") has no corresponding symbol table entry\n";
ErrorOr<BinarySection &> Section = BC->getSectionForAddress(Address);
assert(Section && "cannot get section for address from FDE");
@@ -1283,7 +1300,8 @@ void RewriteInstance::discoverFileObjects() {
/*UseMaxSize*/ true);
if (BF) {
assert(Rel.isRelative() && "Expected relative relocation for island");
- BF->markIslandDynamicRelocationAtAddress(RelAddress);
+ BC->logBOLTErrorsAndQuitOnFatal(
+ BF->markIslandDynamicRelocationAtAddress(RelAddress));
}
}
}
@@ -1395,23 +1413,24 @@ void RewriteInstance::registerFragments() {
}
if (!BD) {
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: parent function not found for " << Name << "\n";
+ BC->outs() << "BOLT-INFO: parent function not found for " << Name
+ << "\n";
continue;
}
const uint64_t Address = BD->getAddress();
BinaryFunction *BF = BC->getBinaryFunctionAtAddress(Address);
if (!BF) {
if (opts::Verbosity >= 1)
- outs() << formatv("BOLT-INFO: parent function not found at {0:x}\n",
- Address);
+ BC->outs() << formatv(
+ "BOLT-INFO: parent function not found at {0:x}\n", Address);
continue;
}
BC->registerFragment(Function, *BF);
++ParentsFound;
}
if (!ParentsFound) {
- errs() << "BOLT-ERROR: parent function not found for " << Function
- << '\n';
+ BC->errs() << "BOLT-ERROR: parent function not found for " << Function
+ << '\n';
exit(1);
}
}
@@ -1449,7 +1468,7 @@ void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress,
// IFUNC trampoline without symbol
BinaryFunction *TargetBF = BC->getBinaryFunctionAtAddress(Rel->Addend);
if (!TargetBF) {
- errs()
+ BC->errs()
<< "BOLT-WARNING: Expected BF to be presented as IFUNC resolver at "
<< Twine::utohexstr(Rel->Addend) << ", skipping\n";
return;
@@ -1483,8 +1502,9 @@ void RewriteInstance::disassemblePLTInstruction(const BinarySection &Section,
if (!BC->DisAsm->getInstruction(Instruction, InstrSize,
PLTData.slice(InstrOffset), InstrAddr,
nulls())) {
- errs() << "BOLT-ERROR: unable to disassemble instruction in PLT section "
- << Section.getName() << formatv(" at offset {0:x}\n", InstrOffset);
+ BC->errs()
+ << "BOLT-ERROR: unable to disassemble instruction in PLT section "
+ << Section.getName() << formatv(" at offset {0:x}\n", InstrOffset);
exit(1);
}
}
@@ -1546,9 +1566,10 @@ void RewriteInstance::disassemblePLTSectionRISCV(BinarySection &Section) {
if (!BC->DisAsm->getInstruction(Instruction, InstrSize,
PLTData.slice(InstrOffset), InstrAddr,
nulls())) {
- errs() << "BOLT-ERROR: unable to disassemble instruction in PLT section "
- << Section.getName() << " at offset 0x"
- << Twine::utohexstr(InstrOffset) << '\n';
+ BC->errs()
+ << "BOLT-ERROR: unable to disassemble instruction in PLT section "
+ << Section.getName() << " at offset 0x"
+ << Twine::utohexstr(InstrOffset) << '\n';
exit(1);
}
};
@@ -1606,8 +1627,8 @@ void RewriteInstance::disassemblePLTSectionX86(BinarySection &Section,
if (!BC->MIB->evaluateMemOperandTarget(Instruction, TargetAddress,
SectionAddress + InstrOffset,
InstrSize)) {
- errs() << "BOLT-ERROR: error evaluating PLT instruction at offset 0x"
- << Twine::utohexstr(SectionAddress + InstrOffset) << '\n';
+ BC->errs() << "BOLT-ERROR: error evaluating PLT instruction at offset 0x"
+ << Twine::utohexstr(SectionAddress + InstrOffset) << '\n';
exit(1);
}
@@ -1714,8 +1735,8 @@ void RewriteInstance::adjustFunctionBoundaries() {
const uint64_t MaxSize = NextObjectAddress - Function.getAddress();
if (MaxSize < Function.getSize()) {
- errs() << "BOLT-ERROR: symbol seen in the middle of the function "
- << Function << ". Skipping.\n";
+ BC->errs() << "BOLT-ERROR: symbol seen in the middle of the function "
+ << Function << ". Skipping.\n";
Function.setSimple(false);
Function.setMaxSize(Function.getSize());
continue;
@@ -1725,8 +1746,8 @@ void RewriteInstance::adjustFunctionBoundaries() {
// Some assembly functions have their size set to 0, use the max
// size as their real size.
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: setting size of function " << Function << " to "
- << Function.getMaxSize() << " (was 0)\n";
+ BC->outs() << "BOLT-INFO: setting size of function " << Function
+ << " to " << Function.getMaxSize() << " (was 0)\n";
Function.setSize(Function.getMaxSize());
}
}
@@ -1812,8 +1833,8 @@ Error RewriteInstance::readSpecialSections() {
markGnuRelroSections();
if (HasDebugInfo && !opts::UpdateDebugSections && !opts::AggregateOnly) {
- errs() << "BOLT-WARNING: debug info will be stripped from the binary. "
- "Use -update-debug-sections to keep it.\n";
+ BC->errs() << "BOLT-WARNING: debug info will be stripped from the binary. "
+ "Use -update-debug-sections to keep it.\n";
}
HasTextRelocations = (bool)BC->getUniqueSectionByName(".rela.text");
@@ -1825,22 +1846,23 @@ Error RewriteInstance::readSpecialSections() {
BC->getUniqueSectionByName(BoltAddressTranslation::SECTION_NAME)) {
// Do not read BAT when plotting a heatmap
if (!opts::HeatmapMode) {
- if (std::error_code EC = BAT->parse(BATSec->getContents())) {
- errs() << "BOLT-ERROR: failed to parse BOLT address translation "
- "table.\n";
+ if (std::error_code EC = BAT->parse(BC->outs(), BATSec->getContents())) {
+ BC->errs() << "BOLT-ERROR: failed to parse BOLT address translation "
+ "table.\n";
exit(1);
}
}
}
if (opts::PrintSections) {
- outs() << "BOLT-INFO: Sections from original binary:\n";
- BC->printSections(outs());
+ BC->outs() << "BOLT-INFO: Sections from original binary:\n";
+ BC->printSections(BC->outs());
}
if (opts::RelocationMode == cl::BOU_TRUE && !HasTextRelocations) {
- errs() << "BOLT-ERROR: relocations against code are missing from the input "
- "file. Cannot proceed in relocations mode (-relocs).\n";
+ BC->errs()
+ << "BOLT-ERROR: relocations against code are missing from the input "
+ "file. Cannot proceed in relocations mode (-relocs).\n";
exit(1);
}
@@ -1848,15 +1870,16 @@ Error RewriteInstance::readSpecialSections() {
HasTextRelocations && (opts::RelocationMode != cl::BOU_FALSE);
if (BC->IsLinuxKernel && BC->HasRelocations) {
- outs() << "BOLT-INFO: disabling relocation mode for Linux kernel\n";
+ BC->outs() << "BOLT-INFO: disabling relocation mode for Linux kernel\n";
BC->HasRelocations = false;
}
BC->IsStripped = !HasSymbolTable;
if (BC->IsStripped && !opts::AllowStripped) {
- errs() << "BOLT-ERROR: stripped binaries are not supported. If you know "
- "what you're doing, use --allow-stripped to proceed";
+ BC->errs()
+ << "BOLT-ERROR: stripped binaries are not supported. If you know "
+ "what you're doing, use --allow-stripped to proceed";
exit(1);
}
@@ -1865,14 +1888,14 @@ Error RewriteInstance::readSpecialSections() {
BC->HasRelocations = false;
if (BC->HasRelocations)
- outs() << "BOLT-INFO: enabling " << (opts::StrictMode ? "strict " : "")
- << "relocation mode\n";
+ BC->outs() << "BOLT-INFO: enabling " << (opts::StrictMode ? "strict " : "")
+ << "relocation mode\n";
// Read EH frame for function boundaries info.
Expected<const DWARFDebugFrame *> EHFrameOrError = BC->DwCtx->getEHFrame();
if (!EHFrameOrError)
report_error("expected valid eh_frame section", EHFrameOrError.takeError());
- CFIRdWrt.reset(new CFIReaderWriter(*EHFrameOrError.get()));
+ CFIRdWrt.reset(new CFIReaderWriter(*BC, *EHFrameOrError.get()));
// Parse build-id
parseBuildID();
@@ -1885,63 +1908,68 @@ Error RewriteInstance::readSpecialSections() {
void RewriteInstance::adjustCommandLineOptions() {
if (BC->isAArch64() && !BC->HasRelocations)
- errs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully "
- "supported\n";
+ BC->errs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully "
+ "supported\n";
if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary())
RtLibrary->adjustCommandLineOptions(*BC);
if (opts::AlignMacroOpFusion != MFT_NONE && !BC->isX86()) {
- outs() << "BOLT-INFO: disabling -align-macro-fusion on non-x86 platform\n";
+ BC->outs()
+ << "BOLT-INFO: disabling -align-macro-fusion on non-x86 platform\n";
opts::AlignMacroOpFusion = MFT_NONE;
}
if (BC->isX86() && BC->MAB->allowAutoPadding()) {
if (!BC->HasRelocations) {
- errs() << "BOLT-ERROR: cannot apply mitigations for Intel JCC erratum in "
- "non-relocation mode\n";
+ BC->errs()
+ << "BOLT-ERROR: cannot apply mitigations for Intel JCC erratum in "
+ "non-relocation mode\n";
exit(1);
}
- outs() << "BOLT-WARNING: using mitigation for Intel JCC erratum, layout "
- "may take several minutes\n";
+ BC->outs()
+ << "BOLT-WARNING: using mitigation for Intel JCC erratum, layout "
+ "may take several minutes\n";
opts::AlignMacroOpFusion = MFT_NONE;
}
if (opts::AlignMacroOpFusion != MFT_NONE && !BC->HasRelocations) {
- outs() << "BOLT-INFO: disabling -align-macro-fusion in non-relocation "
- "mode\n";
+ BC->outs() << "BOLT-INFO: disabling -align-macro-fusion in non-relocation "
+ "mode\n";
opts::AlignMacroOpFusion = MFT_NONE;
}
if (opts::SplitEH && !BC->HasRelocations) {
- errs() << "BOLT-WARNING: disabling -split-eh in non-relocation mode\n";
+ BC->errs() << "BOLT-WARNING: disabling -split-eh in non-relocation mode\n";
opts::SplitEH = false;
}
if (opts::StrictMode && !BC->HasRelocations) {
- errs() << "BOLT-WARNING: disabling strict mode (-strict) in non-relocation "
- "mode\n";
+ BC->errs()
+ << "BOLT-WARNING: disabling strict mode (-strict) in non-relocation "
+ "mode\n";
opts::StrictMode = false;
}
if (BC->HasRelocations && opts::AggregateOnly &&
!opts::StrictMode.getNumOccurrences()) {
- outs() << "BOLT-INFO: enabling strict relocation mode for aggregation "
- "purposes\n";
+ BC->outs() << "BOLT-INFO: enabling strict relocation mode for aggregation "
+ "purposes\n";
opts::StrictMode = true;
}
if (BC->isX86() && BC->HasRelocations &&
opts::AlignMacroOpFusion == MFT_HOT && !ProfileReader) {
- outs() << "BOLT-INFO: enabling -align-macro-fusion=all since no profile "
- "was specified\n";
+ BC->outs()
+ << "BOLT-INFO: enabling -align-macro-fusion=all since no profile "
+ "was specified\n";
opts::AlignMacroOpFusion = MFT_ALL;
}
if (!BC->HasRelocations &&
opts::ReorderFunctions != ReorderFunctions::RT_NONE) {
- errs() << "BOLT-ERROR: function reordering only works when "
- << "relocations are enabled\n";
+ BC->errs() << "BOLT-ERROR: function reordering only works when "
+ << "relocations are enabled\n";
exit(1);
}
@@ -1950,7 +1978,7 @@ void RewriteInstance::adjustCommandLineOptions() {
!opts::HotText.getNumOccurrences())) {
opts::HotText = true;
} else if (opts::HotText && !BC->HasRelocations) {
- errs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n";
+ BC->errs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n";
opts::HotText = false;
}
@@ -1961,12 +1989,13 @@ void RewriteInstance::adjustCommandLineOptions() {
}
if (opts::UseOldText && !BC->OldTextSectionAddress) {
- errs() << "BOLT-WARNING: cannot use old .text as the section was not found"
- "\n";
+ BC->errs()
+ << "BOLT-WARNING: cannot use old .text as the section was not found"
+ "\n";
opts::UseOldText = false;
}
if (opts::UseOldText && !BC->HasRelocations) {
- errs() << "BOLT-WARNING: cannot use old .text in non-relocation mode\n";
+ BC->errs() << "BOLT-WARNING: cannot use old .text in non-relocation mode\n";
opts::UseOldText = false;
}
@@ -1981,23 +2010,25 @@ void RewriteInstance::adjustCommandLineOptions() {
opts::Lite = true;
if (opts::Lite && opts::UseOldText) {
- errs() << "BOLT-WARNING: cannot combine -lite with -use-old-text. "
- "Disabling -use-old-text.\n";
+ BC->errs() << "BOLT-WARNING: cannot combine -lite with -use-old-text. "
+ "Disabling -use-old-text.\n";
opts::UseOldText = false;
}
if (opts::Lite && opts::StrictMode) {
- errs() << "BOLT-ERROR: -strict and -lite cannot be used at the same time\n";
+ BC->errs()
+ << "BOLT-ERROR: -strict and -lite cannot be used at the same time\n";
exit(1);
}
if (opts::Lite)
- outs() << "BOLT-INFO: enabling lite mode\n";
+ BC->outs() << "BOLT-INFO: enabling lite mode\n";
if (!opts::SaveProfile.empty() && BAT->enabledFor(InputFile)) {
- errs() << "BOLT-ERROR: unable to save profile in YAML format for input "
- "file processed by BOLT. Please remove -w option and use branch "
- "profile.\n";
+ BC->errs()
+ << "BOLT-ERROR: unable to save profile in YAML format for input "
+ "file processed by BOLT. Please remove -w option and use branch "
+ "profile.\n";
exit(1);
}
}
@@ -2252,8 +2283,8 @@ void RewriteInstance::processRelocations() {
}
if (NumFailedRelocations)
- errs() << "BOLT-WARNING: Failed to analyze " << NumFailedRelocations
- << " relocations\n";
+ BC->errs() << "BOLT-WARNING: Failed to analyze " << NumFailedRelocations
+ << " relocations\n";
}
void RewriteInstance::readDynamicRelocations(const SectionRef &Section,
@@ -2497,8 +2528,8 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
assert(ContainingBF && "cannot find function for address in code");
if (!IsAArch64 && !ContainingBF->containsAddress(Rel.getOffset())) {
if (opts::Verbosity >= 1)
- outs() << formatv("BOLT-INFO: {0} has relocations in padding area\n",
- *ContainingBF);
+ BC->outs() << formatv(
+ "BOLT-INFO: {0} has relocations in padding area\n", *ContainingBF);
ContainingBF->setSize(ContainingBF->getMaxSize());
ContainingBF->setSimple(false);
return;
@@ -2593,12 +2624,13 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
if (BF != ReferencedBF) {
// It's possible we are referencing a function without referencing any
// code, e.g. when taking a bitmask action on a function address.
- errs() << "BOLT-WARNING: non-standard function reference (e.g. bitmask)"
- << formatv(" detected against function {0} from ", *BF);
+ BC->errs()
+ << "BOLT-WARNING: non-standard function reference (e.g. bitmask)"
+ << formatv(" detected against function {0} from ", *BF);
if (IsFromCode)
- errs() << formatv("function {0}\n", *ContainingBF);
+ BC->errs() << formatv("function {0}\n", *ContainingBF);
else
- errs() << formatv("data section at {0:x}\n", Rel.getOffset());
+ BC->errs() << formatv("data section at {0:x}\n", Rel.getOffset());
LLVM_DEBUG(printRelocationInfo(Rel, SymbolName, SymbolAddress, Addend,
ExtractedValue));
ReferencedBF = BF;
@@ -2628,10 +2660,11 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
llvm::make_second_range(ContainingBF->Relocations), CheckReloc);
if (Found) {
- errs() << "BOLT-WARNING: detected possible compiler de-virtualization "
- "bug: -1 addend used with non-pc-relative relocation against "
- << formatv("function {0} in function {1}\n", *RogueBF,
- *ContainingBF);
+ BC->errs()
+ << "BOLT-WARNING: detected possible compiler de-virtualization "
+ "bug: -1 addend used with non-pc-relative relocation against "
+ << formatv("function {0} in function {1}\n", *RogueBF,
+ *ContainingBF);
return;
}
}
@@ -2681,9 +2714,10 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
}
if (opts::Verbosity > 1 &&
BinarySection(*BC, RelocatedSection).isWritable())
- errs() << "BOLT-WARNING: writable reference into the middle of the "
- << formatv("function {0} detected at address {1:x}\n",
- *ReferencedBF, Rel.getOffset());
+ BC->errs()
+ << "BOLT-WARNING: writable reference into the middle of the "
+ << formatv("function {0} detected at address {1:x}\n",
+ *ReferencedBF, Rel.getOffset());
}
SymbolAddress = Address;
Addend = 0;
@@ -2825,8 +2859,9 @@ void RewriteInstance::selectFunctionsToProcess() {
if ((!opts::ForceFunctionNames.empty() ||
!opts::ForceFunctionNamesNR.empty()) &&
!opts::SkipFunctionNames.empty()) {
- errs() << "BOLT-ERROR: cannot select functions to process and skip at the "
- "same time. Please use only one type of selection.\n";
+ BC->errs()
+ << "BOLT-ERROR: cannot select functions to process and skip at the "
+ "same time. Please use only one type of selection.\n";
exit(1);
}
@@ -2850,8 +2885,8 @@ void RewriteInstance::selectFunctionsToProcess() {
if (Index)
--Index;
LiteThresholdExecCount = TopFunctions[Index]->getKnownExecutionCount();
- outs() << "BOLT-INFO: limiting processing to functions with at least "
- << LiteThresholdExecCount << " invocations\n";
+ BC->outs() << "BOLT-INFO: limiting processing to functions with at least "
+ << LiteThresholdExecCount << " invocations\n";
}
LiteThresholdExecCount = std::max(
LiteThresholdExecCount, static_cast<uint64_t>(opts::LiteThresholdCount));
@@ -2859,8 +2894,10 @@ void RewriteInstance::selectFunctionsToProcess() {
StringSet<> ReorderFunctionsUserSet;
StringSet<> ReorderFunctionsLTOCommonSet;
if (opts::ReorderFunctions == ReorderFunctions::RT_USER) {
- for (const std::string &Function :
- ReorderFunctions::readFunctionOrderFile()) {
+ std::vector<std::string> FunctionNames;
+ BC->logBOLTErrorsAndQuitOnFatal(
+ ReorderFunctions::readFunctionOrderFile(FunctionNames));
+ for (const std::string &Function : FunctionNames) {
ReorderFunctionsUserSet.insert(Function);
if (std::optional<StringRef> LTOCommonName = getLTOCommonName(Function))
ReorderFunctionsLTOCommonSet.insert(*LTOCommonName);
@@ -2936,15 +2973,15 @@ void RewriteInstance::selectFunctionsToProcess() {
if (!shouldProcess(Function)) {
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: skipping processing " << Function
- << " per user request\n";
+ BC->outs() << "BOLT-INFO: skipping processing " << Function
+ << " per user request\n";
}
Function.setIgnored();
} else {
++NumFunctionsToProcess;
if (opts::MaxFunctions.getNumOccurrences() &&
NumFunctionsToProcess == opts::MaxFunctions)
- outs() << "BOLT-INFO: processing ending on " << Function << '\n';
+ BC->outs() << "BOLT-INFO: processing ending on " << Function << '\n';
}
}
@@ -2963,8 +3000,8 @@ void RewriteInstance::selectFunctionsToProcess() {
if (mustSkip(Function)) {
for (BinaryFunction *Parent : Function.ParentFragments) {
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: skipping processing " << *Parent
- << " together with fragment function\n";
+ BC->outs() << "BOLT-INFO: skipping processing " << *Parent
+ << " together with fragment function\n";
}
Parent->setIgnored();
--NumFunctionsToProcess;
@@ -2979,18 +3016,18 @@ void RewriteInstance::selectFunctionsToProcess() {
});
if (IgnoredParent) {
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: skipping processing " << Function
- << " together with parent function\n";
+ BC->outs() << "BOLT-INFO: skipping processing " << Function
+ << " together with parent function\n";
}
Function.setIgnored();
} else {
++NumFunctionsToProcess;
if (opts::Verbosity >= 1) {
- outs() << "BOLT-INFO: processing " << Function
- << " as a sibling of non-ignored function\n";
+ BC->outs() << "BOLT-INFO: processing " << Function
+ << " as a sibling of non-ignored function\n";
}
if (opts::MaxFunctions && NumFunctionsToProcess == opts::MaxFunctions)
- outs() << "BOLT-INFO: processing ending on " << Function << '\n';
+ BC->outs() << "BOLT-INFO: processing ending on " << Function << '\n';
}
}
}
@@ -3011,12 +3048,12 @@ void RewriteInstance::preprocessProfileData() {
NamedRegionTimer T("preprocessprofile", "pre-process profile data",
TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
- outs() << "BOLT-INFO: pre-processing profile using "
- << ProfileReader->getReaderName() << '\n';
+ BC->outs() << "BOLT-INFO: pre-processing profile using "
+ << ProfileReader->getReaderName() << '\n';
if (BAT->enabledFor(InputFile)) {
- outs() << "BOLT-INFO: profile collection done on a binary already "
- "processed by BOLT\n";
+ BC->outs() << "BOLT-INFO: profile collection done on a binary already "
+ "processed by BOLT\n";
ProfileReader->setBAT(&*BAT);
}
@@ -3024,10 +3061,11 @@ void RewriteInstance::preprocessProfileData() {
report_error("cannot pre-process profile", std::move(E));
if (!BC->hasSymbolsWithFileName() && ProfileReader->hasLocalsWithFileName()) {
- errs() << "BOLT-ERROR: input binary does not have local file symbols "
- "but profile data includes function names with embedded file "
- "names. It appears that the input binary was stripped while a "
- "profiled binary was not\n";
+ BC->errs()
+ << "BOLT-ERROR: input binary does not have local file symbols "
+ "but profile data includes function names with embedded file "
+ "names. It appears that the input binary was stripped while a "
+ "profiled binary was not\n";
exit(1);
}
}
@@ -3080,7 +3118,7 @@ void RewriteInstance::processProfileData() {
if (Function.empty())
continue;
- Function.print(outs(), "after attaching profile");
+ Function.print(BC->outs(), "after attaching profile");
}
}
@@ -3109,8 +3147,8 @@ void RewriteInstance::disassembleFunctions() {
ErrorOr<ArrayRef<uint8_t>> FunctionData = Function.getData();
if (!FunctionData) {
- errs() << "BOLT-ERROR: corresponding section is non-executable or "
- << "empty for function " << Function << '\n';
+ BC->errs() << "BOLT-ERROR: corresponding section is non-executable or "
+ << "empty for function " << Function << '\n';
exit(1);
}
@@ -3133,21 +3171,32 @@ void RewriteInstance::disassembleFunctions() {
continue;
}
- if (!Function.disassemble()) {
- if (opts::processAllFunctions())
- BC->exitWithBugReport("function cannot be properly disassembled. "
- "Unable to continue in relocation mode.",
- Function);
+ bool DisasmFailed{false};
+ handleAllErrors(Function.disassemble(), [&](const BOLTError &E) {
+ DisasmFailed = true;
+ if (E.isFatal()) {
+ E.log(BC->errs());
+ exit(1);
+ }
+ if (opts::processAllFunctions()) {
+ BC->errs() << BC->generateBugReportMessage(
+ "function cannot be properly disassembled. "
+ "Unable to continue in relocation mode.",
+ Function);
+ exit(1);
+ }
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: could not disassemble function " << Function
- << ". Will ignore.\n";
+ BC->outs() << "BOLT-INFO: could not disassemble function " << Function
+ << ". Will ignore.\n";
// Forcefully ignore the function.
Function.setIgnored();
+ });
+
+ if (DisasmFailed)
continue;
- }
if (opts::PrintAll || opts::PrintDisasm)
- Function.print(outs(), "after disassembly");
+ Function.print(BC->outs(), "after disassembly");
}
BC->processInterproceduralReferences();
@@ -3182,10 +3231,12 @@ void RewriteInstance::disassembleFunctions() {
// Fill in CFI information for this function
if (!Function.trapsOnEntry() && !CFIRdWrt->fillCFIInfoFor(Function)) {
if (BC->HasRelocations) {
- BC->exitWithBugReport("unable to fill CFI.", Function);
+ BC->errs() << BC->generateBugReportMessage("unable to fill CFI.",
+ Function);
+ exit(1);
} else {
- errs() << "BOLT-WARNING: unable to fill CFI for function " << Function
- << ". Skipping.\n";
+ BC->errs() << "BOLT-WARNING: unable to fill CFI for function "
+ << Function << ". Skipping.\n";
Function.setSimple(false);
continue;
}
@@ -3199,7 +3250,8 @@ void RewriteInstance::disassembleFunctions() {
check_error(LSDASection.getError(), "failed to get LSDA section");
ArrayRef<uint8_t> LSDAData = ArrayRef<uint8_t>(
LSDASection->getData(), LSDASection->getContents().size());
- Function.parseLSDA(LSDAData, LSDASection->getAddress());
+ BC->logBOLTErrorsAndQuitOnFatal(
+ Function.parseLSDA(LSDAData, LSDASection->getAddress()));
}
}
}
@@ -3214,12 +3266,21 @@ void RewriteInstance::buildFunctionsCFG() {
ParallelUtilities::WorkFuncWithAllocTy WorkFun =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) {
- if (!BF.buildCFG(AllocId))
+ bool HadErrors{false};
+ handleAllErrors(BF.buildCFG(AllocId), [&](const BOLTError &E) {
+ if (!E.getMessage().empty())
+ E.log(BC->errs());
+ if (E.isFatal())
+ exit(1);
+ HadErrors = true;
+ });
+
+ if (HadErrors)
return;
if (opts::PrintAll) {
auto L = BC->scopeLock();
- BF.print(outs(), "while building cfg");
+ BF.print(BC->outs(), "while building cfg");
}
};
@@ -3258,14 +3319,14 @@ void RewriteInstance::postProcessFunctions() {
Function.postProcessCFG();
if (opts::PrintAll || opts::PrintCFG)
- Function.print(outs(), "after building cfg");
+ Function.print(BC->outs(), "after building cfg");
if (opts::DumpDotAll)
Function.dumpGraphForPass("00_build-cfg");
if (opts::PrintLoopInfo) {
Function.calculateLoopInfo();
- Function.printLoopInfo(outs());
+ Function.printLoopInfo(BC->outs());
}
BC->TotalScore += Function.getFunctionScore();
@@ -3273,15 +3334,15 @@ void RewriteInstance::postProcessFunctions() {
}
if (opts::PrintGlobals) {
- outs() << "BOLT-INFO: Global symbols:\n";
- BC->printGlobalSymbols(outs());
+ BC->outs() << "BOLT-INFO: Global symbols:\n";
+ BC->printGlobalSymbols(BC->outs());
}
}
void RewriteInstance::runOptimizationPasses() {
NamedRegionTimer T("runOptimizationPasses", "run optimization passes",
TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
- BinaryFunctionPassManager::runAllPasses(*BC);
+ BC->logBOLTErrorsAndQuitOnFatal(BinaryFunctionPassManager::runAllPasses(*BC));
}
void RewriteInstance::preregisterSections() {
@@ -3331,7 +3392,7 @@ void RewriteInstance::emitAndLink() {
Streamer->finish();
if (Streamer->getContext().hadError()) {
- errs() << "BOLT-ERROR: Emission failed.\n";
+ BC->errs() << "BOLT-ERROR: Emission failed.\n";
exit(1);
}
@@ -3342,9 +3403,10 @@ void RewriteInstance::emitAndLink() {
raw_fd_ostream FOS(OutObjectPath, EC);
check_error(EC, "cannot create output object file");
FOS << ObjectBuffer;
- outs() << "BOLT-INFO: intermediary output object file saved for debugging "
- "purposes: "
- << OutObjectPath << "\n";
+ BC->outs()
+ << "BOLT-INFO: intermediary output object file saved for debugging "
+ "purposes: "
+ << OutObjectPath << "\n";
}
ErrorOr<BinarySection &> TextSection =
@@ -3409,8 +3471,8 @@ void RewriteInstance::emitAndLink() {
}
if (opts::PrintCacheMetrics) {
- outs() << "BOLT-INFO: cache metrics after emitting functions:\n";
- CacheMetrics::printAll(BC->getSortedFunctions());
+ BC->outs() << "BOLT-INFO: cache metrics after emitting functions:\n";
+ CacheMetrics::printAll(BC->outs(), BC->getSortedFunctions());
}
}
@@ -3576,14 +3638,15 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
allocateAt(BC->OldTextSectionAddress) - BC->OldTextSectionAddress;
if (CodeSize <= BC->OldTextSectionSize) {
- outs() << "BOLT-INFO: using original .text for new code with 0x"
- << Twine::utohexstr(opts::AlignText) << " alignment\n";
+ BC->outs() << "BOLT-INFO: using original .text for new code with 0x"
+ << Twine::utohexstr(opts::AlignText) << " alignment\n";
AllocationDone = true;
} else {
- errs() << "BOLT-WARNING: original .text too small to fit the new code"
- << " using 0x" << Twine::utohexstr(opts::AlignText)
- << " alignment. " << CodeSize << " bytes needed, have "
- << BC->OldTextSectionSize << " bytes available.\n";
+ BC->errs()
+ << "BOLT-WARNING: original .text too small to fit the new code"
+ << " using 0x" << Twine::utohexstr(opts::AlignText)
+ << " alignment. " << CodeSize << " bytes needed, have "
+ << BC->OldTextSectionSize << " bytes available.\n";
opts::UseOldText = false;
}
}
@@ -3604,9 +3667,9 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
// Check if we need to insert a padding section for hot text.
if (PaddingSize && !opts::UseOldText)
- outs() << "BOLT-INFO: padding code to 0x"
- << Twine::utohexstr(NextAvailableAddress)
- << " to accommodate hot text\n";
+ BC->outs() << "BOLT-INFO: padding code to 0x"
+ << Twine::utohexstr(NextAvailableAddress)
+ << " to accommodate hot text\n";
return;
}
@@ -3631,6 +3694,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
Function.setImageAddress(FuncSection->getAllocAddress());
Function.setImageSize(FuncSection->getOutputSize());
if (Function.getImageSize() > Function.getMaxSize()) {
+ assert(!BC->isX86() && "Unexpected large function.");
TooLarge = true;
FailedAddresses.emplace_back(Function.getAddress());
}
@@ -3814,7 +3878,7 @@ void RewriteInstance::patchELFPHDRTable() {
assert(!PHDRTableAddress && "unexpected address for program header table");
PHDRTableOffset = Obj.getHeader().e_phoff;
if (NewWritableSegmentSize) {
- errs() << "Unable to add writable segment with UseGnuStack option\n";
+ BC->errs() << "Unable to add writable segment with UseGnuStack option\n";
exit(1);
}
}
@@ -4114,7 +4178,8 @@ void RewriteInstance::encodeBATSection() {
copyByteArray(BoltInfo), BoltInfo.size(),
/*Alignment=*/1,
/*IsReadOnly=*/true, ELF::SHT_NOTE);
- outs() << "BOLT-INFO: BAT section size (bytes): " << BoltInfo.size() << '\n';
+ BC->outs() << "BOLT-INFO: BAT section size (bytes): " << BoltInfo.size()
+ << '\n';
}
template <typename ELFShdrTy>
@@ -4175,14 +4240,14 @@ RewriteInstance::getOutputSections(ELFObjectFile<ELFT> *File,
if (Section.hasSectionRef() || Section.isAnonymous()) {
if (opts::Verbosity)
- outs() << "BOLT-INFO: not writing section header for section "
- << Section.getOutputName() << '\n';
+ BC->outs() << "BOLT-INFO: not writing section header for section "
+ << Section.getOutputName() << '\n';
continue;
}
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: writing section header for "
- << Section.getOutputName() << '\n';
+ BC->outs() << "BOLT-INFO: writing section header for "
+ << Section.getOutputName() << '\n';
ELFShdrTy NewSection;
NewSection.sh_type = ELF::SHT_PROGBITS;
NewSection.sh_addr = Section.getOutputAddress();
@@ -4216,8 +4281,8 @@ RewriteInstance::getOutputSections(ELFObjectFile<ELFT> *File,
if (PrevSection &&
PrevSection->sh_offset + PrevSection->sh_size > Section.sh_offset) {
if (opts::Verbosity > 1) {
- outs() << "BOLT-INFO: adjusting size for section "
- << PrevBinSec->getOutputName() << '\n';
+ BC->outs() << "BOLT-INFO: adjusting size for section "
+ << PrevBinSec->getOutputName() << '\n';
}
PrevSection->sh_size = Section.sh_offset - PrevSection->sh_offset;
}
@@ -4264,8 +4329,8 @@ RewriteInstance::getOutputSections(ELFObjectFile<ELFT> *File,
continue;
if (opts::Verbosity >= 1)
- outs() << "BOLT-INFO: writing section header for "
- << Section.getOutputName() << '\n';
+ BC->outs() << "BOLT-INFO: writing section header for "
+ << Section.getOutputName() << '\n';
ELFShdrTy NewSection;
NewSection.sh_type = Section.getELFType();
@@ -4663,8 +4728,8 @@ void RewriteInstance::updateELFSymbolTable(
std::optional<uint64_t> Value = std::nullopt) {
NewSymbol.st_value = Value ? *Value : getNewValueForSymbol(Name);
NewSymbol.st_shndx = ELF::SHN_ABS;
- outs() << "BOLT-INFO: setting " << Name << " to 0x"
- << Twine::utohexstr(NewSymbol.st_value) << '\n';
+ BC->outs() << "BOLT-INFO: setting " << Name << " to 0x"
+ << Twine::utohexstr(NewSymbol.st_value) << '\n';
};
if (opts::HotText &&
@@ -4739,8 +4804,8 @@ void RewriteInstance::updateELFSymbolTable(
Symbol.st_other = 0;
Symbol.setBindingAndType(ELF::STB_WEAK, ELF::STT_NOTYPE);
- outs() << "BOLT-INFO: setting " << Name << " to 0x"
- << Twine::utohexstr(Symbol.st_value) << '\n';
+ BC->outs() << "BOLT-INFO: setting " << Name << " to 0x"
+ << Twine::utohexstr(Symbol.st_value) << '\n';
Symbols.emplace_back(Symbol);
};
@@ -4836,7 +4901,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile<ELFT> *File) {
}
}
if (!SymTabSection) {
- errs() << "BOLT-WARNING: no symbol table found\n";
+ BC->errs() << "BOLT-WARNING: no symbol table found\n";
return;
}
@@ -4939,7 +5004,7 @@ void RewriteInstance::patchELFAllocatableRelrSection(
auto WriteRelr = [&](uint64_t Value) {
if (RelrDynOffset + DynamicRelrEntrySize > RelrDynEndOffset) {
- errs() << "BOLT-ERROR: Offset overflow for relr.dyn section\n";
+ BC->errs() << "BOLT-ERROR: Offset overflow for relr.dyn section\n";
exit(1);
}
@@ -5051,12 +5116,12 @@ RewriteInstance::patchELFAllocatableRelaSections(ELFObjectFile<ELFT> *File) {
const uint64_t &EndOffset =
IsJmpRel ? RelPltEndOffset : RelDynEndOffset;
if (!Offset || !EndOffset) {
- errs() << "BOLT-ERROR: Invalid offsets for dynamic relocation\n";
+ BC->errs() << "BOLT-ERROR: Invalid offsets for dynamic relocation\n";
exit(1);
}
if (Offset + sizeof(NewRelA) > EndOffset) {
- errs() << "BOLT-ERROR: Offset overflow for dynamic relocation\n";
+ BC->errs() << "BOLT-ERROR: Offset overflow for dynamic relocation\n";
exit(1);
}
@@ -5105,7 +5170,7 @@ void RewriteInstance::patchELFGOT(ELFObjectFile<ELFT> *File) {
}
if (!GOTSection.getObject()) {
if (!BC->IsStaticExecutable)
- errs() << "BOLT-INFO: no .got section found\n";
+ BC->errs() << "BOLT-INFO: no .got section found\n";
return;
}
@@ -5209,9 +5274,10 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile<ELFT> *File) {
}
if (BC->RequiresZNow && !ZNowSet) {
- errs() << "BOLT-ERROR: output binary requires immediate relocation "
- "processing which depends on DT_FLAGS or DT_FLAGS_1 presence in "
- ".dynamic. Please re-link the binary with -znow.\n";
+ BC->errs()
+ << "BOLT-ERROR: output binary requires immediate relocation "
+ "processing which depends on DT_FLAGS or DT_FLAGS_1 presence in "
+ ".dynamic. Please re-link the binary with -znow.\n";
exit(1);
}
}
@@ -5233,7 +5299,7 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile<ELFT> *File) {
}
if (!DynamicPhdr) {
- outs() << "BOLT-INFO: static input executable detected\n";
+ BC->outs() << "BOLT-INFO: static input executable detected\n";
// TODO: static PIE executable might have dynamic header
BC->IsStaticExecutable = true;
return Error::success();
@@ -5307,12 +5373,12 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile<ELFT> *File) {
DynamicRelrAddress.reset();
DynamicRelrSize = 0;
} else if (!DynamicRelrEntrySize) {
- errs() << "BOLT-ERROR: expected DT_RELRENT to be presented "
- << "in DYNAMIC section\n";
+ BC->errs() << "BOLT-ERROR: expected DT_RELRENT to be presented "
+ << "in DYNAMIC section\n";
exit(1);
} else if (DynamicRelrSize % DynamicRelrEntrySize) {
- errs() << "BOLT-ERROR: expected RELR table size to be divisible "
- << "by RELR entry size\n";
+ BC->errs() << "BOLT-ERROR: expected RELR table size to be divisible "
+ << "by RELR entry size\n";
exit(1);
}
@@ -5367,12 +5433,13 @@ void RewriteInstance::rewriteFile() {
continue;
if (Function->getImageSize() > Function->getMaxSize()) {
+ assert(!BC->isX86() && "Unexpected large function.");
if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: new function size (0x"
- << Twine::utohexstr(Function->getImageSize())
- << ") is larger than maximum allowed size (0x"
- << Twine::utohexstr(Function->getMaxSize()) << ") for function "
- << *Function << '\n';
+ BC->errs() << "BOLT-WARNING: new function size (0x"
+ << Twine::utohexstr(Function->getImageSize())
+ << ") is larger than maximum allowed size (0x"
+ << Twine::utohexstr(Function->getMaxSize())
+ << ") for function " << *Function << '\n';
// Remove jump table sections that this function owns in non-reloc mode
// because we don't want to write them anymore.
@@ -5408,7 +5475,7 @@ void RewriteInstance::rewriteFile() {
// Overwrite function in the output file.
if (opts::Verbosity >= 2)
- outs() << "BOLT: rewriting function \"" << *Function << "\"\n";
+ BC->outs() << "BOLT: rewriting function \"" << *Function << "\"\n";
OS.pwrite(reinterpret_cast<char *>(Function->getImageAddress()),
Function->getImageSize(), Function->getFileOffset());
@@ -5428,8 +5495,8 @@ void RewriteInstance::rewriteFile() {
// Write cold part
if (opts::Verbosity >= 2) {
- outs() << formatv("BOLT: rewriting function \"{0}\" (split parts)\n",
- *Function);
+ BC->outs() << formatv("BOLT: rewriting function \"{0}\" (split parts)\n",
+ *Function);
}
for (const FunctionFragment &FF :
@@ -5441,14 +5508,15 @@ void RewriteInstance::rewriteFile() {
// Print function statistics for non-relocation mode.
if (!BC->HasRelocations) {
- outs() << "BOLT: " << CountOverwrittenFunctions << " out of "
- << BC->getBinaryFunctions().size()
- << " functions were overwritten.\n";
+ BC->outs() << "BOLT: " << CountOverwrittenFunctions << " out of "
+ << BC->getBinaryFunctions().size()
+ << " functions were overwritten.\n";
if (BC->TotalScore != 0) {
double Coverage = OverwrittenScore / (double)BC->TotalScore * 100.0;
- outs() << format("BOLT-INFO: rewritten functions cover %.2lf", Coverage)
- << "% of the execution count of simple functions of "
- "this binary\n";
+ BC->outs() << format("BOLT-INFO: rewritten functions cover %.2lf",
+ Coverage)
+ << "% of the execution count of simple functions of "
+ "this binary\n";
}
}
@@ -5476,10 +5544,11 @@ void RewriteInstance::rewriteFile() {
continue;
if (opts::Verbosity >= 1)
- outs() << "BOLT: writing new section " << Section.getName()
- << "\n data at 0x" << Twine::utohexstr(Section.getAllocAddress())
- << "\n of size " << Section.getOutputSize() << "\n at offset "
- << Section.getOutputFileOffset() << '\n';
+ BC->outs() << "BOLT: writing new section " << Section.getName()
+ << "\n data at 0x"
+ << Twine::utohexstr(Section.getAllocAddress()) << "\n of size "
+ << Section.getOutputSize() << "\n at offset "
+ << Section.getOutputFileOffset() << '\n';
OS.pwrite(reinterpret_cast<const char *>(Section.getOutputData()),
Section.getOutputSize(), Section.getOutputFileOffset());
}
@@ -5529,8 +5598,8 @@ void RewriteInstance::rewriteFile() {
patchELFSectionHeaderTable();
if (opts::PrintSections) {
- outs() << "BOLT-INFO: Sections after processing:\n";
- BC->printSections(outs());
+ BC->outs() << "BOLT-INFO: Sections after processing:\n";
+ BC->printSections(BC->outs());
}
Out->keep();
diff --git a/bolt/lib/Target/X86/X86MCSymbolizer.cpp b/bolt/lib/Target/X86/X86MCSymbolizer.cpp
index ca7fe137152f..0e0ad9270550 100644
--- a/bolt/lib/Target/X86/X86MCSymbolizer.cpp
+++ b/bolt/lib/Target/X86/X86MCSymbolizer.cpp
@@ -134,7 +134,13 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
// a PC-relative 8-byte fixup, which is what we need to cover this. The
// only way to do this is to use the symbol name _GLOBAL_OFFSET_TABLE_.
if (Relocation::isX86GOTPC64(Relocation->Type)) {
- auto [Sym, Addend] = handleGOTPC64(*Relocation, InstAddress);
+ auto PairOrErr = handleGOTPC64(*Relocation, InstAddress);
+ if (auto E = PairOrErr.takeError()) {
+ Function.setSimple(false);
+ BC.logBOLTErrorsAndQuitOnFatal(std::move(E));
+ return false;
+ }
+ auto [Sym, Addend] = *PairOrErr;
addOperand(Sym, Addend);
return true;
}
@@ -158,14 +164,16 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
return true;
}
-std::pair<MCSymbol *, uint64_t>
+Expected<std::pair<MCSymbol *, uint64_t>>
X86MCSymbolizer::handleGOTPC64(const Relocation &R, uint64_t InstrAddr) {
BinaryContext &BC = Function.getBinaryContext();
const BinaryData *GOTSymBD = BC.getGOTSymbol();
if (!GOTSymBD || !GOTSymBD->getAddress()) {
- errs() << "BOLT-ERROR: R_X86_GOTPC64 relocation is present but we did "
- "not detect a valid _GLOBAL_OFFSET_TABLE_ in symbol table\n";
- exit(1);
+ // This error is pretty serious but we can't kill the disassembler
+ // because of it, so don't make it fatal. Log it and warn the user.
+ return createNonFatalBOLTError(
+ "R_X86_GOTPC64 relocation is present but we did not detect "
+ "a valid _GLOBAL_OFFSET_TABLE_ in symbol table\n");
}
// R_X86_GOTPC64 are not relative to the Reloc nor end of instruction,
// but the start of the MOVABSQ instruction. So the Target Address is
diff --git a/bolt/lib/Target/X86/X86MCSymbolizer.h b/bolt/lib/Target/X86/X86MCSymbolizer.h
index 9ed18b69c74c..189941e949e3 100644
--- a/bolt/lib/Target/X86/X86MCSymbolizer.h
+++ b/bolt/lib/Target/X86/X86MCSymbolizer.h
@@ -20,8 +20,8 @@ protected:
BinaryFunction &Function;
bool CreateNewSymbols{true};
- std::pair<MCSymbol *, uint64_t> handleGOTPC64(const Relocation &R,
- uint64_t InstrAddr);
+ Expected<std::pair<MCSymbol *, uint64_t>> handleGOTPC64(const Relocation &R,
+ uint64_t InstrAddr);
public:
X86MCSymbolizer(BinaryFunction &Function, bool CreateNewSymbols = true)
diff --git a/bolt/test/X86/fatal-error.s b/bolt/test/X86/fatal-error.s
new file mode 100644
index 000000000000..312d1d47429f
--- /dev/null
+++ b/bolt/test/X86/fatal-error.s
@@ -0,0 +1,39 @@
+# Tests whether llvm-bolt will correctly exit with an error code and print a
+# fatal error message in case one occurs. Here we test opening a function
+# reordering file that does not exist.
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: not llvm-bolt %t.exe -o %t.null \
+# RUN: --reorder-blocks=normal --reorder-functions=user \
+# RUN: --function-order=/DOES/NOT/EXIST 2>&1 \
+# RUN: | FileCheck --check-prefix=CHECK %s
+
+# CHECK: FATAL BOLT-ERROR: Ordered functions file "/DOES/NOT/EXIST" can't be opened
+
+# Sample function reordering input, based off function-order-lite.s
+ .globl main
+ .type main, %function
+main:
+ .cfi_startproc
+.LBB06:
+ callq func_a
+ retq
+ .cfi_endproc
+.size main, .-main
+
+ .globl func_a
+ .type func_a, %function
+func_a:
+ .cfi_startproc
+ retq
+ .cfi_endproc
+.size func_a, .-func_a
+
+ .globl func_b
+ .type func_b, %function
+func_b:
+ .cfi_startproc
+ retq
+ .cfi_endproc
+.size func_b, .-func_b
diff --git a/bolt/test/X86/log.test b/bolt/test/X86/log.test
new file mode 100644
index 000000000000..0cbb5b625d00
--- /dev/null
+++ b/bolt/test/X86/log.test
@@ -0,0 +1,19 @@
+# Tests whether llvm-bolt is able to redirect logs when processing a simple
+# input. If this test fails on your changes, please use BinaryContext::outs()
+# to print BOLT logging instead of llvm::outs().
+
+RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
+RUN: llvm-bolt %t.exe -o %t.null --data %p/Inputs/blarge.fdata -v=2 \
+RUN: --reorder-blocks=normal --print-finalized --log-file=%t.log 2>&1 \
+RUN: | FileCheck --check-prefix=CHECK --allow-empty %s
+RUN: cat %t.log | FileCheck %s --check-prefix=CHECK-LOG
+
+CHECK-NOT: BOLT-INFO
+CHECK-NOT: BOLT-WARNING
+CHECK-NOT: BOLT-ERROR
+
+# Check some usual BOLT output lines are being redirected to the log file
+CHECK-LOG: BOLT-INFO: Target architecture
+CHECK-LOG: BOLT-INFO: BOLT version
+CHECK-LOG: BOLT-INFO: basic block reordering modified layout
+CHECK-LOG: Binary Function "usqrt"
diff --git a/bolt/tools/bat-dump/bat-dump.cpp b/bolt/tools/bat-dump/bat-dump.cpp
index 71efe008d409..2e9b26cc137a 100644
--- a/bolt/tools/bat-dump/bat-dump.cpp
+++ b/bolt/tools/bat-dump/bat-dump.cpp
@@ -109,7 +109,7 @@ void dumpBATFor(llvm::object::ELFObjectFileBase *InputFile) {
exit(1);
}
- if (std::error_code EC = BAT.parse(SectionContents)) {
+ if (std::error_code EC = BAT.parse(outs(), SectionContents)) {
errs() << "BOLT-ERROR: failed to parse BOLT address translation "
"table. Malformed BAT section\n";
exit(1);
diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp
index cc215a5256d2..9b03524e9f18 100644
--- a/bolt/tools/driver/llvm-bolt.cpp
+++ b/bolt/tools/driver/llvm-bolt.cpp
@@ -63,6 +63,11 @@ BoltProfile("b",
cl::aliasopt(InputDataFilename),
cl::cat(BoltCategory));
+cl::opt<std::string>
+ LogFile("log-file",
+ cl::desc("redirect journaling to a file instead of stdout/stderr"),
+ cl::Hidden, cl::cat(BoltCategory));
+
static cl::opt<std::string>
InputDataFilename2("data2",
cl::desc("<data file>"),
@@ -207,6 +212,24 @@ int main(int argc, char **argv) {
if (!sys::fs::exists(opts::InputFilename))
report_error(opts::InputFilename, errc::no_such_file_or_directory);
+ // Initialize journaling streams
+ raw_ostream *BOLTJournalOut = &outs();
+ raw_ostream *BOLTJournalErr = &errs();
+ // RAII obj to keep log file open throughout execution
+ std::unique_ptr<raw_fd_ostream> LogFileStream;
+ if (!opts::LogFile.empty()) {
+ std::error_code LogEC;
+ LogFileStream = std::make_unique<raw_fd_ostream>(
+ opts::LogFile, LogEC, sys::fs::OpenFlags::OF_None);
+ if (LogEC) {
+ errs() << "BOLT-ERROR: cannot open requested log file for writing: "
+ << LogEC.message() << "\n";
+ exit(1);
+ }
+ BOLTJournalOut = LogFileStream.get();
+ BOLTJournalErr = LogFileStream.get();
+ }
+
// Attempt to open the binary.
if (!opts::DiffOnly) {
Expected<OwningBinary<Binary>> BinaryOrErr =
@@ -216,7 +239,8 @@ int main(int argc, char **argv) {
Binary &Binary = *BinaryOrErr.get().getBinary();
if (auto *e = dyn_cast<ELFObjectFileBase>(&Binary)) {
- auto RIOrErr = RewriteInstance::create(e, argc, argv, ToolPath);
+ auto RIOrErr = RewriteInstance::create(e, argc, argv, ToolPath,
+ *BOLTJournalOut, *BOLTJournalErr);
if (Error E = RIOrErr.takeError())
report_error(opts::InputFilename, std::move(E));
RewriteInstance &RI = *RIOrErr.get();
diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp
index 7ac1c1435759..1fbb07bca966 100644
--- a/bolt/unittests/Core/BinaryContext.cpp
+++ b/bolt/unittests/Core/BinaryContext.cpp
@@ -40,7 +40,8 @@ protected:
void initializeBOLT() {
BC = cantFail(BinaryContext::createBinaryContext(
- ObjFile.get(), true, DWARFContext::create(*ObjFile.get())));
+ ObjFile.get(), true, DWARFContext::create(*ObjFile.get()),
+ {llvm::outs(), llvm::errs()}));
ASSERT_FALSE(!BC);
}
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index b851c756e796..63448039c53e 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -50,7 +50,8 @@ protected:
void initializeBolt() {
BC = cantFail(BinaryContext::createBinaryContext(
- ObjFile.get(), true, DWARFContext::create(*ObjFile.get())));
+ ObjFile.get(), true, DWARFContext::create(*ObjFile.get()),
+ {llvm::outs(), llvm::errs()}));
ASSERT_FALSE(!BC);
BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(
createMCPlusBuilder(GetParam(), BC->MIA.get(), BC->MII.get(),
diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp
index 1d10348430c2..ac16803b4678 100644
--- a/clang-tools-extra/clang-move/Move.cpp
+++ b/clang-tools-extra/clang-move/Move.cpp
@@ -133,7 +133,8 @@ public:
CharSourceRange FilenameRange,
OptionalFileEntryRef /*File*/, StringRef SearchPath,
StringRef /*RelativePath*/,
- const Module * /*Imported*/,
+ const Module * /*SuggestedModule*/,
+ bool /*ModuleImported*/,
SrcMgr::CharacteristicKind /*FileType*/) override {
if (auto FileEntry = SM.getFileEntryRefForID(SM.getFileID(HashLoc)))
MoveTool->addIncludes(FileName, IsAngled, SearchPath,
diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp
index 5ecd4fb19131..5e2cc207560d 100644
--- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp
+++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp
@@ -166,12 +166,12 @@ void ExpandModularHeadersPPCallbacks::InclusionDirective(
SourceLocation DirectiveLoc, const Token &IncludeToken,
StringRef IncludedFilename, bool IsAngled, CharSourceRange FilenameRange,
OptionalFileEntryRef IncludedFile, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule, bool ModuleImported,
SrcMgr::CharacteristicKind FileType) {
- if (Imported) {
+ if (ModuleImported) {
serialization::ModuleFile *MF =
Compiler.getASTReader()->getModuleManager().lookup(
- *Imported->getASTFile());
+ *SuggestedModule->getASTFile());
handleModuleFile(MF);
}
parseToLocation(DirectiveLoc);
diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
index 3f6abc315e5b..0742c21bc437 100644
--- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
+++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
@@ -69,7 +69,7 @@ private:
bool IsAngled, CharSourceRange FilenameRange,
OptionalFileEntryRef IncludedFile,
StringRef SearchPath, StringRef RelativePath,
- const Module *Imported,
+ const Module *SuggestedModule, bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void EndOfMainFile() override;
diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp
index 084e44a714d1..fb1e0e82a314 100644
--- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp
@@ -29,7 +29,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FileNameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void EndOfMainFile() override;
@@ -61,7 +62,7 @@ void KernelNameRestrictionCheck::registerPPCallbacks(const SourceManager &SM,
void KernelNameRestrictionPPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &, StringRef FileName, bool,
CharSourceRange, OptionalFileEntryRef, StringRef, StringRef, const Module *,
- SrcMgr::CharacteristicKind) {
+ bool, SrcMgr::CharacteristicKind) {
IncludeDirective ID = {HashLoc, FileName};
IncludeDirectives.push_back(std::move(ID));
}
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp
index 61d89cf30813..09ba79f05575 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp
@@ -26,7 +26,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
private:
@@ -51,8 +52,8 @@ void SuspiciousIncludeCheck::registerPPCallbacks(
void SuspiciousIncludePPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
if (IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import)
return;
diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp
index 8ba8b893e03a..a73d46f01d9b 100644
--- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp
@@ -82,10 +82,14 @@ void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) {
// We are interested in only those cases when the loop bound is a variable
// value (not const, enum, etc.).
StatementMatcher LoopBoundMatcher =
- expr(ignoringParenImpCasts(allOf(hasType(isInteger()),
- unless(integerLiteral()),
- unless(hasType(isConstQualified())),
- unless(hasType(enumType())))))
+ expr(ignoringParenImpCasts(allOf(
+ hasType(isInteger()), unless(integerLiteral()),
+ unless(allOf(
+ hasType(isConstQualified()),
+ declRefExpr(to(varDecl(anyOf(
+ hasInitializer(ignoringParenImpCasts(integerLiteral())),
+ isConstexpr(), isConstinit())))))),
+ unless(hasType(enumType())))))
.bind(LoopUpperBoundName);
// We use the loop increment expression only to make sure we found the right
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
index 1b763d291082..37baae7a6f0c 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
@@ -60,6 +60,7 @@ void UnusedLocalNonTrivialVariableCheck::registerMatchers(MatchFinder *Finder) {
varDecl(isLocalVarDecl(), unless(isReferenced()),
unless(isExceptionVariable()), hasLocalStorage(), isDefinition(),
unless(hasType(isReferenceType())), unless(hasType(isTrivial())),
+ unless(hasAttr(attr::Kind::Unused)),
hasType(hasUnqualifiedDesugaredType(
anyOf(recordType(hasDeclaration(namedDecl(
matchesAnyListedName(IncludeTypes),
diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp
index bdd72f85e2a2..4246c8c574c5 100644
--- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp
+++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp
@@ -27,7 +27,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void EndOfMainFile() override;
@@ -81,8 +82,8 @@ static int getPriority(StringRef Filename, bool IsAngled, bool IsMainModule) {
void IncludeOrderPPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
// We recognize the first include as a special main module header and want
// to leave it in the top position.
IncludeDirective ID = {HashLoc, FilenameRange, std::string(FileName),
diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp
index 3451d3474fd9..b656917071a6 100644
--- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp
@@ -33,7 +33,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
private:
@@ -45,14 +46,14 @@ private:
void RestrictedIncludesPPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
// Compiler provided headers are allowed (e.g stddef.h).
if (SrcMgr::isSystem(FileType) && SearchPath == CompilerIncudeDir)
return;
portability::RestrictedIncludesPPCallbacks::InclusionDirective(
HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath,
- RelativePath, Imported, FileType);
+ RelativePath, SuggestedModule, ModuleImported, FileType);
}
void RestrictSystemLibcHeadersCheck::registerPPCallbacks(
diff --git a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp
index bebd6e390ed5..fadfdc869d37 100644
--- a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.cpp
@@ -83,7 +83,7 @@ public:
void InclusionDirective(SourceLocation, const Token &, StringRef FilePath,
bool, CharSourceRange Range,
OptionalFileEntryRef File, StringRef, StringRef,
- const Module *,
+ const Module *, bool,
SrcMgr::CharacteristicKind FileType) override {
if (FileType != clang::SrcMgr::C_User)
return;
diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp
index 030a781e2099..6a467910521f 100644
--- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp
@@ -32,7 +32,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
private:
@@ -157,7 +158,7 @@ IncludeModernizePPCallbacks::IncludeModernizePPCallbacks(
{"wctype.h", "cwctype"}})) {
CStyledHeaderToCxx.insert(KeyValue);
}
- // Add C++ 11 headers.
+ // Add C++11 headers.
if (LangOpts.CPlusPlus11) {
for (const auto &KeyValue :
std::vector<std::pair<llvm::StringRef, std::string>>(
@@ -178,8 +179,8 @@ IncludeModernizePPCallbacks::IncludeModernizePPCallbacks(
void IncludeModernizePPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
// If we don't want to warn for non-main file reports and this is one, skip
// it.
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
index b197c22dca41..0b47ed316ca2 100644
--- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
@@ -117,7 +117,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
clearCurrentEnum(HashLoc);
}
diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp
index 9ee0b4e6d3cc..db5693e3b7cb 100644
--- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp
@@ -21,8 +21,8 @@ namespace clang::tidy::portability {
void RestrictedIncludesPPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
if (!Check.contains(FileName) && SrcMgr::isSystem(FileType)) {
SmallString<256> FullPath;
llvm::sys::path::append(FullPath, SearchPath);
diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h
index ad18e6f411db..60fae5e73a60 100644
--- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h
+++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h
@@ -50,7 +50,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void EndOfMainFile() override;
diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
index d1f41e0ec79e..67147164946a 100644
--- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
@@ -47,7 +47,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void MacroDefined(const Token &MacroNameTok,
@@ -76,8 +77,8 @@ void DuplicateIncludeCallbacks::FileChanged(SourceLocation Loc,
void DuplicateIncludeCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
if (llvm::is_contained(Files.back(), FileName)) {
// We want to delete the entire line, so make sure that [Start,End] covers
// everything.
diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp
index d0b7474992ab..b53016f331b7 100644
--- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp
+++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp
@@ -25,7 +25,8 @@ public:
bool IsAngled, CharSourceRange FileNameRange,
OptionalFileEntryRef /*IncludedFile*/,
StringRef /*SearchPath*/, StringRef /*RelativePath*/,
- const Module * /*ImportedModule*/,
+ const Module * /*SuggestedModule*/,
+ bool /*ModuleImported*/,
SrcMgr::CharacteristicKind /*FileType*/) override {
Inserter->addInclude(FileNameRef, IsAngled, HashLocation,
IncludeToken.getEndLoc());
diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp
index 076e636e0e28..75f8668e7bef 100644
--- a/clang-tools-extra/clangd/Headers.cpp
+++ b/clang-tools-extra/clangd/Headers.cpp
@@ -41,7 +41,8 @@ public:
OptionalFileEntryRef File,
llvm::StringRef /*SearchPath*/,
llvm::StringRef /*RelativePath*/,
- const clang::Module * /*Imported*/,
+ const clang::Module * /*SuggestedModule*/,
+ bool /*ModuleImported*/,
SrcMgr::CharacteristicKind FileKind) override {
auto MainFID = SM.getMainFileID();
// If an include is part of the preamble patch, translate #line directives.
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 14a91797f4d2..bbb0e2c77b3f 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -244,7 +244,7 @@ private:
SynthesizedFilenameTok.getEndLoc())
.toCharRange(SM),
File, "SearchPath", "RelPath",
- /*Imported=*/nullptr, Inc.FileKind);
+ /*SuggestedModule=*/nullptr, /*ModuleImported=*/false, Inc.FileKind);
if (File)
Delegate->FileSkipped(*File, SynthesizedFilenameTok, Inc.FileKind);
}
diff --git a/clang-tools-extra/clangd/index/IndexAction.cpp b/clang-tools-extra/clangd/index/IndexAction.cpp
index 5d56285a8396..ed56c2a9d2e8 100644
--- a/clang-tools-extra/clangd/index/IndexAction.cpp
+++ b/clang-tools-extra/clangd/index/IndexAction.cpp
@@ -89,7 +89,8 @@ public:
llvm::StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, llvm::StringRef SearchPath,
- llvm::StringRef RelativePath, const Module *Imported,
+ llvm::StringRef RelativePath,
+ const Module *SuggestedModule, bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
auto IncludeURI = toURI(File);
if (!IncludeURI)
diff --git a/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp b/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp
index 472fe30ee46e..147d9abe6913 100644
--- a/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ReplayPeambleTests.cpp
@@ -72,7 +72,7 @@ struct ReplayPreamblePPCallback : public PPCallbacks {
void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok,
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange, OptionalFileEntryRef,
- StringRef, StringRef, const clang::Module *,
+ StringRef, StringRef, const clang::Module *, bool,
SrcMgr::CharacteristicKind) override {
Includes.emplace_back(SM, HashLoc, IncludeTok, FileName, IsAngled,
FilenameRange);
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index e50914aed5f0..f2fba9aa1450 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -117,6 +117,14 @@ Changes in existing checks
options `HeaderFileExtensions` and `ImplementationFileExtensions` by the
global options of the same name.
+- Improved :doc:`bugprone-too-small-loop-variable
+ <clang-tidy/checks/bugprone/too-small-loop-variable>` check by incorporating
+ better support for ``const`` loop boundaries.
+
+- Improved :doc:`bugprone-unused-local-non-trivial-variable
+ <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check by
+  ignoring local variables with ``[[maybe_unused]]`` attribute.
+
- Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer
<clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>`
by removing enforcement of rule `C.48
@@ -159,13 +167,13 @@ Changes in existing checks
Removed checks
^^^^^^^^^^^^^^
-Miscellaneous
-^^^^^^^^^^^^^
-
- Removed `cert-dcl21-cpp`, which was deprecated since :program:`clang-tidy` 17,
since the rule DCL21-CPP has been removed from the CERT guidelines.
-- Fixed incorrect formatting in ``clang-apply-repalcements`` when no ``--format``
+Miscellaneous
+^^^^^^^^^^^^^
+
+- Fixed incorrect formatting in ``clang-apply-replacements`` when no ``--format``
option is specified. Now ``clang-apply-replacements`` applies formatting only with
the option.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
index 0f45cc2fe114..2c3ded952aa0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
@@ -28,6 +28,10 @@ In a real use case size means a container's size which depends on the user input
This algorithm works for a small amount of objects, but will lead to freeze for
a larger user input.
+It's recommended to enable the compiler warning
+``-Wtautological-constant-out-of-range-compare`` as well, since the check does not
+inspect compile-time constant loop boundaries to avoid overlaps with the warning.
+
.. option:: MagnitudeBitsUpperLimit
Upper limit for the magnitude bits of the loop variable. If it's set the check
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
index 7531f19f3ebc..9f283de78fbd 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
@@ -11,6 +11,7 @@ The following types of variables are excluded from this check:
* exception variables in catch clauses
* static or thread local
* structured bindings
+* variables with ``[[maybe_unused]]`` attribute
This check can be configured to warn on all non-trivial variables by setting
`IncludeTypes` to `.*`, and excluding specific types using `ExcludeTypes`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst
index 974a56abd97d..298243fc3ced 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/deprecated-headers.rst
@@ -4,7 +4,7 @@ modernize-deprecated-headers
============================
Some headers from C library were deprecated in C++ and are no longer welcome in
-C++ codebases. Some have no effect in C++. For more details refer to the C++ 14
+C++ codebases. Some have no effect in C++. For more details refer to the C++14
Standard [depr.c.headers] section.
This check replaces C standard library headers with their C++ alternatives and
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst
index 0440ab855ea7..f8f34794af74 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-override.rst
@@ -10,7 +10,7 @@ removes ``virtual`` from those functions as it is not required.
user that a function was virtual. C++ compilers did not use the presence of
this to signify an overridden function.
-In C++ 11 ``override`` and ``final`` keywords were introduced to allow
+In C++11 ``override`` and ``final`` keywords were introduced to allow
overridden functions to be marked appropriately. Their presence allows
compilers to verify that an overridden function correctly overrides a base
class implementation.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst
index d802f9be829c..b07958188d31 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-return-with-void-value.rst
@@ -29,7 +29,7 @@ that should be written as
g();
return;
-to make clear that ``g()`` is called and immediately afterwards the function
+to make clear that ``g()`` is called and immediately afterwards the function
returns (nothing).
In C, the same issue is detected by the compiler if the ``-Wpedantic`` mode
@@ -46,6 +46,6 @@ Options
.. option:: StrictMode
The value `false` specifies that a direct return statement shall
- be excluded from the analysis if it is the only statement not
- contained in a block like ``if (cond) return g();``. The default
+ be excluded from the analysis if it is the only statement not
+ contained in a block, like ``if (cond) return g();``. The default
value is `true`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst
index 07d1e352d3b1..b28daecf7a2c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst
@@ -3,7 +3,7 @@
readability-container-contains
==============================
-Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++ 20.
+Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++20.
Whether an element is contained inside a container should be checked with ``contains`` instead of ``count``/``find`` because ``contains`` conveys the intent more clearly. Furthermore, for containers which permit multiple entries per key (``multimap``, ``multiset``, ...), ``contains`` is more efficient than ``count`` because ``count`` has to do unnecessary additional work.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst
index f7bd9ff89345..6e5876627510 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/use-anyofallof.rst
@@ -4,7 +4,7 @@ readability-use-anyofallof
==========================
Finds range-based for loops that can be replaced by a call to ``std::any_of`` or
-``std::all_of``. In C++ 20 mode, suggests ``std::ranges::any_of`` or
+``std::all_of``. In C++20 mode, suggests ``std::ranges::any_of`` or
``std::ranges::all_of``.
Example:
diff --git a/clang-tools-extra/include-cleaner/lib/Record.cpp b/clang-tools-extra/include-cleaner/lib/Record.cpp
index c93c56adf650..78a4df6cc40e 100644
--- a/clang-tools-extra/include-cleaner/lib/Record.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Record.cpp
@@ -65,7 +65,8 @@ public:
StringRef SpelledFilename, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind) override {
if (!Active)
return;
@@ -214,7 +215,8 @@ public:
OptionalFileEntryRef File,
llvm::StringRef /*SearchPath*/,
llvm::StringRef /*RelativePath*/,
- const clang::Module * /*Imported*/,
+ const clang::Module * /*SuggestedModule*/,
+ bool /*ModuleImported*/,
SrcMgr::CharacteristicKind FileKind) override {
FileID HashFID = SM.getFileID(HashLoc);
int HashLine = SM.getLineNumber(HashFID, SM.getFileOffset(HashLoc));
diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp
index 1e8b0aa37ca3..0e76c539aa3c 100644
--- a/clang-tools-extra/modularize/CoverageChecker.cpp
+++ b/clang-tools-extra/modularize/CoverageChecker.cpp
@@ -90,7 +90,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
Checker.collectUmbrellaHeaderHeader(File->getName());
}
diff --git a/clang-tools-extra/modularize/PreprocessorTracker.cpp b/clang-tools-extra/modularize/PreprocessorTracker.cpp
index 7557fb177ceb..85e3aab041e4 100644
--- a/clang-tools-extra/modularize/PreprocessorTracker.cpp
+++ b/clang-tools-extra/modularize/PreprocessorTracker.cpp
@@ -730,15 +730,14 @@ public:
~PreprocessorCallbacks() override {}
// Overridden handlers.
- void InclusionDirective(clang::SourceLocation HashLoc,
- const clang::Token &IncludeTok,
- llvm::StringRef FileName, bool IsAngled,
- clang::CharSourceRange FilenameRange,
- clang::OptionalFileEntryRef File,
- llvm::StringRef SearchPath,
- llvm::StringRef RelativePath,
- const clang::Module *Imported,
- clang::SrcMgr::CharacteristicKind FileType) override;
+ void
+ InclusionDirective(clang::SourceLocation HashLoc,
+ const clang::Token &IncludeTok, llvm::StringRef FileName,
+ bool IsAngled, clang::CharSourceRange FilenameRange,
+ clang::OptionalFileEntryRef File,
+ llvm::StringRef SearchPath, llvm::StringRef RelativePath,
+ const clang::Module *SuggestedModule, bool ModuleImported,
+ clang::SrcMgr::CharacteristicKind FileType) override;
void FileChanged(clang::SourceLocation Loc,
clang::PPCallbacks::FileChangeReason Reason,
clang::SrcMgr::CharacteristicKind FileType,
@@ -1275,7 +1274,8 @@ void PreprocessorCallbacks::InclusionDirective(
llvm::StringRef FileName, bool IsAngled,
clang::CharSourceRange FilenameRange, clang::OptionalFileEntryRef File,
llvm::StringRef SearchPath, llvm::StringRef RelativePath,
- const clang::Module *Imported, clang::SrcMgr::CharacteristicKind FileType) {
+ const clang::Module *SuggestedModule, bool ModuleImported,
+ clang::SrcMgr::CharacteristicKind FileType) {
int DirectiveLine, DirectiveColumn;
std::string HeaderPath = getSourceLocationFile(PP, HashLoc);
getSourceLocationLineAndColumn(PP, HashLoc, DirectiveLine, DirectiveColumn);
diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp
index a59a8278682b..3bb30fd15b2e 100644
--- a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp
+++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp
@@ -135,7 +135,8 @@ void PPCallbacksTracker::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, llvm::StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
llvm::StringRef SearchPath, llvm::StringRef RelativePath,
- const Module *Imported, SrcMgr::CharacteristicKind FileType) {
+ const Module *SuggestedModule, bool ModuleImported,
+ SrcMgr::CharacteristicKind FileType) {
beginCallback("InclusionDirective");
appendArgument("HashLoc", HashLoc);
appendArgument("IncludeTok", IncludeTok);
@@ -145,7 +146,8 @@ void PPCallbacksTracker::InclusionDirective(
appendArgument("File", File);
appendFilePathArgument("SearchPath", SearchPath);
appendFilePathArgument("RelativePath", RelativePath);
- appendArgument("Imported", Imported);
+ appendArgument("SuggestedModule", SuggestedModule);
+ appendArgument("ModuleImported", ModuleImported);
}
// Callback invoked whenever there was an explicit module-import
diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.h b/clang-tools-extra/pp-trace/PPCallbacksTracker.h
index c195a72b08c1..04590a919369 100644
--- a/clang-tools-extra/pp-trace/PPCallbacksTracker.h
+++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.h
@@ -95,7 +95,8 @@ public:
llvm::StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, llvm::StringRef SearchPath,
- llvm::StringRef RelativePath, const Module *Imported,
+ llvm::StringRef RelativePath,
+ const Module *SuggestedModule, bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path,
const Module *Imported) override;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
index 3229deb93bad..113150b16865 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/too-small-loop-variable.cpp
@@ -93,6 +93,18 @@ void voidBadForLoopWithMacroBound() {
}
}
+unsigned int getVal() {
+ return 300;
+}
+
+// The iteration's upper bound is a const variable initialized from a function call.
+void voidBadForLoop8() {
+ const unsigned int l = getVal();
+ for (unsigned char i = 0; i < l; ++i) {
+ // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: loop variable has narrower type 'unsigned char' than iteration's upper bound 'const unsigned int' [bugprone-too-small-loop-variable]
+ }
+}
+
////////////////////////////////////////////////////////////////////////////////
/// Correct loops: we should not warn here.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp
index 19f2344de4a6..3fdc24b94a6c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp
@@ -77,6 +77,7 @@ T qux(T Generic) {
// CHECK-MESSAGES: :[[@LINE-1]]:22: warning: unused local variable 'TemplateType' of type 'async::Future<T>' [bugprone-unused-local-non-trivial-variable]
a::Future<T> AliasTemplateType;
// CHECK-MESSAGES: :[[@LINE-1]]:18: warning: unused local variable 'AliasTemplateType' of type 'a::Future<T>' (aka 'Future<type-parameter-0-0>') [bugprone-unused-local-non-trivial-variable]
+ [[maybe_unused]] async::Future<Units> MaybeUnused;
return Generic;
}
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp
index 547f634a101c..d0efc5ca7637 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/diagnostic.cpp
@@ -68,5 +68,6 @@ auto S<>::foo(auto)
{
return 1;
}
-// CHECK8: error: template parameter list matching the non-templated nested type 'S<>' should be empty ('template<>') [clang-diagnostic-error]
+// CHECK8: error: conflicting types for 'foo' [clang-diagnostic-error]
+// CHECK8: note: previous declaration is here
#endif
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp
index db0b2c89430a..ea9896e1cfde 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp
@@ -59,7 +59,8 @@
// CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level1A.h"
// CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}pp-trace"
// CHECK-NEXT: RelativePath: "Inputs/Level1A.h"
-// CHECK-NEXT: Imported: (null)
+// CHECK-NEXT: SuggestedModule: (null)
+// CHECK-NEXT: ModuleImported: false
// CHECK-NEXT: - Callback: FileChanged
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level1A.h:1:1"
// CHECK-NEXT: Reason: EnterFile
@@ -74,7 +75,8 @@
// CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level2A.h"
// CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}Inputs"
// CHECK-NEXT: RelativePath: "Level2A.h"
-// CHECK-NEXT: Imported: (null)
+// CHECK-NEXT: SuggestedModule: (null)
+// CHECK-NEXT: ModuleImported: false
// CHECK-NEXT: - Callback: FileChanged
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level2A.h:1:1"
// CHECK-NEXT: Reason: EnterFile
@@ -105,7 +107,8 @@
// CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level1B.h"
// CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}pp-trace"
// CHECK-NEXT: RelativePath: "Inputs/Level1B.h"
-// CHECK-NEXT: Imported: (null)
+// CHECK-NEXT: SuggestedModule: (null)
+// CHECK-NEXT: ModuleImported: false
// CHECK-NEXT: - Callback: FileChanged
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level1B.h:1:1"
// CHECK-NEXT: Reason: EnterFile
@@ -120,7 +123,8 @@
// CHECK-NEXT: File: "{{.*}}{{[/\\]}}Inputs/Level2B.h"
// CHECK-NEXT: SearchPath: "{{.*}}{{[/\\]}}Inputs"
// CHECK-NEXT: RelativePath: "Level2B.h"
-// CHECK-NEXT: Imported: (null)
+// CHECK-NEXT: SuggestedModule: (null)
+// CHECK-NEXT: ModuleImported: false
// CHECK-NEXT: - Callback: FileChanged
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}Inputs/Level2B.h:1:1"
// CHECK-NEXT: Reason: EnterFile
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 0a8cc18c5b4c..fdf7bfaeaa4e 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -1531,114 +1531,8 @@ the configuration (without a prefix: ``Auto``).
.. _AlwaysBreakAfterReturnType:
-**AlwaysBreakAfterReturnType** (``ReturnTypeBreakingStyle``) :versionbadge:`clang-format 3.8` :ref:`¶ <AlwaysBreakAfterReturnType>`
- The function declaration return type breaking style to use.
-
- Possible values:
-
- * ``RTBS_None`` (in configuration: ``None``)
- This is **deprecated**. See ``Automatic`` below.
-
- * ``RTBS_Automatic`` (in configuration: ``Automatic``)
- Break after return type based on ``PenaltyReturnTypeOnItsOwnLine``.
-
- .. code-block:: c++
-
- class A {
- int f() { return 0; };
- };
- int f();
- int f() { return 1; }
- int
- LongName::AnotherLongName();
-
- * ``RTBS_ExceptShortType`` (in configuration: ``ExceptShortType``)
- Same as ``Automatic`` above, except that there is no break after short
- return types.
-
- .. code-block:: c++
-
- class A {
- int f() { return 0; };
- };
- int f();
- int f() { return 1; }
- int LongName::
- AnotherLongName();
-
- * ``RTBS_All`` (in configuration: ``All``)
- Always break after the return type.
-
- .. code-block:: c++
-
- class A {
- int
- f() {
- return 0;
- };
- };
- int
- f();
- int
- f() {
- return 1;
- }
- int
- LongName::AnotherLongName();
-
- * ``RTBS_TopLevel`` (in configuration: ``TopLevel``)
- Always break after the return types of top-level functions.
-
- .. code-block:: c++
-
- class A {
- int f() { return 0; };
- };
- int
- f();
- int
- f() {
- return 1;
- }
- int
- LongName::AnotherLongName();
-
- * ``RTBS_AllDefinitions`` (in configuration: ``AllDefinitions``)
- Always break after the return type of function definitions.
-
- .. code-block:: c++
-
- class A {
- int
- f() {
- return 0;
- };
- };
- int f();
- int
- f() {
- return 1;
- }
- int
- LongName::AnotherLongName();
-
- * ``RTBS_TopLevelDefinitions`` (in configuration: ``TopLevelDefinitions``)
- Always break after the return type of top-level definitions.
-
- .. code-block:: c++
-
- class A {
- int f() { return 0; };
- };
- int f();
- int
- f() {
- return 1;
- }
- int
- LongName::AnotherLongName();
-
-
+**AlwaysBreakAfterReturnType** (``deprecated``) :versionbadge:`clang-format 3.8` :ref:`¶ <AlwaysBreakAfterReturnType>`
+ This option is renamed to ``BreakAfterReturnType``.
.. _AlwaysBreakBeforeMultilineStrings:
@@ -1659,62 +1553,8 @@ the configuration (without a prefix: ``Auto``).
.. _AlwaysBreakTemplateDeclarations:
-**AlwaysBreakTemplateDeclarations** (``BreakTemplateDeclarationsStyle``) :versionbadge:`clang-format 3.4` :ref:`¶ <AlwaysBreakTemplateDeclarations>`
- The template declaration breaking style to use.
-
- Possible values:
-
- * ``BTDS_Leave`` (in configuration: ``Leave``)
- Do not change the line breaking before the declaration.
-
- .. code-block:: c++
-
- template <typename T>
- T foo() {
- }
- template <typename T> T foo(int aaaaaaaaaaaaaaaaaaaaa,
- int bbbbbbbbbbbbbbbbbbbbb) {
- }
-
- * ``BTDS_No`` (in configuration: ``No``)
- Do not force break before declaration.
- ``PenaltyBreakTemplateDeclaration`` is taken into account.
-
- .. code-block:: c++
-
- template <typename T> T foo() {
- }
- template <typename T> T foo(int aaaaaaaaaaaaaaaaaaaaa,
- int bbbbbbbbbbbbbbbbbbbbb) {
- }
-
- * ``BTDS_MultiLine`` (in configuration: ``MultiLine``)
- Force break after template declaration only when the following
- declaration spans multiple lines.
-
- .. code-block:: c++
-
- template <typename T> T foo() {
- }
- template <typename T>
- T foo(int aaaaaaaaaaaaaaaaaaaaa,
- int bbbbbbbbbbbbbbbbbbbbb) {
- }
-
- * ``BTDS_Yes`` (in configuration: ``Yes``)
- Always break after template declaration.
-
- .. code-block:: c++
-
- template <typename T>
- T foo() {
- }
- template <typename T>
- T foo(int aaaaaaaaaaaaaaaaaaaaa,
- int bbbbbbbbbbbbbbbbbbbbb) {
- }
-
-
+**AlwaysBreakTemplateDeclarations** (``deprecated``) :versionbadge:`clang-format 3.4` :ref:`¶ <AlwaysBreakTemplateDeclarations>`
+ This option is renamed to ``BreakTemplateDeclarations``.
.. _AttributeMacros:
@@ -2273,6 +2113,117 @@ the configuration (without a prefix: ``Auto``).
@Mock
DataLoad loader;
+.. _BreakAfterReturnType:
+
+**BreakAfterReturnType** (``ReturnTypeBreakingStyle``) :versionbadge:`clang-format 19` :ref:`¶ <BreakAfterReturnType>`
+ The function declaration return type breaking style to use.
+
+ Possible values:
+
+ * ``RTBS_None`` (in configuration: ``None``)
+ This is **deprecated**. See ``Automatic`` below.
+
+ * ``RTBS_Automatic`` (in configuration: ``Automatic``)
+ Break after return type based on ``PenaltyReturnTypeOnItsOwnLine``.
+
+ .. code-block:: c++
+
+ class A {
+ int f() { return 0; };
+ };
+ int f();
+ int f() { return 1; }
+ int
+ LongName::AnotherLongName();
+
+ * ``RTBS_ExceptShortType`` (in configuration: ``ExceptShortType``)
+ Same as ``Automatic`` above, except that there is no break after short
+ return types.
+
+ .. code-block:: c++
+
+ class A {
+ int f() { return 0; };
+ };
+ int f();
+ int f() { return 1; }
+ int LongName::
+ AnotherLongName();
+
+ * ``RTBS_All`` (in configuration: ``All``)
+ Always break after the return type.
+
+ .. code-block:: c++
+
+ class A {
+ int
+ f() {
+ return 0;
+ };
+ };
+ int
+ f();
+ int
+ f() {
+ return 1;
+ }
+ int
+ LongName::AnotherLongName();
+
+ * ``RTBS_TopLevel`` (in configuration: ``TopLevel``)
+ Always break after the return types of top-level functions.
+
+ .. code-block:: c++
+
+ class A {
+ int f() { return 0; };
+ };
+ int
+ f();
+ int
+ f() {
+ return 1;
+ }
+ int
+ LongName::AnotherLongName();
+
+ * ``RTBS_AllDefinitions`` (in configuration: ``AllDefinitions``)
+ Always break after the return type of function definitions.
+
+ .. code-block:: c++
+
+ class A {
+ int
+ f() {
+ return 0;
+ };
+ };
+ int f();
+ int
+ f() {
+ return 1;
+ }
+ int
+ LongName::AnotherLongName();
+
+ * ``RTBS_TopLevelDefinitions`` (in configuration: ``TopLevelDefinitions``)
+ Always break after the return type of top-level definitions.
+
+ .. code-block:: c++
+
+ class A {
+ int f() { return 0; };
+ };
+ int f();
+ int
+ f() {
+ return 1;
+ }
+ int
+ LongName::AnotherLongName();
+
+
+
.. _BreakArrays:
**BreakArrays** (``Boolean``) :versionbadge:`clang-format 16` :ref:`¶ <BreakArrays>`
@@ -3014,6 +2965,65 @@ the configuration (without a prefix: ``Auto``).
string x =
"veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongString";
+.. _BreakTemplateDeclarations:
+
+**BreakTemplateDeclarations** (``BreakTemplateDeclarationsStyle``) :versionbadge:`clang-format 19` :ref:`¶ <BreakTemplateDeclarations>`
+ The template declaration breaking style to use.
+
+ Possible values:
+
+ * ``BTDS_Leave`` (in configuration: ``Leave``)
+ Do not change the line breaking before the declaration.
+
+ .. code-block:: c++
+
+ template <typename T>
+ T foo() {
+ }
+ template <typename T> T foo(int aaaaaaaaaaaaaaaaaaaaa,
+ int bbbbbbbbbbbbbbbbbbbbb) {
+ }
+
+ * ``BTDS_No`` (in configuration: ``No``)
+ Do not force break before declaration.
+ ``PenaltyBreakTemplateDeclaration`` is taken into account.
+
+ .. code-block:: c++
+
+ template <typename T> T foo() {
+ }
+ template <typename T> T foo(int aaaaaaaaaaaaaaaaaaaaa,
+ int bbbbbbbbbbbbbbbbbbbbb) {
+ }
+
+ * ``BTDS_MultiLine`` (in configuration: ``MultiLine``)
+ Force break after template declaration only when the following
+ declaration spans multiple lines.
+
+ .. code-block:: c++
+
+ template <typename T> T foo() {
+ }
+ template <typename T>
+ T foo(int aaaaaaaaaaaaaaaaaaaaa,
+ int bbbbbbbbbbbbbbbbbbbbb) {
+ }
+
+ * ``BTDS_Yes`` (in configuration: ``Yes``)
+ Always break after template declaration.
+
+ .. code-block:: c++
+
+ template <typename T>
+ T foo() {
+ }
+ template <typename T>
+ T foo(int aaaaaaaaaaaaaaaaaaaaa,
+ int bbbbbbbbbbbbbbbbbbbbb) {
+ }
+
+
+
.. _ColumnLimit:
**ColumnLimit** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ <ColumnLimit>`
@@ -4156,7 +4166,7 @@ the configuration (without a prefix: ``Auto``).
.. _MainIncludeChar:
-**MainIncludeChar** (``MainIncludeCharDiscriminator``) :versionbadge:`clang-format 18` :ref:`¶ <MainIncludeChar>`
+**MainIncludeChar** (``MainIncludeCharDiscriminator``) :versionbadge:`clang-format 19` :ref:`¶ <MainIncludeChar>`
When guessing whether a #include is the "main" include, only the include
directives that use the specified character are considered.
diff --git a/clang/docs/ClangLinkerWrapper.rst b/clang/docs/ClangLinkerWrapper.rst
index 6d7770b50e72..3bef55847573 100644
--- a/clang/docs/ClangLinkerWrapper.rst
+++ b/clang/docs/ClangLinkerWrapper.rst
@@ -79,6 +79,14 @@ linking is desired, simply do not run the binaries through the
``clang-linker-wrapper``. This will simply append the embedded device code so
that it can be linked later.
+Matching
+========
+
+The linker wrapper will link extracted device code that is compatible with each
+other. Generally, this requires that the target triple and architecture match.
+An exception is made when the architecture is listed as ``generic``, which will
+cause it to be linked with any other device code with the same target triple.
+
Example
=======
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index e91156837290..ee1d25396ca8 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2764,6 +2764,39 @@ Query for this feature with ``__has_builtin(__builtin_readcyclecounter)``. Note
that even if present, its use may depend on run-time privilege or other OS
controlled state.
+``__builtin_readsteadycounter``
+-------------------------------
+
+``__builtin_readsteadycounter`` is used to access the fixed frequency counter
+register (or a similar steady-rate clock) on those targets that support it.
+The function is similar to ``__builtin_readcyclecounter`` above except that the
+frequency is fixed, making it suitable for measuring elapsed time.
+
+**Syntax**:
+
+.. code-block:: c++
+
+ __builtin_readsteadycounter()
+
+**Example of Use**:
+
+.. code-block:: c++
+
+ unsigned long long t0 = __builtin_readsteadycounter();
+ do_something();
+ unsigned long long t1 = __builtin_readsteadycounter();
+ unsigned long long secs_to_do_something = (t1 - t0) / tick_rate;
+
+**Description**:
+
+The ``__builtin_readsteadycounter()`` builtin returns the frequency counter value.
+When not supported by the target, the return value is always zero. This builtin
+takes no arguments and produces an unsigned long long result. The builtin does
+not guarantee any particular frequency, only that it is stable. Knowledge of the
+counter's true frequency will need to be provided by the user.
+
+Query for this feature with ``__has_builtin(__builtin_readsteadycounter)``.
+
``__builtin_dump_struct``
-------------------------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 52a48c750fe5..dc2fb3b25e3a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -117,6 +117,9 @@ C23 Feature Support
Non-comprehensive list of changes in this release
-------------------------------------------------
+- Added ``__builtin_readsteadycounter`` for reading fixed frequency hardware
+ counters.
+
New Compiler Flags
------------------
@@ -149,7 +152,12 @@ Improvements to Clang's diagnostics
prints.
- Clang now diagnoses member template declarations with multiple declarators.
-- Clang now diagnoses use of the ``template`` keyword after declarative nested name specifiers.
+
+- Clang now diagnoses use of the ``template`` keyword after declarative nested
+ name specifiers.
+
+- The ``-Wshorten-64-to-32`` diagnostic is now grouped under ``-Wimplicit-int-conversion`` instead
+ of ``-Wconversion``. Fixes `#69444 <https://github.com/llvm/llvm-project/issues/69444>`_.
Improvements to Clang's time-trace
----------------------------------
@@ -160,6 +168,9 @@ Bug Fixes in This Version
a member class template for an implicit instantiation of a class template.
- Fixed missing warnings when doing bool-like conversions in C23 (`#79435 <https://github.com/llvm/llvm-project/issues/79435>`_).
+- Clang's ``-Wshadow`` no longer warns when an init-capture is named the same as
+ a class field unless the lambda can capture this.
+ Fixes (`#71976 <https://github.com/llvm/llvm-project/issues/71976>`_)
- Clang now accepts qualified partial/explicit specializations of variable templates that
are not nominable in the lookup context of the specialization.
@@ -177,6 +188,10 @@ Bug Fixes to Attribute Support
Bug Fixes to C++ Support
^^^^^^^^^^^^^^^^^^^^^^^^
+- Fix crash when calling the constructor of an invalid class.
+ Fixes (`#10518 <https://github.com/llvm/llvm-project/issues/10518>`_),
+ (`#67914 <https://github.com/llvm/llvm-project/issues/10518>`_),
+ and (`#78388 <https://github.com/llvm/llvm-project/issues/78388>`_)
- Fix crash when using lifetimebound attribute in function with trailing return.
Fixes (`#73619 <https://github.com/llvm/llvm-project/issues/73619>`_)
- Addressed an issue where constraints involving injected class types are perceived
@@ -210,6 +225,14 @@ Bug Fixes to C++ Support
Fixes (`#68490 <https://github.com/llvm/llvm-project/issues/68490>`_)
- Fix a crash when trying to call a varargs function that also has an explicit object parameter.
Fixes (`#80971 ICE when explicit object parameter be a function parameter pack`)
+- Fixed a bug where abbreviated function templates would append their invented template parameters to
+ an empty template parameter lists.
+- Clang now classifies aggregate initialization in C++17 and newer as constant
+ or non-constant more accurately. Previously, only a subset of the initializer
+ elements were considered, misclassifying some initializers as constant. Fixes
  some of (`#80510 <https://github.com/llvm/llvm-project/issues/80510>`_).
+- Clang now ignores top-level cv-qualifiers on function parameters in template partial orderings.
+ (`#75404 <https://github.com/llvm/llvm-project/issues/75404>`_)
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -235,6 +258,8 @@ X86 Support
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
+- Fixed the incorrect definition of the __ARM_ARCH macro for architectures greater than or equal to v8.1.
+
Android Support
^^^^^^^^^^^^^^^
@@ -276,6 +301,11 @@ AST Matchers
clang-format
------------
+- ``AlwaysBreakTemplateDeclarations`` is deprecated and renamed to
+ ``BreakTemplateDeclarations``.
+- ``AlwaysBreakAfterReturnType`` is deprecated and renamed to
+ ``BreakAfterReturnType``.
+
libclang
--------
diff --git a/clang/docs/ShadowCallStack.rst b/clang/docs/ShadowCallStack.rst
index 6e5192fd6723..d7ece11b3526 100644
--- a/clang/docs/ShadowCallStack.rst
+++ b/clang/docs/ShadowCallStack.rst
@@ -57,19 +57,25 @@ compiled application or the operating system. Integrating the runtime into
the operating system should be preferred since otherwise all thread creation
and destruction would need to be intercepted by the application.
-The instrumentation makes use of the platform register ``x18`` on AArch64 and
-``x3`` (``gp``) on RISC-V. For simplicity we will refer to this as the
-``SCSReg``. On some platforms, ``SCSReg`` is reserved, and on others, it is
-designated as a scratch register. This generally means that any code that may
-run on the same thread as code compiled with ShadowCallStack must either target
-one of the platforms whose ABI reserves ``SCSReg`` (currently Android, Darwin,
-Fuchsia and Windows) or be compiled with a flag to reserve that register (e.g.,
-``-ffixed-x18``). If absolutely necessary, code compiled without reserving the
-register may be run on the same thread as code that uses ShadowCallStack by
-saving the register value temporarily on the stack (`example in Android`_) but
-this should be done with care since it risks leaking the shadow call stack
-address.
-
+The instrumentation makes use of the platform register ``x18`` on AArch64,
+``x3`` (``gp``) on RISC-V with software shadow stack and ``ssp`` on RISC-V with
+hardware shadow stack, which needs `Zicfiss`_ and ``-mno-forced-sw-shadow-stack``
+(default option). Note that with `Zicfiss`_ the RISC-V backend will default to
+the hardware based shadow call stack. Users can force the RISC-V backend to
+generate the software shadow call stack with `Zicfiss`_ by passing
+``-mforced-sw-shadow-stack``.
+For simplicity we will refer to this as the ``SCSReg``. On some platforms,
+``SCSReg`` is reserved, and on others, it is designated as a scratch register.
+This generally means that any code that may run on the same thread as code
+compiled with ShadowCallStack must either target one of the platforms whose ABI
+reserves ``SCSReg`` (currently Android, Darwin, Fuchsia and Windows) or be
+compiled with a flag to reserve that register (e.g., ``-ffixed-x18``). If
+absolutely necessary, code compiled without reserving the register may be run on
+the same thread as code that uses ShadowCallStack by saving the register value
+temporarily on the stack (`example in Android`_) but this should be done with
+care since it risks leaking the shadow call stack address.
+
+.. _`Zicfiss`: https://github.com/riscv/riscv-cfi/blob/main/cfi_backward.adoc
.. _`example in Android`: https://android-review.googlesource.com/c/platform/frameworks/base/+/803717
Because it requires a dedicated register, the ShadowCallStack feature is
@@ -151,9 +157,13 @@ Usage
To enable ShadowCallStack, just pass the ``-fsanitize=shadow-call-stack`` flag
to both compile and link command lines. On aarch64, you also need to pass
-``-ffixed-x18`` unless your target already reserves ``x18``. On RISC-V, ``x3``
-(``gp``) is always reserved. It is, however, important to disable GP relaxation
-in the linker. This can be done with the ``--no-relax-gp`` flag in GNU ld.
+``-ffixed-x18`` unless your target already reserves ``x18``. No additional flags
+need to be passed on RISC-V because the software based shadow stack uses
+``x3`` (``gp``), which is always reserved, and the hardware based shadow call
+stack uses a dedicated register, ``ssp``.
+However, it is important to disable GP relaxation in the linker when using the
+software based shadow call stack on RISC-V. This can be done with the
+``--no-relax-gp`` flag in GNU ld, and is off by default in LLD.
Low-level API
-------------
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index bb637cf1b800..510629d8a2d4 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1890,28 +1890,6 @@ the locking/unlocking of ``mtx_t`` mutexes.
mtx_lock(&mtx1); // warn: This lock has already been acquired
}
-.. _alpha-core-CallAndMessageUnInitRefArg:
-
-alpha.core.CallAndMessageUnInitRefArg (C,C++, ObjC)
-"""""""""""""""""""""""""""""""""""""""""""""""""""
-Check for logical errors for function calls and Objective-C
-message expressions (e.g., uninitialized arguments, null function pointers, and pointer to undefined variables).
-
-.. code-block:: c
-
- void test(void) {
- int t;
- int &p = t;
- int &s = p;
- int &q = s;
- foo(q); // warn
- }
-
- void test(void) {
- int x;
- foo(&x); // warn
- }
-
.. _alpha-core-CastSize:
alpha.core.CastSize (C)
diff --git a/clang/docs/tools/dump_format_style.py b/clang/docs/tools/dump_format_style.py
index e41891f07de2..af0124b94eca 100755
--- a/clang/docs/tools/dump_format_style.py
+++ b/clang/docs/tools/dump_format_style.py
@@ -308,6 +308,7 @@ class OptionsReader:
enum = None
nested_struct = None
version = None
+ deprecated = False
for line in self.header:
self.lineno += 1
@@ -327,6 +328,8 @@ class OptionsReader:
match = re.match(r"/// \\version\s*(?P<version>[0-9.]+)*", line)
if match:
version = match.group("version")
+ elif line.startswith("/// @deprecated"):
+ deprecated = True
elif line.startswith("///"):
comment += self.__clean_comment_line(line)
elif line.startswith("enum"):
@@ -345,6 +348,9 @@ class OptionsReader:
field_type, field_name = re.match(
r"([<>:\w(,\s)]+)\s+(\w+);", line
).groups()
+ if deprecated:
+ field_type = "deprecated"
+ deprecated = False
if not version:
self.__warning(f"missing version for {field_name}", line)
@@ -456,6 +462,7 @@ class OptionsReader:
"std::vector<IncludeCategory>",
"std::vector<RawStringFormat>",
"std::optional<unsigned>",
+ "deprecated",
]:
if option.type in enums:
option.enum = enums[option.type]
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 6af41424ba89..3f3620609b6d 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2145,7 +2145,11 @@ enum CXCursorKind {
*/
CXCursor_OMPScopeDirective = 306,
- CXCursor_LastStmt = CXCursor_OMPScopeDirective,
+ /** OpenACC Compute Construct.
+ */
+ CXCursor_OpenACCComputeConstruct = 320,
+
+ CXCursor_LastStmt = CXCursor_OpenACCComputeConstruct,
/**
* Cursor that represents the translation unit itself.
diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h
index 1d116becf06c..93bb045d6a66 100644
--- a/clang/include/clang/APINotes/Types.h
+++ b/clang/include/clang/APINotes/Types.h
@@ -55,16 +55,20 @@ public:
std::string UnavailableMsg;
/// Whether this entity is marked unavailable.
+ LLVM_PREFERRED_TYPE(bool)
unsigned Unavailable : 1;
/// Whether this entity is marked unavailable in Swift.
+ LLVM_PREFERRED_TYPE(bool)
unsigned UnavailableInSwift : 1;
private:
/// Whether SwiftPrivate was specified.
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftPrivateSpecified : 1;
/// Whether this entity is considered "private" to a Swift overlay.
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftPrivate : 1;
public:
@@ -191,18 +195,25 @@ inline bool operator!=(const CommonTypeInfo &LHS, const CommonTypeInfo &RHS) {
/// Describes API notes data for an Objective-C class or protocol.
class ObjCContextInfo : public CommonTypeInfo {
/// Whether this class has a default nullability.
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasDefaultNullability : 1;
/// The default nullability.
+ LLVM_PREFERRED_TYPE(NullabilityKind)
unsigned DefaultNullability : 2;
/// Whether this class has designated initializers recorded.
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasDesignatedInits : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftImportAsNonGenericSpecified : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftImportAsNonGeneric : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftObjCMembersSpecified : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftObjCMembers : 1;
public:
@@ -298,10 +309,12 @@ inline bool operator!=(const ObjCContextInfo &LHS, const ObjCContextInfo &RHS) {
/// API notes for a variable/property.
class VariableInfo : public CommonEntityInfo {
/// Whether this property has been audited for nullability.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NullabilityAudited : 1;
/// The kind of nullability for this property. Only valid if the nullability
/// has been audited.
+ LLVM_PREFERRED_TYPE(NullabilityKind)
unsigned Nullable : 2;
/// The C type of the variable, as a string.
@@ -352,7 +365,9 @@ inline bool operator!=(const VariableInfo &LHS, const VariableInfo &RHS) {
/// Describes API notes data for an Objective-C property.
class ObjCPropertyInfo : public VariableInfo {
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftImportAsAccessorsSpecified : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SwiftImportAsAccessors : 1;
public:
@@ -409,9 +424,11 @@ inline bool operator!=(const ObjCPropertyInfo &LHS,
/// Describes a function or method parameter.
class ParamInfo : public VariableInfo {
/// Whether noescape was specified.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoEscapeSpecified : 1;
   /// Whether this parameter has the 'noescape' attribute.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoEscape : 1;
/// A biased RetainCountConventionKind, where 0 means "unspecified".
@@ -488,6 +505,7 @@ public:
// unknown nullability.
/// Whether the signature has been audited with respect to nullability.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NullabilityAudited : 1;
/// Number of types whose nullability is encoded with the NullabilityPayload.
@@ -597,9 +615,11 @@ inline bool operator!=(const FunctionInfo &LHS, const FunctionInfo &RHS) {
class ObjCMethodInfo : public FunctionInfo {
public:
/// Whether this is a designated initializer of its class.
+ LLVM_PREFERRED_TYPE(bool)
unsigned DesignatedInit : 1;
/// Whether this is a required initializer.
+ LLVM_PREFERRED_TYPE(bool)
unsigned RequiredInit : 1;
ObjCMethodInfo() : DesignatedInit(false), RequiredInit(false) {}
@@ -650,7 +670,9 @@ public:
/// Describes API notes data for a tag.
class TagInfo : public CommonTypeInfo {
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasFlagEnum : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsFlagEnum : 1;
public:
diff --git a/clang/include/clang/AST/CommentCommandTraits.h b/clang/include/clang/AST/CommentCommandTraits.h
index 83a29a540d42..0c3254d84eb0 100644
--- a/clang/include/clang/AST/CommentCommandTraits.h
+++ b/clang/include/clang/AST/CommentCommandTraits.h
@@ -50,52 +50,65 @@ struct CommandInfo {
unsigned NumArgs : 4;
/// True if this command is a inline command (of any kind).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsInlineCommand : 1;
/// True if this command is a block command (of any kind).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsBlockCommand : 1;
/// True if this command is introducing a brief documentation
/// paragraph (\or an alias).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsBriefCommand : 1;
/// True if this command is \\returns or an alias.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsReturnsCommand : 1;
/// True if this command is introducing documentation for a function
/// parameter (\\param or an alias).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsParamCommand : 1;
/// True if this command is introducing documentation for
/// a template parameter (\\tparam or an alias).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsTParamCommand : 1;
/// True if this command is \\throws or an alias.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsThrowsCommand : 1;
/// True if this command is \\deprecated or an alias.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsDeprecatedCommand : 1;
/// True if this is a \\headerfile-like command.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsHeaderfileCommand : 1;
/// True if we don't want to warn about this command being passed an empty
/// paragraph. Meaningful only for block commands.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsEmptyParagraphAllowed : 1;
/// True if this command is a verbatim-like block command.
///
/// A verbatim-like block command eats every character (except line starting
/// decorations) until matching end command is seen or comment end is hit.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsVerbatimBlockCommand : 1;
/// True if this command is an end command for a verbatim-like block.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsVerbatimBlockEndCommand : 1;
/// True if this command is a verbatim line command.
///
/// A verbatim-like line command eats everything until a newline is seen or
/// comment end is hit.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsVerbatimLineCommand : 1;
/// True if this command contains a declaration for the entity being
@@ -105,20 +118,25 @@ struct CommandInfo {
/// \code
/// \fn void f(int a);
/// \endcode
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsDeclarationCommand : 1;
/// True if verbatim-like line command is a function declaration.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsFunctionDeclarationCommand : 1;
/// True if block command is further describing a container API; such
/// as \@coclass, \@classdesign, etc.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsRecordLikeDetailCommand : 1;
/// True if block command is a container API; such as \@interface.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsRecordLikeDeclarationCommand : 1;
/// True if this command is unknown. This \c CommandInfo object was
/// created during parsing.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsUnknownCommand : 1;
};
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index f26fb5ad5f13..61117cc5ce71 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2615,10 +2615,18 @@ public:
/// the target functionality.
bool isTargetMultiVersion() const;
+ /// True if this function is the default version of a multiversioned dispatch
+ /// function as a part of the target functionality.
+ bool isTargetMultiVersionDefault() const;
+
/// True if this function is a multiversioned dispatch function as a part of
/// the target-clones functionality.
bool isTargetClonesMultiVersion() const;
+ /// True if this function is a multiversioned dispatch function as a part of
+ /// the target-version functionality.
+ bool isTargetVersionMultiVersion() const;
+
/// \brief Get the associated-constraints of this function declaration.
/// Currently, this will either be a vector of size 1 containing the
/// trailing-requires-clause or an empty vector.
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index baf71145d99d..e3b6a7efb112 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -134,6 +134,7 @@ public:
const_iterator end() const { return begin() + NumParams; }
unsigned size() const { return NumParams; }
+ bool empty() const { return NumParams == 0; }
ArrayRef<NamedDecl *> asArray() { return llvm::ArrayRef(begin(), end()); }
ArrayRef<const NamedDecl*> asArray() const {
diff --git a/clang/include/clang/AST/RawCommentList.h b/clang/include/clang/AST/RawCommentList.h
index 53aae24fa7bb..3e4567b546a7 100644
--- a/clang/include/clang/AST/RawCommentList.h
+++ b/clang/include/clang/AST/RawCommentList.h
@@ -175,17 +175,22 @@ private:
mutable StringRef RawText;
mutable const char *BriefText = nullptr;
- mutable bool RawTextValid : 1; ///< True if RawText is valid
- mutable bool BriefTextValid : 1; ///< True if BriefText is valid
+ LLVM_PREFERRED_TYPE(bool)
+ mutable unsigned RawTextValid : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ mutable unsigned BriefTextValid : 1;
LLVM_PREFERRED_TYPE(CommentKind)
unsigned Kind : 3;
/// True if comment is attached to a declaration in ASTContext.
- bool IsAttached : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned IsAttached : 1;
- bool IsTrailingComment : 1;
- bool IsAlmostTrailingComment : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned IsTrailingComment : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned IsAlmostTrailingComment : 1;
/// Constructor for AST deserialization.
RawComment(SourceRange SR, CommentKind K, bool IsTrailingComment,
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 9da5206a21c3..5080551ada4f 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -34,6 +34,7 @@
#include "clang/AST/Stmt.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtObjC.h"
+#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/TemplateBase.h"
#include "clang/AST/TemplateName.h"
@@ -505,6 +506,9 @@ private:
bool VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *Node);
bool PostVisitStmt(Stmt *S);
+ bool TraverseOpenACCConstructStmt(OpenACCConstructStmt *S);
+ bool
+ TraverseOpenACCAssociatedStmtConstruct(OpenACCAssociatedStmtConstruct *S);
};
template <typename Derived>
@@ -3910,6 +3914,24 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXBareClause(OMPXBareClause *C) {
return true;
}
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::TraverseOpenACCConstructStmt(
+ OpenACCConstructStmt *) {
+ // TODO OpenACC: When we implement clauses, ensure we traverse them here.
+ return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::TraverseOpenACCAssociatedStmtConstruct(
+ OpenACCAssociatedStmtConstruct *S) {
+ TRY_TO(TraverseOpenACCConstructStmt(S));
+ TRY_TO(TraverseStmt(S->getAssociatedStmt()));
+ return true;
+}
+
+DEF_TRAVERSE_STMT(OpenACCComputeConstruct,
+ { TRY_TO(TraverseOpenACCAssociatedStmtConstruct(S)); })
+
// FIXME: look at the following tricky-seeming exprs to see if we
// need to recurse on anything. These are ones that have methods
// returning decls or qualtypes or nestednamespecifier -- though I'm
diff --git a/clang/include/clang/AST/StmtObjC.h b/clang/include/clang/AST/StmtObjC.h
index c46ff4634c82..03bc61f54cdf 100644
--- a/clang/include/clang/AST/StmtObjC.h
+++ b/clang/include/clang/AST/StmtObjC.h
@@ -177,7 +177,8 @@ class ObjCAtTryStmt final
unsigned NumCatchStmts : 16;
// Whether this statement has a \@finally statement.
- bool HasFinally : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned HasFinally : 1;
/// Retrieve the statements that are stored after this \@try statement.
///
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
new file mode 100644
index 000000000000..9424f4f08078
--- /dev/null
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -0,0 +1,142 @@
+//===- StmtOpenACC.h - Classes for OpenACC directives ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines OpenACC AST classes for statement-level constructs.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_STMTOPENACC_H
+#define LLVM_CLANG_AST_STMTOPENACC_H
+
+#include "clang/AST/Stmt.h"
+#include "clang/Basic/OpenACCKinds.h"
+#include "clang/Basic/SourceLocation.h"
+
+namespace clang {
+/// This is the base class for an OpenACC statement-level construct, other
+/// construct types are expected to inherit from this.
+class OpenACCConstructStmt : public Stmt {
+ friend class ASTStmtWriter;
+ friend class ASTStmtReader;
+ /// The directive kind. Each implementation of this interface should handle
+ /// specific kinds.
+ OpenACCDirectiveKind Kind = OpenACCDirectiveKind::Invalid;
+ /// The location of the directive statement, from the '#' to the last token of
+ /// the directive.
+ SourceRange Range;
+
+ // TODO OPENACC: Clauses should probably be collected in this class.
+
+protected:
+ OpenACCConstructStmt(StmtClass SC, OpenACCDirectiveKind K,
+ SourceLocation Start, SourceLocation End)
+ : Stmt(SC), Kind(K), Range(Start, End) {}
+
+public:
+ OpenACCDirectiveKind getDirectiveKind() const { return Kind; }
+
+ static bool classof(const Stmt *S) {
+ return S->getStmtClass() >= firstOpenACCConstructStmtConstant &&
+ S->getStmtClass() <= lastOpenACCConstructStmtConstant;
+ }
+
+ SourceLocation getBeginLoc() const { return Range.getBegin(); }
+ SourceLocation getEndLoc() const { return Range.getEnd(); }
+
+ child_range children() {
+ return child_range(child_iterator(), child_iterator());
+ }
+
+ const_child_range children() const {
+ return const_cast<OpenACCConstructStmt *>(this)->children();
+ }
+};
+
+/// This is a base class for any OpenACC statement-level constructs that have an
+/// associated statement. This class is not intended to be instantiated, but is
+/// a convenient place to hold the associated statement.
+class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt {
+ friend class ASTStmtWriter;
+ friend class ASTStmtReader;
+ template <typename Derived> friend class RecursiveASTVisitor;
+ Stmt *AssociatedStmt = nullptr;
+
+protected:
+ OpenACCAssociatedStmtConstruct(StmtClass SC, OpenACCDirectiveKind K,
+ SourceLocation Start, SourceLocation End)
+ : OpenACCConstructStmt(SC, K, Start, End) {}
+
+ void setAssociatedStmt(Stmt *S) { AssociatedStmt = S; }
+ Stmt *getAssociatedStmt() { return AssociatedStmt; }
+ const Stmt *getAssociatedStmt() const {
+ return const_cast<OpenACCAssociatedStmtConstruct *>(this)
+ ->getAssociatedStmt();
+ }
+
+public:
+ child_range children() {
+ if (getAssociatedStmt())
+ return child_range(&AssociatedStmt, &AssociatedStmt + 1);
+ return child_range(child_iterator(), child_iterator());
+ }
+
+ const_child_range children() const {
+ return const_cast<OpenACCAssociatedStmtConstruct *>(this)->children();
+ }
+};
+/// This class represents a compute construct, representing a 'Kind' of
+/// 'parallel', 'serial', or 'kernels'. These constructs are associated with a
+/// 'structured block', defined as:
+///
+/// in C or C++, an executable statement, possibly compound, with a single
+/// entry at the top and a single exit at the bottom
+///
+/// At the moment there is no real motivation to have a different AST node for
+/// those three, as they are semantically identical, and have only minor
+/// differences in the permitted list of clauses, which can be differentiated by
+/// the 'Kind'.
+class OpenACCComputeConstruct : public OpenACCAssociatedStmtConstruct {
+ friend class ASTStmtWriter;
+ friend class ASTStmtReader;
+ friend class ASTContext;
+ OpenACCComputeConstruct()
+ : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass,
+ OpenACCDirectiveKind::Invalid,
+ SourceLocation{}, SourceLocation{}) {}
+
+ OpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation Start,
+ SourceLocation End)
+ : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start,
+ End) {
+ assert((K == OpenACCDirectiveKind::Parallel ||
+ K == OpenACCDirectiveKind::Serial ||
+ K == OpenACCDirectiveKind::Kernels) &&
+ "Only parallel, serial, and kernels constructs should be "
+ "represented by this type");
+ }
+
+ void setStructuredBlock(Stmt *S) { setAssociatedStmt(S); }
+
+public:
+ static bool classof(const Stmt *T) {
+ return T->getStmtClass() == OpenACCComputeConstructClass;
+ }
+
+ static OpenACCComputeConstruct *CreateEmpty(const ASTContext &C, EmptyShell);
+ static OpenACCComputeConstruct *Create(const ASTContext &C,
+ OpenACCDirectiveKind K,
+ SourceLocation BeginLoc,
+ SourceLocation EndLoc);
+
+ Stmt *getStructuredBlock() { return getAssociatedStmt(); }
+ const Stmt *getStructuredBlock() const {
+ return const_cast<OpenACCComputeConstruct *>(this)->getStructuredBlock();
+ }
+};
+} // namespace clang
+#endif // LLVM_CLANG_AST_STMTOPENACC_H
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index 621643391535..3cb3c1014d73 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -2974,6 +2974,7 @@ class OMPAtomicDirective : public OMPExecutableDirective {
/// This field is 1 for the first form of the expression and 0 for the
/// second. Required for correct codegen of non-associative operations (like
/// << or >>).
+ LLVM_PREFERRED_TYPE(bool)
uint8_t IsXLHSInRHSPart : 1;
/// Used for 'atomic update' or 'atomic capture' constructs. They may
/// have atomic expressions of forms:
@@ -2983,9 +2984,11 @@ class OMPAtomicDirective : public OMPExecutableDirective {
/// \endcode
/// This field is 1 for the first(postfix) form of the expression and 0
/// otherwise.
+ LLVM_PREFERRED_TYPE(bool)
uint8_t IsPostfixUpdate : 1;
/// 1 if 'v' is updated only when the condition is false (compare capture
/// only).
+ LLVM_PREFERRED_TYPE(bool)
uint8_t IsFailOnly : 1;
} Flags;
diff --git a/clang/include/clang/AST/StmtVisitor.h b/clang/include/clang/AST/StmtVisitor.h
index 3e5155199eac..990aa2df180d 100644
--- a/clang/include/clang/AST/StmtVisitor.h
+++ b/clang/include/clang/AST/StmtVisitor.h
@@ -13,13 +13,14 @@
#ifndef LLVM_CLANG_AST_STMTVISITOR_H
#define LLVM_CLANG_AST_STMTVISITOR_H
-#include "clang/AST/ExprConcepts.h"
#include "clang/AST/ExprCXX.h"
+#include "clang/AST/ExprConcepts.h"
#include "clang/AST/ExprObjC.h"
#include "clang/AST/ExprOpenMP.h"
#include "clang/AST/Stmt.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtObjC.h"
+#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/Basic/LLVM.h"
#include "llvm/ADT/STLExtras.h"
diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h
index 3c4283f657ef..de67f0b57148 100644
--- a/clang/include/clang/AST/TextNodeDumper.h
+++ b/clang/include/clang/AST/TextNodeDumper.h
@@ -401,6 +401,7 @@ public:
void
VisitLifetimeExtendedTemporaryDecl(const LifetimeExtendedTemporaryDecl *D);
void VisitHLSLBufferDecl(const HLSLBufferDecl *D);
+ void VisitOpenACCConstructStmt(const OpenACCConstructStmt *S);
};
} // namespace clang
diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h
index aca1ad998822..5d16dcc824c5 100644
--- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h
+++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h
@@ -42,6 +42,43 @@ public:
virtual VarGrpRef getGroupOfParms() const =0;
};
+// FixitStrategy is a map from variables to the way we plan to emit fixes for
+// these variables. It is figured out gradually by trying different fixes
+// for different variables depending on gadgets in which these variables
+// participate.
+class FixitStrategy {
+public:
+ enum class Kind {
+ Wontfix, // We don't plan to emit a fixit for this variable.
+ Span, // We recommend replacing the variable with std::span.
+ Iterator, // We recommend replacing the variable with std::span::iterator.
+ Array, // We recommend replacing the variable with std::array.
+ Vector // We recommend replacing the variable with std::vector.
+ };
+
+private:
+ using MapTy = llvm::DenseMap<const VarDecl *, Kind>;
+
+ MapTy Map;
+
+public:
+ FixitStrategy() = default;
+ FixitStrategy(const FixitStrategy &) = delete; // Let's avoid copies.
+ FixitStrategy &operator=(const FixitStrategy &) = delete;
+ FixitStrategy(FixitStrategy &&) = default;
+ FixitStrategy &operator=(FixitStrategy &&) = default;
+
+ void set(const VarDecl *VD, Kind K) { Map[VD] = K; }
+
+ Kind lookup(const VarDecl *VD) const {
+ auto I = Map.find(VD);
+ if (I == Map.end())
+ return Kind::Wontfix;
+
+ return I->second;
+ }
+};
+
/// The interface that lets the caller handle unsafe buffer usage analysis
/// results by overriding this class's handle... methods.
class UnsafeBufferUsageHandler {
@@ -75,9 +112,11 @@ public:
///
/// `D` is the declaration of the callable under analysis that owns `Variable`
/// and all of its group mates.
- virtual void handleUnsafeVariableGroup(const VarDecl *Variable,
- const VariableGroupsManager &VarGrpMgr,
- FixItList &&Fixes, const Decl *D) = 0;
+ virtual void
+ handleUnsafeVariableGroup(const VarDecl *Variable,
+ const VariableGroupsManager &VarGrpMgr,
+ FixItList &&Fixes, const Decl *D,
+ const FixitStrategy &VarTargetTypes) = 0;
#ifndef NDEBUG
public:
diff --git a/clang/include/clang/Analysis/CFG.h b/clang/include/clang/Analysis/CFG.h
index 9f776ca6cc26..a7ff38c786a8 100644
--- a/clang/include/clang/Analysis/CFG.h
+++ b/clang/include/clang/Analysis/CFG.h
@@ -879,6 +879,7 @@ private:
///
/// Optimization Note: This bit could be profitably folded with Terminator's
/// storage if the memory usage of CFGBlock becomes an issue.
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasNoReturnElement : 1;
/// The parent CFG that owns this CFGBlock.
@@ -1007,7 +1008,9 @@ public:
class FilterOptions {
public:
+ LLVM_PREFERRED_TYPE(bool)
unsigned IgnoreNullPredecessors : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IgnoreDefaultsWithCoveredEnums : 1;
FilterOptions()
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index 20e45cc27b01..98bdf037880a 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -100,6 +100,8 @@ public:
/// to add to a `RecordStorageLocation` of a given type.
/// Typically, this is called from the constructor of a `DataflowAnalysis`
///
+ /// The field types returned by the callback may not have reference type.
+ ///
/// To maintain the invariant that all `RecordStorageLocation`s of a given
/// type have the same fields:
/// * The callback must always return the same result for a given type
@@ -205,8 +207,17 @@ public:
/// type.
llvm::StringMap<QualType> getSyntheticFields(QualType Type) {
assert(Type->isRecordType());
- if (SyntheticFieldCallback)
- return SyntheticFieldCallback(Type);
+ if (SyntheticFieldCallback) {
+ llvm::StringMap<QualType> Result = SyntheticFieldCallback(Type);
+ // Synthetic fields are not allowed to have reference type.
+ assert([&Result] {
+ for (const auto &Entry : Result)
+ if (Entry.getValue()->isReferenceType())
+ return false;
+ return true;
+ }());
+ return Result;
+ }
return {};
}
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index 5c737a561a7c..0aecc749bf41 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -681,6 +681,14 @@ private:
llvm::DenseSet<QualType> &Visited,
int Depth, int &CreatedValuesCount);
+ /// Initializes the fields (including synthetic fields) of `Loc` with values,
+ /// unless values of the field type are not supported or we hit one of the
+ /// limits at which we stop producing values (controlled by `Visited`,
+ /// `Depth`, and `CreatedValuesCount`).
+ void initializeFieldsWithValues(RecordStorageLocation &Loc,
+ llvm::DenseSet<QualType> &Visited, int Depth,
+ int &CreatedValuesCount);
+
/// Shared implementation of `createObject()` overloads.
/// `D` and `InitExpr` may be null.
StorageLocation &createObjectInternal(const ValueDecl *D, QualType Ty,
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index b2d5309e142c..45a29e771f2a 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3226,8 +3226,8 @@ def TypeVisibility : InheritableAttr {
let Args = [EnumArgument<"Visibility", "VisibilityType",
["default", "hidden", "internal", "protected"],
["Default", "Hidden", "Hidden", "Protected"]>];
-// let Subjects = [Tag, ObjCInterface, Namespace];
- let Documentation = [Undocumented];
+ // let Subjects = SubjectList<[Tag, ObjCInterface, Namespace], ErrorDiag>;
+ let Documentation = [TypeVisibilityDocs];
}
def VecReturn : InheritableAttr {
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 041786f37fb8..8d369091d215 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2517,6 +2517,14 @@ function it instructs compiler to emit multiple function versions based on
priority and target features availability. One of the versions is always
( implicitly or explicitly ) the ``default`` (fallback). Attribute strings can
contain dependent features names joined by the "+" sign.
+
+For targets that support the GNU indirect function (IFUNC) feature, dispatch
+is performed by emitting an indirect function that is resolved to the appropriate
+target clone at load time. The indirect function is given the name the
+multiversioned function would have if it had been declared without the attribute.
+For backward compatibility with earlier Clang releases, a function alias with an
+``.ifunc`` suffix is also emitted. The ``.ifunc`` suffixed symbol is a deprecated
+feature and support for it may be removed in the future.
}];
}
@@ -5577,6 +5585,25 @@ See :doc:`LTOVisibility`.
}];
}
+def TypeVisibilityDocs : Documentation {
+ let Category = DocCatType;
+ let Content = [{
+The ``type_visibility`` attribute allows the visibility of a type and its vague
+linkage objects (vtable, typeinfo, typeinfo name) to be controlled separately from
+the visibility of functions and data members of the type.
+
+For example, this can be used to give default visibility to the typeinfo and the vtable
+of a type while still keeping hidden visibility on its member functions and static data
+members.
+
+This attribute can only be applied to types and namespaces.
+
+If both ``visibility`` and ``type_visibility`` are applied to a type or a namespace, the
+visibility specified with the ``type_visibility`` attribute overrides the visibility
+provided with the regular ``visibility`` attribute.
+ }];
+}
+
def RenderScriptKernelAttributeDocs : Documentation {
let Category = DocCatFunction;
let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 31a2bdeb2d3e..193d5851f9f2 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1110,6 +1110,12 @@ def ReadCycleCounter : Builtin {
let Prototype = "unsigned long long int()";
}
+def ReadSteadyCounter : Builtin {
+ let Spellings = ["__builtin_readsteadycounter"];
+ let Attributes = [NoThrow];
+ let Prototype = "unsigned long long int()";
+}
+
def Trap : Builtin {
let Spellings = ["__builtin_trap"];
let Attributes = [NoThrow, NoReturn];
diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td
index b65b41be0326..bfccff5600dd 100644
--- a/clang/include/clang/Basic/BuiltinsBase.td
+++ b/clang/include/clang/Basic/BuiltinsBase.td
@@ -87,7 +87,9 @@ class CustomEntry {
}
class AtomicBuiltin : Builtin;
-class TargetBuiltin : Builtin;
+class TargetBuiltin : Builtin {
+ string Features = "";
+}
class LibBuiltin<string header, string languages = "ALL_LANGUAGES"> : Builtin {
string Header = header;
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 7819e71d7fe2..8d3c5e69d55c 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -159,6 +159,7 @@ BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n")
BUILTIN(__nvvm_prmt, "UiUiUiUi", "")
BUILTIN(__nvvm_exit, "v", "r")
+BUILTIN(__nvvm_reflect, "UicC*", "r")
TARGET_BUILTIN(__nvvm_nanosleep, "vUi", "n", AND(SM_70, PTX63))
// Min Max
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def
deleted file mode 100644
index 1528b18c82ea..000000000000
--- a/clang/include/clang/Basic/BuiltinsRISCV.def
+++ /dev/null
@@ -1,93 +0,0 @@
-//==- BuiltinsRISCV.def - RISC-V Builtin function database -------*- C++ -*-==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the RISC-V-specific builtin function database. Users of
-// this file must define the BUILTIN macro to make use of this information.
-//
-//===----------------------------------------------------------------------===//
-
-#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
-# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-// Zbb extension
-TARGET_BUILTIN(__builtin_riscv_orc_b_32, "UiUi", "nc", "zbb")
-TARGET_BUILTIN(__builtin_riscv_orc_b_64, "UWiUWi", "nc", "zbb,64bit")
-TARGET_BUILTIN(__builtin_riscv_clz_32, "UiUi", "nc", "zbb|xtheadbb")
-TARGET_BUILTIN(__builtin_riscv_clz_64, "UiUWi", "nc", "zbb|xtheadbb,64bit")
-TARGET_BUILTIN(__builtin_riscv_ctz_32, "UiUi", "nc", "zbb")
-TARGET_BUILTIN(__builtin_riscv_ctz_64, "UiUWi", "nc", "zbb,64bit")
-
-// Zbc or Zbkc extension
-TARGET_BUILTIN(__builtin_riscv_clmul_32, "UiUiUi", "nc", "zbc|zbkc")
-TARGET_BUILTIN(__builtin_riscv_clmul_64, "UWiUWiUWi", "nc", "zbc|zbkc,64bit")
-TARGET_BUILTIN(__builtin_riscv_clmulh_32, "UiUiUi", "nc", "zbc|zbkc,32bit")
-TARGET_BUILTIN(__builtin_riscv_clmulh_64, "UWiUWiUWi", "nc", "zbc|zbkc,64bit")
-TARGET_BUILTIN(__builtin_riscv_clmulr_32, "UiUiUi", "nc", "zbc,32bit")
-TARGET_BUILTIN(__builtin_riscv_clmulr_64, "UWiUWiUWi", "nc", "zbc,64bit")
-
-// Zbkx
-TARGET_BUILTIN(__builtin_riscv_xperm4_32, "UiUiUi", "nc", "zbkx,32bit")
-TARGET_BUILTIN(__builtin_riscv_xperm4_64, "UWiUWiUWi", "nc", "zbkx,64bit")
-TARGET_BUILTIN(__builtin_riscv_xperm8_32, "UiUiUi", "nc", "zbkx,32bit")
-TARGET_BUILTIN(__builtin_riscv_xperm8_64, "UWiUWiUWi", "nc", "zbkx,64bit")
-
-// Zbkb extension
-TARGET_BUILTIN(__builtin_riscv_brev8_32, "UiUi", "nc", "zbkb")
-TARGET_BUILTIN(__builtin_riscv_brev8_64, "UWiUWi", "nc", "zbkb,64bit")
-TARGET_BUILTIN(__builtin_riscv_zip_32, "UiUi", "nc", "zbkb,32bit")
-TARGET_BUILTIN(__builtin_riscv_unzip_32, "UiUi", "nc", "zbkb,32bit")
-
-// Zknd extension
-TARGET_BUILTIN(__builtin_riscv_aes32dsi, "UiUiUiIUi", "nc", "zknd,32bit")
-TARGET_BUILTIN(__builtin_riscv_aes32dsmi, "UiUiUiIUi", "nc", "zknd,32bit")
-TARGET_BUILTIN(__builtin_riscv_aes64ds, "UWiUWiUWi", "nc", "zknd,64bit")
-TARGET_BUILTIN(__builtin_riscv_aes64dsm, "UWiUWiUWi", "nc", "zknd,64bit")
-TARGET_BUILTIN(__builtin_riscv_aes64im, "UWiUWi", "nc", "zknd,64bit")
-
-// Zknd & Zkne
-TARGET_BUILTIN(__builtin_riscv_aes64ks1i, "UWiUWiIUi", "nc", "zknd|zkne,64bit")
-TARGET_BUILTIN(__builtin_riscv_aes64ks2, "UWiUWiUWi", "nc", "zknd|zkne,64bit")
-
-// Zkne extension
-TARGET_BUILTIN(__builtin_riscv_aes32esi, "UiUiUiIUi", "nc", "zkne,32bit")
-TARGET_BUILTIN(__builtin_riscv_aes32esmi, "UiUiUiIUi", "nc", "zkne,32bit")
-TARGET_BUILTIN(__builtin_riscv_aes64es, "UWiUWiUWi", "nc", "zkne,64bit")
-TARGET_BUILTIN(__builtin_riscv_aes64esm, "UWiUWiUWi", "nc", "zkne,64bit")
-
-// Zknh extension
-TARGET_BUILTIN(__builtin_riscv_sha256sig0, "UiUi", "nc", "zknh")
-TARGET_BUILTIN(__builtin_riscv_sha256sig1, "UiUi", "nc", "zknh")
-TARGET_BUILTIN(__builtin_riscv_sha256sum0, "UiUi", "nc", "zknh")
-TARGET_BUILTIN(__builtin_riscv_sha256sum1, "UiUi", "nc", "zknh")
-
-TARGET_BUILTIN(__builtin_riscv_sha512sig0h, "UiUiUi", "nc", "zknh,32bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sig0l, "UiUiUi", "nc", "zknh,32bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sig1h, "UiUiUi", "nc", "zknh,32bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sig1l, "UiUiUi", "nc", "zknh,32bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sum0r, "UiUiUi", "nc", "zknh,32bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sum1r, "UiUiUi", "nc", "zknh,32bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sig0, "UWiUWi", "nc", "zknh,64bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sig1, "UWiUWi", "nc", "zknh,64bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sum0, "UWiUWi", "nc", "zknh,64bit")
-TARGET_BUILTIN(__builtin_riscv_sha512sum1, "UWiUWi", "nc", "zknh,64bit")
-
-// Zksed extension
-TARGET_BUILTIN(__builtin_riscv_sm4ed, "UiUiUiIUi", "nc", "zksed")
-TARGET_BUILTIN(__builtin_riscv_sm4ks, "UiUiUiIUi", "nc", "zksed")
-
-// Zksh extension
-TARGET_BUILTIN(__builtin_riscv_sm3p0, "UiUi", "nc", "zksh")
-TARGET_BUILTIN(__builtin_riscv_sm3p1, "UiUi", "nc", "zksh")
-
-// Zihintntl extension
-TARGET_BUILTIN(__builtin_riscv_ntl_load, "v.", "t", "zihintntl")
-TARGET_BUILTIN(__builtin_riscv_ntl_store, "v.", "t", "zihintntl")
-
-#undef BUILTIN
-#undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td
new file mode 100644
index 000000000000..4cc89a8a9d8a
--- /dev/null
+++ b/clang/include/clang/Basic/BuiltinsRISCV.td
@@ -0,0 +1,148 @@
+//==- BuiltinsRISCV.td - RISC-V Builtin function database ---*- tablegen -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RISC-V-specific builtin function database.
+//
+//===----------------------------------------------------------------------===//
+
+include "clang/Basic/BuiltinsBase.td"
+
+class RISCVBuiltin<string prototype, string features = ""> : TargetBuiltin {
+ let Spellings = ["__builtin_riscv_" # NAME];
+ let Prototype = prototype;
+ let Features = features;
+}
+
+let Attributes = [NoThrow, Const] in {
+//===----------------------------------------------------------------------===//
+// Zbb extension.
+//===----------------------------------------------------------------------===//
+def orc_b_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb">;
+def orc_b_64 : RISCVBuiltin<"uint64_t(uint64_t)", "zbb,64bit">;
+def clz_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb|xtheadbb">;
+def clz_64 : RISCVBuiltin<"unsigned int(uint64_t)", "zbb|xtheadbb,64bit">;
+def ctz_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbb">;
+def ctz_64 : RISCVBuiltin<"unsigned int(uint64_t)", "zbb,64bit">;
+
+//===----------------------------------------------------------------------===//
+// Zbc or Zbkc extension.
+//===----------------------------------------------------------------------===//
+def clmul_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc|zbkc">;
+def clmul_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc|zbkc,64bit">;
+def clmulh_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc|zbkc,32bit">;
+def clmulh_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc|zbkc,64bit">;
+def clmulr_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)", "zbc,32bit">;
+def clmulr_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)", "zbc,64bit">;
+
+//===----------------------------------------------------------------------===//
+// Zbkx extension.
+//===----------------------------------------------------------------------===//
+let Features = "zbkx,32bit" in {
+def xperm4_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+def xperm8_32 : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+} // Features = "zbkx,32bit"
+
+let Features = "zbkx,64bit" in {
+def xperm4_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+def xperm8_64 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+} // Features = "zbkx,64bit"
+
+//===----------------------------------------------------------------------===//
+// Zbkb extension.
+//===----------------------------------------------------------------------===//
+def brev8_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb">;
+def brev8_64 : RISCVBuiltin<"uint64_t(uint64_t)", "zbkb,64bit">;
+def zip_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb,32bit">;
+def unzip_32 : RISCVBuiltin<"unsigned int(unsigned int)", "zbkb,32bit">;
+
+//===----------------------------------------------------------------------===//
+// Zknd extension.
+//===----------------------------------------------------------------------===//
+let Features = "zknd,32bit" in {
+def aes32dsi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">;
+def aes32dsmi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">;
+} // Features = "zknd,32bit"
+
+let Features = "zknd,64bit" in {
+def aes64ds : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+def aes64dsm : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+def aes64im : RISCVBuiltin<"uint64_t(uint64_t)">;
+} // Features = "zknd,64bit"
+
+//===----------------------------------------------------------------------===//
+// Zknd & Zkne extension.
+//===----------------------------------------------------------------------===//
+let Features = "zknd|zkne,64bit" in {
+def aes64ks1i : RISCVBuiltin<"uint64_t(uint64_t, _Constant unsigned int)">;
+def aes64ks2 : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+} // Features = "zknd|zkne,64bit"
+
+//===----------------------------------------------------------------------===//
+// Zkne extension.
+//===----------------------------------------------------------------------===//
+let Features = "zkne,32bit" in {
+def aes32esi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">;
+def aes32esmi : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">;
+} // Features = "zkne,32bit"
+
+let Features = "zkne,64bit" in {
+def aes64es : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+def aes64esm : RISCVBuiltin<"uint64_t(uint64_t, uint64_t)">;
+} // Features = "zkne,64bit"
+
+//===----------------------------------------------------------------------===//
+// Zknh extension.
+//===----------------------------------------------------------------------===//
+let Features = "zknh" in {
+def sha256sig0 : RISCVBuiltin<"unsigned int(unsigned int)">;
+def sha256sig1 : RISCVBuiltin<"unsigned int(unsigned int)">;
+def sha256sum0 : RISCVBuiltin<"unsigned int(unsigned int)">;
+def sha256sum1 : RISCVBuiltin<"unsigned int(unsigned int)">;
+} // Features = "zknh"
+
+let Features = "zknh,32bit" in {
+def sha512sig0h : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+def sha512sig0l : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+def sha512sig1h : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+def sha512sig1l : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+def sha512sum0r : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+def sha512sum1r : RISCVBuiltin<"unsigned int(unsigned int, unsigned int)">;
+} // Features = "zknh,32bit"
+
+let Features = "zknh,64bit" in {
+def sha512sig0 : RISCVBuiltin<"uint64_t(uint64_t)">;
+def sha512sig1 : RISCVBuiltin<"uint64_t(uint64_t)">;
+def sha512sum0 : RISCVBuiltin<"uint64_t(uint64_t)">;
+def sha512sum1 : RISCVBuiltin<"uint64_t(uint64_t)">;
+} // Features = "zknh,64bit"
+
+//===----------------------------------------------------------------------===//
+// Zksed extension.
+//===----------------------------------------------------------------------===//
+let Features = "zksed" in {
+def sm4ed : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">;
+def sm4ks : RISCVBuiltin<"unsigned int(unsigned int, unsigned int, _Constant unsigned int)">;
+} // Features = "zksed"
+
+//===----------------------------------------------------------------------===//
+// Zksh extension.
+//===----------------------------------------------------------------------===//
+let Features = "zksh" in {
+def sm3p0 : RISCVBuiltin<"unsigned int(unsigned int)">;
+def sm3p1 : RISCVBuiltin<"unsigned int(unsigned int)">;
+} // Features = "zksh"
+
+} // Attributes = [Const, NoThrow]
+
+//===----------------------------------------------------------------------===//
+// Zihintntl extension.
+//===----------------------------------------------------------------------===//
+let Features = "zihintntl", Attributes = [CustomTypeChecking] in {
+def ntl_load : RISCVBuiltin<"void(...)">;
+def ntl_store : RISCVBuiltin<"void(...)">;
+} // Features = "zihintntl", Attributes = [CustomTypeChecking]
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 9689a0f48c3c..7785fb430c06 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -65,6 +65,10 @@ clang_tablegen(BuiltinsBPF.inc -gen-clang-builtins
SOURCE BuiltinsBPF.td
TARGET ClangBuiltinsBPF)
+clang_tablegen(BuiltinsRISCV.inc -gen-clang-builtins
+ SOURCE BuiltinsRISCV.td
+ TARGET ClangBuiltinsRISCV)
+
# ARM NEON and MVE
clang_tablegen(arm_neon.inc -gen-arm-neon-sema
SOURCE arm_neon.td
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 6765721ae700..975eca0ad9b6 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -108,8 +108,10 @@ def EnumConversion : DiagGroup<"enum-conversion",
EnumCompareConditional]>;
def ObjCSignedCharBoolImplicitIntConversion :
DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
+def Shorten64To32 : DiagGroup<"shorten-64-to-32">;
def ImplicitIntConversion : DiagGroup<"implicit-int-conversion",
- [ObjCSignedCharBoolImplicitIntConversion]>;
+ [Shorten64To32,
+ ObjCSignedCharBoolImplicitIntConversion]>;
def ImplicitConstIntFloatConversion : DiagGroup<"implicit-const-int-float-conversion">;
def ImplicitIntFloatConversion : DiagGroup<"implicit-int-float-conversion",
[ImplicitConstIntFloatConversion]>;
@@ -631,7 +633,6 @@ def Shadow : DiagGroup<"shadow", [ShadowFieldInConstructorModified,
def ShadowAll : DiagGroup<"shadow-all", [Shadow, ShadowFieldInConstructor,
ShadowUncapturedLocal, ShadowField]>;
-def Shorten64To32 : DiagGroup<"shorten-64-to-32">;
def : DiagGroup<"sign-promo">;
def SignCompare : DiagGroup<"sign-compare">;
def SwitchDefault : DiagGroup<"switch-default">;
@@ -942,7 +943,6 @@ def Conversion : DiagGroup<"conversion",
EnumConversion,
BitFieldEnumConversion,
FloatConversion,
- Shorten64To32,
IntConversion,
ImplicitIntConversion,
ImplicitFloatConversion,
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b4dc4feee8e6..754733a6c5ff 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3728,6 +3728,8 @@ def err_sme_definition_using_zt0_in_non_sme2_target : Error<
"function using ZT0 state requires 'sme2'">;
def err_conflicting_attributes_arm_state : Error<
"conflicting attributes for state '%0'">;
+def err_sme_streaming_cannot_be_multiversioned : Error<
+ "streaming function cannot be multi-versioned">;
def err_unknown_arm_state : Error<
"unknown state '%0'">;
def err_missing_arm_state : Error<
@@ -12126,9 +12128,9 @@ def warn_unsafe_buffer_operation : Warning<
def note_unsafe_buffer_operation : Note<
"used%select{| in pointer arithmetic| in buffer access}0 here">;
def note_unsafe_buffer_variable_fixit_group : Note<
- "change type of %0 to '%select{std::span|std::array|std::span::iterator}1' to preserve bounds information%select{|, and change %2 to '%select{std::span|std::array|std::span::iterator}1' to propagate bounds information between them}3">;
+ "change type of %0 to '%select{std::span' to preserve bounds information|std::array' to label it for hardening|std::span::iterator' to preserve bounds information}1%select{|, and change %2 to '%select{std::span|std::array|std::span::iterator}1' to propagate bounds information between them}3">;
def note_unsafe_buffer_variable_fixit_together : Note<
- "change type of %0 to '%select{std::span|std::array|std::span::iterator}1' to preserve bounds information"
+ "change type of %0 to '%select{std::span' to preserve bounds information|std::array' to label it for hardening|std::span::iterator' to preserve bounds information}1"
"%select{|, and change %2 to safe types to make function %4 bounds-safe}3">;
def note_safe_buffer_usage_suggestions_disabled : Note<
"pass -fsafe-buffer-usage-suggestions to receive code hardening suggestions">;
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index 1ac182d4fce2..fa8969eb73dd 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -15,6 +15,7 @@
#ifndef LLVM_CLANG_BASIC_IDENTIFIERTABLE_H
#define LLVM_CLANG_BASIC_IDENTIFIERTABLE_H
+#include "clang/Basic/Builtins.h"
#include "clang/Basic/DiagnosticIDs.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
@@ -86,19 +87,26 @@ enum { IdentifierInfoAlignment = 8 };
static constexpr int ObjCOrBuiltinIDBits = 16;
/// The "layout" of ObjCOrBuiltinID is:
-/// - The first value (0) represents "not a special identifier".
-/// - The next (NUM_OBJC_KEYWORDS - 1) values represent ObjCKeywordKinds (not
-/// including objc_not_keyword).
-/// - The next (NUM_INTERESTING_IDENTIFIERS - 1) values represent
-/// InterestingIdentifierKinds (not including not_interesting).
-/// - The rest of the values represent builtin IDs (not including NotBuiltin).
-static constexpr int FirstObjCKeywordID = 1;
-static constexpr int LastObjCKeywordID =
- FirstObjCKeywordID + tok::NUM_OBJC_KEYWORDS - 2;
-static constexpr int FirstInterestingIdentifierID = LastObjCKeywordID + 1;
-static constexpr int LastInterestingIdentifierID =
- FirstInterestingIdentifierID + tok::NUM_INTERESTING_IDENTIFIERS - 2;
-static constexpr int FirstBuiltinID = LastInterestingIdentifierID + 1;
+/// - ObjCKeywordKind enumerators
+/// - InterestingIdentifierKind enumerators
+/// - Builtin::ID enumerators
+/// - NonSpecialIdentifier
+enum class ObjCKeywordOrInterestingOrBuiltin {
+#define OBJC_AT_KEYWORD(X) objc_##X,
+#include "clang/Basic/TokenKinds.def"
+ NUM_OBJC_KEYWORDS,
+
+#define INTERESTING_IDENTIFIER(X) X,
+#include "clang/Basic/TokenKinds.def"
+ NUM_OBJC_KEYWORDS_AND_INTERESTING_IDENTIFIERS,
+
+ NotBuiltin,
+#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
+#include "clang/Basic/Builtins.inc"
+ FirstTSBuiltin,
+
+ NonSpecialIdentifier = 65534
+};
/// One of these records is kept for each identifier that
/// is lexed. This contains information about whether the token was \#define'd,
@@ -113,9 +121,7 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
LLVM_PREFERRED_TYPE(tok::TokenKind)
unsigned TokenID : 9;
- // ObjC keyword ('protocol' in '@protocol') or builtin (__builtin_inf).
- // First NUM_OBJC_KEYWORDS values are for Objective-C,
- // the remaining values are for builtins.
+ LLVM_PREFERRED_TYPE(ObjCKeywordOrInterestingOrBuiltin)
unsigned ObjCOrBuiltinID : ObjCOrBuiltinIDBits;
// True if there is a #define for this.
@@ -198,13 +204,16 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
llvm::StringMapEntry<IdentifierInfo *> *Entry = nullptr;
IdentifierInfo()
- : TokenID(tok::identifier), ObjCOrBuiltinID(0), HasMacro(false),
- HadMacro(false), IsExtension(false), IsFutureCompatKeyword(false),
- IsPoisoned(false), IsCPPOperatorKeyword(false),
- NeedsHandleIdentifier(false), IsFromAST(false), ChangedAfterLoad(false),
- FEChangedAfterLoad(false), RevertedTokenID(false), OutOfDate(false),
- IsModulesImport(false), IsMangledOpenMPVariantName(false),
- IsDeprecatedMacro(false), IsRestrictExpansion(false), IsFinal(false) {}
+ : TokenID(tok::identifier),
+ ObjCOrBuiltinID(llvm::to_underlying(
+ ObjCKeywordOrInterestingOrBuiltin::NonSpecialIdentifier)),
+ HasMacro(false), HadMacro(false), IsExtension(false),
+ IsFutureCompatKeyword(false), IsPoisoned(false),
+ IsCPPOperatorKeyword(false), NeedsHandleIdentifier(false),
+ IsFromAST(false), ChangedAfterLoad(false), FEChangedAfterLoad(false),
+ RevertedTokenID(false), OutOfDate(false), IsModulesImport(false),
+ IsMangledOpenMPVariantName(false), IsDeprecatedMacro(false),
+ IsRestrictExpansion(false), IsFinal(false) {}
public:
IdentifierInfo(const IdentifierInfo &) = delete;
@@ -332,42 +341,66 @@ public:
///
/// For example, 'class' will return tok::objc_class if ObjC is enabled.
tok::ObjCKeywordKind getObjCKeywordID() const {
- static_assert(FirstObjCKeywordID == 1,
- "hard-coding this assumption to simplify code");
- if (ObjCOrBuiltinID <= LastObjCKeywordID)
- return tok::ObjCKeywordKind(ObjCOrBuiltinID);
- else
- return tok::objc_not_keyword;
+ assert(0 == llvm::to_underlying(
+ ObjCKeywordOrInterestingOrBuiltin::objc_not_keyword));
+ auto Value =
+ static_cast<ObjCKeywordOrInterestingOrBuiltin>(ObjCOrBuiltinID);
+ if (Value < ObjCKeywordOrInterestingOrBuiltin::NUM_OBJC_KEYWORDS)
+ return static_cast<tok::ObjCKeywordKind>(ObjCOrBuiltinID);
+ return tok::objc_not_keyword;
+ }
+ void setObjCKeywordID(tok::ObjCKeywordKind ID) {
+ assert(0 == llvm::to_underlying(
+ ObjCKeywordOrInterestingOrBuiltin::objc_not_keyword));
+ ObjCOrBuiltinID = ID;
+ assert(getObjCKeywordID() == ID && "ID too large for field!");
}
- void setObjCKeywordID(tok::ObjCKeywordKind ID) { ObjCOrBuiltinID = ID; }
/// Return a value indicating whether this is a builtin function.
- ///
- /// 0 is not-built-in. 1+ are specific builtin functions.
unsigned getBuiltinID() const {
- if (ObjCOrBuiltinID >= FirstBuiltinID)
- return 1 + (ObjCOrBuiltinID - FirstBuiltinID);
- else
- return 0;
+ auto Value =
+ static_cast<ObjCKeywordOrInterestingOrBuiltin>(ObjCOrBuiltinID);
+ if (Value > ObjCKeywordOrInterestingOrBuiltin::
+ NUM_OBJC_KEYWORDS_AND_INTERESTING_IDENTIFIERS &&
+ Value != ObjCKeywordOrInterestingOrBuiltin::NonSpecialIdentifier) {
+ auto FirstBuiltin =
+ llvm::to_underlying(ObjCKeywordOrInterestingOrBuiltin::NotBuiltin);
+ return static_cast<Builtin::ID>(ObjCOrBuiltinID - FirstBuiltin);
+ }
+ return Builtin::ID::NotBuiltin;
}
void setBuiltinID(unsigned ID) {
- assert(ID != 0);
- ObjCOrBuiltinID = FirstBuiltinID + (ID - 1);
+ assert(ID != Builtin::ID::NotBuiltin);
+ auto FirstBuiltin =
+ llvm::to_underlying(ObjCKeywordOrInterestingOrBuiltin::NotBuiltin);
+ ObjCOrBuiltinID = ID + FirstBuiltin;
assert(getBuiltinID() == ID && "ID too large for field!");
}
- void clearBuiltinID() { ObjCOrBuiltinID = 0; }
+ void clearBuiltinID() {
+ ObjCOrBuiltinID = llvm::to_underlying(
+ ObjCKeywordOrInterestingOrBuiltin::NonSpecialIdentifier);
+ }
tok::InterestingIdentifierKind getInterestingIdentifierID() const {
- if (ObjCOrBuiltinID >= FirstInterestingIdentifierID &&
- ObjCOrBuiltinID <= LastInterestingIdentifierID)
- return tok::InterestingIdentifierKind(
- 1 + (ObjCOrBuiltinID - FirstInterestingIdentifierID));
- else
- return tok::not_interesting;
+ auto Value =
+ static_cast<ObjCKeywordOrInterestingOrBuiltin>(ObjCOrBuiltinID);
+ if (Value > ObjCKeywordOrInterestingOrBuiltin::NUM_OBJC_KEYWORDS &&
+ Value < ObjCKeywordOrInterestingOrBuiltin::
+ NUM_OBJC_KEYWORDS_AND_INTERESTING_IDENTIFIERS) {
+ auto FirstInterestingIdentifier =
+ 1 + llvm::to_underlying(
+ ObjCKeywordOrInterestingOrBuiltin::NUM_OBJC_KEYWORDS);
+ return static_cast<tok::InterestingIdentifierKind>(
+ ObjCOrBuiltinID - FirstInterestingIdentifier);
+ }
+ return tok::not_interesting;
}
void setInterestingIdentifierID(unsigned ID) {
assert(ID != tok::not_interesting);
- ObjCOrBuiltinID = FirstInterestingIdentifierID + (ID - 1);
+ auto FirstInterestingIdentifier =
+ 1 + llvm::to_underlying(
+ ObjCKeywordOrInterestingOrBuiltin::NUM_OBJC_KEYWORDS);
+ ObjCOrBuiltinID = ID + FirstInterestingIdentifier;
assert(getInterestingIdentifierID() == ID && "ID too large for field!");
}
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index c1cc5548ef10..862952d336ef 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -30,27 +30,6 @@
namespace clang {
-/// Bitfields of LangOptions, split out from LangOptions in order to ensure that
-/// this large collection of bitfields is a trivial class type.
-class LangOptionsBase {
- friend class CompilerInvocation;
- friend class CompilerInvocationBase;
-
-public:
- // Define simple language options (with no accessors).
-#define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits;
-#define ENUM_LANGOPT(Name, Type, Bits, Default, Description)
-#include "clang/Basic/LangOptions.def"
-
-protected:
- // Define language options of enumeration type. These are private, and will
- // have accessors (below).
-#define LANGOPT(Name, Bits, Default, Description)
-#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \
- unsigned Name : Bits;
-#include "clang/Basic/LangOptions.def"
-};
-
/// In the Microsoft ABI, this controls the placement of virtual displacement
/// members used to implement virtual inheritance.
enum class MSVtorDispMode { Never, ForVBaseOverride, ForVFTable };
@@ -78,9 +57,12 @@ enum class ShaderStage {
Invalid,
};
-/// Keeps track of the various options that can be
-/// enabled, which controls the dialect of C or C++ that is accepted.
-class LangOptions : public LangOptionsBase {
+/// Bitfields of LangOptions, split out from LangOptions in order to ensure that
+/// this large collection of bitfields is a trivial class type.
+class LangOptionsBase {
+ friend class CompilerInvocation;
+ friend class CompilerInvocationBase;
+
public:
using Visibility = clang::Visibility;
using RoundingMode = llvm::RoundingMode;
@@ -416,6 +398,24 @@ public:
enum ComplexRangeKind { CX_Full, CX_Limited, CX_Fortran, CX_None };
+ // Define simple language options (with no accessors).
+#define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits;
+#define ENUM_LANGOPT(Name, Type, Bits, Default, Description)
+#include "clang/Basic/LangOptions.def"
+
+protected:
+ // Define language options of enumeration type. These are private, and will
+ // have accessors (below).
+#define LANGOPT(Name, Bits, Default, Description)
+#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \
+ LLVM_PREFERRED_TYPE(Type) \
+ unsigned Name : Bits;
+#include "clang/Basic/LangOptions.def"
+};
+
+/// Keeps track of the various options that can be
+/// enabled, which controls the dialect of C or C++ that is accepted.
+class LangOptions : public LangOptionsBase {
public:
/// The used language standard.
LangStandard::Kind LangStd;
diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
index 62786e3ac865..30ec9c993150 100644
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -118,7 +118,7 @@ public:
/// of header files.
ModuleMapModule,
- /// This is a C++ 20 header unit.
+ /// This is a C++20 header unit.
ModuleHeaderUnit,
/// This is a C++20 module interface unit.
@@ -127,10 +127,10 @@ public:
/// This is a C++20 module implementation unit.
ModuleImplementationUnit,
- /// This is a C++ 20 module partition interface.
+ /// This is a C++20 module partition interface.
ModulePartitionInterface,
- /// This is a C++ 20 module partition implementation.
+ /// This is a C++20 module partition implementation.
ModulePartitionImplementation,
/// This is the explicit Global Module Fragment of a modular TU.
diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h
index afdd0e8983c9..4456f4afd142 100644
--- a/clang/include/clang/Basic/OpenACCKinds.h
+++ b/clang/include/clang/Basic/OpenACCKinds.h
@@ -16,6 +16,7 @@
#include "clang/Basic/Diagnostic.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
namespace clang {
// Represents the Construct/Directive kind of a pragma directive. Note the
@@ -65,8 +66,9 @@ enum class OpenACCDirectiveKind {
Invalid,
};
-inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out,
- OpenACCDirectiveKind K) {
+template <typename StreamTy>
+inline StreamTy &PrintOpenACCDirectiveKind(StreamTy &Out,
+ OpenACCDirectiveKind K) {
switch (K) {
case OpenACCDirectiveKind::Parallel:
return Out << "parallel";
@@ -134,6 +136,16 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out,
llvm_unreachable("Uncovered directive kind");
}
+inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out,
+ OpenACCDirectiveKind K) {
+ return PrintOpenACCDirectiveKind(Out, K);
+}
+
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out,
+ OpenACCDirectiveKind K) {
+ return PrintOpenACCDirectiveKind(Out, K);
+}
+
enum class OpenACCAtomicKind {
Read,
Write,
@@ -253,8 +265,8 @@ enum class OpenACCClauseKind {
Invalid,
};
-inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out,
- OpenACCClauseKind K) {
+template <typename StreamTy>
+inline StreamTy &PrintOpenACCClauseKind(StreamTy &Out, OpenACCClauseKind K) {
switch (K) {
case OpenACCClauseKind::Finalize:
return Out << "finalize";
@@ -387,6 +399,17 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out,
}
llvm_unreachable("Uncovered clause kind");
}
+
+inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out,
+ OpenACCClauseKind K) {
+ return PrintOpenACCClauseKind(Out, K);
+}
+
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out,
+ OpenACCClauseKind K) {
+ return PrintOpenACCClauseKind(Out, K);
+}
+
enum class OpenACCDefaultClauseKind {
/// 'none' option.
None,
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index 9d03800840fc..b4e3ae573b95 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -296,3 +296,9 @@ def OMPTargetTeamsGenericLoopDirective : StmtNode<OMPLoopDirective>;
def OMPParallelGenericLoopDirective : StmtNode<OMPLoopDirective>;
def OMPTargetParallelGenericLoopDirective : StmtNode<OMPLoopDirective>;
def OMPErrorDirective : StmtNode<OMPExecutableDirective>;
+
+// OpenACC Constructs.
+def OpenACCConstructStmt : StmtNode<Stmt, /*abstract=*/1>;
+def OpenACCAssociatedStmtConstruct
+ : StmtNode<OpenACCConstructStmt, /*abstract=*/1>;
+def OpenACCComputeConstruct : StmtNode<OpenACCAssociatedStmtConstruct>;
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index a4abaaef44c0..4333830bf34f 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -159,7 +159,7 @@ namespace clang {
FirstRVVBuiltin = clang::Builtin::FirstTSBuiltin,
LastRVVBuiltin = RISCVVector::FirstTSBuiltin - 1,
#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
-#include "clang/Basic/BuiltinsRISCV.def"
+#include "clang/Basic/BuiltinsRISCV.inc"
LastTSBuiltin
};
} // namespace RISCV
diff --git a/clang/include/clang/Basic/Visibility.h b/clang/include/clang/Basic/Visibility.h
index 1e196300be42..b9693e613224 100644
--- a/clang/include/clang/Basic/Visibility.h
+++ b/clang/include/clang/Basic/Visibility.h
@@ -51,8 +51,11 @@ inline Visibility minVisibility(Visibility L, Visibility R) {
}
class LinkageInfo {
+ LLVM_PREFERRED_TYPE(Linkage)
uint8_t linkage_ : 3;
+ LLVM_PREFERRED_TYPE(Visibility)
uint8_t visibility_ : 2;
+ LLVM_PREFERRED_TYPE(bool)
uint8_t explicit_ : 1;
void setVisibility(Visibility V, bool E) { visibility_ = V; explicit_ = E; }
diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
index e388901b8a50..811f33407368 100644
--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
+++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
@@ -564,35 +564,45 @@ class CGFunctionInfo final
unsigned EffectiveCallingConvention : 8;
/// The clang::CallingConv that this was originally created with.
+ LLVM_PREFERRED_TYPE(CallingConv)
unsigned ASTCallingConvention : 6;
/// Whether this is an instance method.
+ LLVM_PREFERRED_TYPE(bool)
unsigned InstanceMethod : 1;
/// Whether this is a chain call.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ChainCall : 1;
/// Whether this function is called by forwarding arguments.
/// This doesn't support inalloca or varargs.
+ LLVM_PREFERRED_TYPE(bool)
unsigned DelegateCall : 1;
/// Whether this function is a CMSE nonsecure call
+ LLVM_PREFERRED_TYPE(bool)
unsigned CmseNSCall : 1;
/// Whether this function is noreturn.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoReturn : 1;
/// Whether this function is returns-retained.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ReturnsRetained : 1;
/// Whether this function saved caller registers.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoCallerSavedRegs : 1;
/// How many arguments to pass inreg.
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasRegParm : 1;
unsigned RegParm : 3;
/// Whether this function has nocf_check attribute.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoCfCheck : 1;
/// Log 2 of the maximum vector width.
@@ -604,6 +614,7 @@ class CGFunctionInfo final
/// passing non-trivial types with inalloca. Not part of the profile.
llvm::StructType *ArgStruct;
unsigned ArgStructAlign : 31;
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasExtParameterInfos : 1;
unsigned NumArgs;
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 3ee1bcf2a69c..908bc87c14b1 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -232,10 +232,12 @@ public:
bool IsDXCMode() const { return Mode == DXCMode; }
/// Only print tool bindings, don't build any jobs.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CCCPrintBindings : 1;
/// Set CC_PRINT_OPTIONS mode, which is like -v but logs the commands to
/// CCPrintOptionsFilename or to stderr.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CCPrintOptions : 1;
/// The format of the header information that is emitted. If CC_PRINT_HEADERS
@@ -252,17 +254,21 @@ public:
/// Set CC_LOG_DIAGNOSTICS mode, which causes the frontend to log diagnostics
/// to CCLogDiagnosticsFilename or to stderr, in a stable machine readable
/// format.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CCLogDiagnostics : 1;
/// Whether the driver is generating diagnostics for debugging purposes.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CCGenDiagnostics : 1;
/// Set CC_PRINT_PROC_STAT mode, which causes the driver to dump
/// performance report to CC_PRINT_PROC_STAT_FILE or to stdout.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CCPrintProcessStats : 1;
/// Set CC_PRINT_INTERNAL_STAT mode, which causes the driver to dump internal
/// performance report to CC_PRINT_INTERNAL_STAT_FILE or to stdout.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CCPrintInternalStats : 1;
/// Pointer to the ExecuteCC1Tool function, if available.
@@ -303,9 +309,11 @@ private:
/// Whether to check that input files exist when constructing compilation
/// jobs.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CheckInputsExist : 1;
/// Whether to probe for PCH files on disk, in order to upgrade
/// -include foo.h to -include-pch foo.h.pch.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ProbePrecompiled : 1;
public:
@@ -319,6 +327,7 @@ public:
private:
/// Certain options suppress the 'no input files' warning.
+ LLVM_PREFERRED_TYPE(bool)
unsigned SuppressMissingInputWarning : 1;
/// Cache of all the ToolChains in use by the driver.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4b232b8aab72..c625d0dd1c0c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4414,7 +4414,7 @@ def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=
def march_EQ : Joined<["-"], "march=">, Group<m_Group>,
Flags<[TargetSpecific]>, Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
HelpText<"For a list of available architectures for the target use '-mcpu=help'">;
-def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>;
+def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Visibility<[ClangOption, FlangOption]>;
def inline_asm_EQ : Joined<["-"], "inline-asm=">, Group<m_Group>,
Visibility<[ClangOption, CC1Option]>,
Values<"att,intel">,
@@ -4614,6 +4614,10 @@ def msave_restore : Flag<["-"], "msave-restore">, Group<m_riscv_Features_Group>,
HelpText<"Enable using library calls for save and restore">;
def mno_save_restore : Flag<["-"], "mno-save-restore">, Group<m_riscv_Features_Group>,
HelpText<"Disable using library calls for save and restore">;
+def mforced_sw_shadow_stack : Flag<["-"], "mforced-sw-shadow-stack">, Group<m_riscv_Features_Group>,
+ HelpText<"Force using software shadow stack when shadow-stack enabled">;
+def mno_forced_sw_shadow_stack : Flag<["-"], "mno-forced-sw-shadow-stack">, Group<m_riscv_Features_Group>,
+ HelpText<"Not force using software shadow stack when shadow-stack enabled">;
} // let Flags = [TargetSpecific]
let Flags = [TargetSpecific] in {
def menable_experimental_extensions : Flag<["-"], "menable-experimental-extensions">, Group<m_Group>,
@@ -5335,7 +5339,7 @@ def print_rocm_search_dirs : Flag<["-", "--"], "print-rocm-search-dirs">,
HelpText<"Print the paths used for finding ROCm installation">,
Visibility<[ClangOption, CLOption]>;
def print_runtime_dir : Flag<["-", "--"], "print-runtime-dir">,
- HelpText<"Print the directory pathname containing clangs runtime libraries">,
+ HelpText<"Print the directory pathname containing Clang's runtime libraries">,
Visibility<[ClangOption, CLOption]>;
def print_diagnostic_options : Flag<["-", "--"], "print-diagnostic-options">,
HelpText<"Print all of Clang's warning options">,
@@ -5392,11 +5396,13 @@ def regcall4 : Flag<["-"], "regcall4">, Group<m_Group>,
MarshallingInfoFlag<LangOpts<"RegCall4">>;
def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[NoXarchOption]>,
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
- HelpText<"Save intermediate compilation results.">;
+ HelpText<"Save intermediate compilation results. <arg> can be set to 'cwd' for "
+ "current working directory, or 'obj' which will save temporary files in the "
+ "same directory as the final output file">;
def save_temps : Flag<["-", "--"], "save-temps">, Flags<[NoXarchOption]>,
Visibility<[ClangOption, FlangOption, FC1Option]>,
Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
- HelpText<"Save intermediate compilation results">;
+ HelpText<"Alias for --save-temps=cwd">;
def save_stats_EQ : Joined<["-", "--"], "save-stats=">, Flags<[NoXarchOption]>,
HelpText<"Save llvm statistics.">;
def save_stats : Flag<["-", "--"], "save-stats">, Flags<[NoXarchOption]>,
@@ -5823,6 +5829,18 @@ def mvis3 : Flag<["-"], "mvis3">, Group<m_sparc_Features_Group>;
def mno_vis3 : Flag<["-"], "mno-vis3">, Group<m_sparc_Features_Group>;
def mhard_quad_float : Flag<["-"], "mhard-quad-float">, Group<m_sparc_Features_Group>;
def msoft_quad_float : Flag<["-"], "msoft-quad-float">, Group<m_sparc_Features_Group>;
+foreach i = 1 ... 7 in
+ def ffixed_g#i : Flag<["-"], "ffixed-g"#i>, Group<m_sparc_Features_Group>,
+ HelpText<"Reserve the G"#i#" register (SPARC only)">;
+foreach i = 0 ... 5 in
+ def ffixed_o#i : Flag<["-"], "ffixed-o"#i>, Group<m_sparc_Features_Group>,
+ HelpText<"Reserve the O"#i#" register (SPARC only)">;
+foreach i = 0 ... 7 in
+ def ffixed_l#i : Flag<["-"], "ffixed-l"#i>, Group<m_sparc_Features_Group>,
+ HelpText<"Reserve the L"#i#" register (SPARC only)">;
+foreach i = 0 ... 5 in
+ def ffixed_i#i : Flag<["-"], "ffixed-i"#i>, Group<m_sparc_Features_Group>,
+ HelpText<"Reserve the I"#i#" register (SPARC only)">;
} // let Flags = [TargetSpecific]
// M68k features flags
@@ -8448,8 +8466,8 @@ def _SLASH_ZW : CLJoined<"ZW">;
// clang-dxc Options
//===----------------------------------------------------------------------===//
-def dxc_Group : OptionGroup<"<clang-dxc options>">, Visibility<[DXCOption]>,
- HelpText<"dxc compatibility options">;
+def dxc_Group : OptionGroup<"clang-dxc options">, Visibility<[DXCOption]>,
+ HelpText<"dxc compatibility options.">;
class DXCFlag<string name> : Option<["/", "-"], name, KIND_FLAG>,
Group<dxc_Group>, Visibility<[DXCOption]>;
class DXCJoinedOrSeparate<string name> : Option<["/", "-"], name,
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index cb14d9882540..737cbfced9e9 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -1010,8 +1010,9 @@ struct FormatStyle {
/// \version 3.7
DefinitionReturnTypeBreakingStyle AlwaysBreakAfterDefinitionReturnType;
- /// The function declaration return type breaking style to use.
+ /// This option is renamed to ``BreakAfterReturnType``.
/// \version 3.8
+ /// @deprecated
ReturnTypeBreakingStyle AlwaysBreakAfterReturnType;
/// If ``true``, always break before multiline string literals.
@@ -1075,9 +1076,10 @@ struct FormatStyle {
BTDS_Yes
};
- /// The template declaration breaking style to use.
+ /// This option is renamed to ``BreakTemplateDeclarations``.
/// \version 3.4
- BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations;
+ /// @deprecated
+ // BreakTemplateDeclarationsStyle AlwaysBreakTemplateDeclarations;
/// A vector of strings that should be interpreted as attributes/qualifiers
/// instead of identifiers. This can be useful for language extensions or
@@ -1575,6 +1577,10 @@ struct FormatStyle {
/// \version 16
AttributeBreakingStyle BreakAfterAttributes;
+ /// The function declaration return type breaking style to use.
+ /// \version 19
+ // ReturnTypeBreakingStyle BreakAfterReturnType;
+
/// If ``true``, clang-format will always break after a Json array ``[``
/// otherwise it will scan until the closing ``]`` to determine if it should
/// add newlines between elements (prettier compatible).
@@ -2293,6 +2299,10 @@ struct FormatStyle {
/// \version 7
BreakInheritanceListStyle BreakInheritanceList;
+ /// The template declaration breaking style to use.
+ /// \version 19
+ BreakTemplateDeclarationsStyle BreakTemplateDeclarations;
+
/// If ``true``, consecutive namespace declarations will be on the same
/// line. If ``false``, each namespace is declared on a new line.
/// \code
@@ -4817,8 +4827,6 @@ struct FormatStyle {
AlwaysBreakAfterReturnType == R.AlwaysBreakAfterReturnType &&
AlwaysBreakBeforeMultilineStrings ==
R.AlwaysBreakBeforeMultilineStrings &&
- AlwaysBreakTemplateDeclarations ==
- R.AlwaysBreakTemplateDeclarations &&
AttributeMacros == R.AttributeMacros &&
BinPackArguments == R.BinPackArguments &&
BinPackParameters == R.BinPackParameters &&
@@ -4836,6 +4844,7 @@ struct FormatStyle {
BreakConstructorInitializers == R.BreakConstructorInitializers &&
BreakInheritanceList == R.BreakInheritanceList &&
BreakStringLiterals == R.BreakStringLiterals &&
+ BreakTemplateDeclarations == R.BreakTemplateDeclarations &&
ColumnLimit == R.ColumnLimit && CommentPragmas == R.CommentPragmas &&
CompactNamespaces == R.CompactNamespaces &&
ConstructorInitializerIndentWidth ==
diff --git a/clang/include/clang/Lex/PPCallbacks.h b/clang/include/clang/Lex/PPCallbacks.h
index e3942af7be28..dfc74b52686f 100644
--- a/clang/include/clang/Lex/PPCallbacks.h
+++ b/clang/include/clang/Lex/PPCallbacks.h
@@ -127,8 +127,10 @@ public:
/// \param RelativePath The path relative to SearchPath, at which the include
/// file was found. This is equal to FileName except for framework includes.
///
- /// \param Imported The module, whenever an inclusion directive was
- /// automatically turned into a module import or null otherwise.
+ /// \param SuggestedModule The module suggested for this header, if any.
+ ///
+ /// \param ModuleImported Whether this include was translated into import of
+ /// \p SuggestedModule.
///
/// \param FileType The characteristic kind, indicates whether a file or
/// directory holds normal user code, system code, or system code which is
@@ -139,7 +141,8 @@ public:
bool IsAngled, CharSourceRange FilenameRange,
OptionalFileEntryRef File,
StringRef SearchPath, StringRef RelativePath,
- const Module *Imported,
+ const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) {}
/// Callback invoked whenever a submodule was entered.
@@ -473,14 +476,15 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
First->InclusionDirective(HashLoc, IncludeTok, FileName, IsAngled,
FilenameRange, File, SearchPath, RelativePath,
- Imported, FileType);
+ SuggestedModule, ModuleImported, FileType);
Second->InclusionDirective(HashLoc, IncludeTok, FileName, IsAngled,
FilenameRange, File, SearchPath, RelativePath,
- Imported, FileType);
+ SuggestedModule, ModuleImported, FileType);
}
void EnteredSubmodule(Module *M, SourceLocation ImportLoc,
diff --git a/clang/include/clang/Lex/PreprocessingRecord.h b/clang/include/clang/Lex/PreprocessingRecord.h
index 5ddf024186f8..437d8e4cc174 100644
--- a/clang/include/clang/Lex/PreprocessingRecord.h
+++ b/clang/include/clang/Lex/PreprocessingRecord.h
@@ -532,7 +532,8 @@ class Token;
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath,
+ const Module *SuggestedModule, bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void Ifdef(SourceLocation Loc, const Token &MacroNameTok,
const MacroDefinition &MD) override;
diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h
index 020ddd36cf73..aafe227b8408 100644
--- a/clang/include/clang/Sema/AnalysisBasedWarnings.h
+++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h
@@ -34,9 +34,13 @@ public:
class Policy {
friend class AnalysisBasedWarnings;
// The warnings to run.
+ LLVM_PREFERRED_TYPE(bool)
unsigned enableCheckFallThrough : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned enableCheckUnreachable : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned enableThreadSafetyAnalysis : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned enableConsumedAnalysis : 1;
public:
Policy();
diff --git a/clang/include/clang/Sema/CodeCompleteConsumer.h b/clang/include/clang/Sema/CodeCompleteConsumer.h
index 274eaac819af..a2028e40f83d 100644
--- a/clang/include/clang/Sema/CodeCompleteConsumer.h
+++ b/clang/include/clang/Sema/CodeCompleteConsumer.h
@@ -581,6 +581,7 @@ private:
unsigned Priority : 16;
/// The availability of this code-completion result.
+ LLVM_PREFERRED_TYPE(CXAvailabilityKind)
unsigned Availability : 2;
/// The name of the parent context.
diff --git a/clang/include/clang/Sema/CodeCompleteOptions.h b/clang/include/clang/Sema/CodeCompleteOptions.h
index a3403b01dcde..d8dc386c4be4 100644
--- a/clang/include/clang/Sema/CodeCompleteOptions.h
+++ b/clang/include/clang/Sema/CodeCompleteOptions.h
@@ -9,18 +9,23 @@
#ifndef LLVM_CLANG_SEMA_CODECOMPLETEOPTIONS_H
#define LLVM_CLANG_SEMA_CODECOMPLETEOPTIONS_H
+#include "llvm/Support/Compiler.h"
+
namespace clang {
/// Options controlling the behavior of code completion.
class CodeCompleteOptions {
public:
/// Show macros in code completion results.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncludeMacros : 1;
/// Show code patterns in code completion results.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncludeCodePatterns : 1;
/// Show top-level decls in code completion results.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncludeGlobals : 1;
/// Show decls in namespace (including the global namespace) in code
@@ -29,18 +34,22 @@ public:
/// Currently, this only works when completing qualified IDs (i.e.
/// `Sema::CodeCompleteQualifiedId`).
/// FIXME: consider supporting more completion cases with this option.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncludeNamespaceLevelDecls : 1;
/// Show brief documentation comments in code completion results.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncludeBriefComments : 1;
/// Hint whether to load data from the external AST to provide full results.
/// If false, namespace-level declarations and macros from the preamble may be
/// omitted.
+ LLVM_PREFERRED_TYPE(bool)
unsigned LoadExternal : 1;
/// Include results after corrections (small fix-its), e.g. change '.' to '->'
/// on member access, etc.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncludeFixIts : 1;
CodeCompleteOptions()
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index 77638def6006..d161147527dc 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -353,36 +353,57 @@ public:
private:
// storage-class-specifier
- /*SCS*/unsigned StorageClassSpec : 3;
- /*TSCS*/unsigned ThreadStorageClassSpec : 2;
+ LLVM_PREFERRED_TYPE(SCS)
+ unsigned StorageClassSpec : 3;
+ LLVM_PREFERRED_TYPE(TSCS)
+ unsigned ThreadStorageClassSpec : 2;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SCS_extern_in_linkage_spec : 1;
// type-specifier
- /*TypeSpecifierWidth*/ unsigned TypeSpecWidth : 2;
- /*TSC*/unsigned TypeSpecComplex : 2;
- /*TSS*/unsigned TypeSpecSign : 2;
- /*TST*/unsigned TypeSpecType : 7;
+ LLVM_PREFERRED_TYPE(TypeSpecifierWidth)
+ unsigned TypeSpecWidth : 2;
+ LLVM_PREFERRED_TYPE(TSC)
+ unsigned TypeSpecComplex : 2;
+ LLVM_PREFERRED_TYPE(TypeSpecifierSign)
+ unsigned TypeSpecSign : 2;
+ LLVM_PREFERRED_TYPE(TST)
+ unsigned TypeSpecType : 7;
+ LLVM_PREFERRED_TYPE(bool)
unsigned TypeAltiVecVector : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned TypeAltiVecPixel : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned TypeAltiVecBool : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned TypeSpecOwned : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned TypeSpecPipe : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned TypeSpecSat : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned ConstrainedAuto : 1;
// type-qualifiers
+ LLVM_PREFERRED_TYPE(TQ)
unsigned TypeQualifiers : 5; // Bitwise OR of TQ.
// function-specifier
+ LLVM_PREFERRED_TYPE(bool)
unsigned FS_inline_specified : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned FS_forceinline_specified: 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned FS_virtual_specified : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned FS_noreturn_specified : 1;
// friend-specifier
+ LLVM_PREFERRED_TYPE(bool)
unsigned Friend_specified : 1;
// constexpr-specifier
+ LLVM_PREFERRED_TYPE(ConstexprSpecKind)
unsigned ConstexprSpecifier : 2;
union {
@@ -1246,6 +1267,7 @@ struct DeclaratorChunk {
struct PointerTypeInfo {
/// The type qualifiers: const/volatile/restrict/unaligned/atomic.
+ LLVM_PREFERRED_TYPE(DeclSpec::TQ)
unsigned TypeQuals : 5;
/// The location of the const-qualifier, if any.
@@ -1279,12 +1301,15 @@ struct DeclaratorChunk {
struct ArrayTypeInfo {
/// The type qualifiers for the array:
/// const/volatile/restrict/__unaligned/_Atomic.
+ LLVM_PREFERRED_TYPE(DeclSpec::TQ)
unsigned TypeQuals : 5;
/// True if this dimension included the 'static' keyword.
+ LLVM_PREFERRED_TYPE(bool)
unsigned hasStatic : 1;
/// True if this dimension was [*]. In this case, NumElts is null.
+ LLVM_PREFERRED_TYPE(bool)
unsigned isStar : 1;
/// This is the size of the array, or null if [] or [*] was specified.
@@ -1331,28 +1356,35 @@ struct DeclaratorChunk {
/// hasPrototype - This is true if the function had at least one typed
/// parameter. If the function is () or (a,b,c), then it has no prototype,
/// and is treated as a K&R-style function.
+ LLVM_PREFERRED_TYPE(bool)
unsigned hasPrototype : 1;
/// isVariadic - If this function has a prototype, and if that
/// proto ends with ',...)', this is true. When true, EllipsisLoc
/// contains the location of the ellipsis.
+ LLVM_PREFERRED_TYPE(bool)
unsigned isVariadic : 1;
/// Can this declaration be a constructor-style initializer?
+ LLVM_PREFERRED_TYPE(bool)
unsigned isAmbiguous : 1;
/// Whether the ref-qualifier (if any) is an lvalue reference.
/// Otherwise, it's an rvalue reference.
+ LLVM_PREFERRED_TYPE(bool)
unsigned RefQualifierIsLValueRef : 1;
/// ExceptionSpecType - An ExceptionSpecificationType value.
+ LLVM_PREFERRED_TYPE(ExceptionSpecificationType)
unsigned ExceptionSpecType : 4;
/// DeleteParams - If this is true, we need to delete[] Params.
+ LLVM_PREFERRED_TYPE(bool)
unsigned DeleteParams : 1;
/// HasTrailingReturnType - If this is true, a trailing return type was
/// specified.
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasTrailingReturnType : 1;
/// The location of the left parenthesis in the source.
@@ -1567,6 +1599,7 @@ struct DeclaratorChunk {
struct BlockPointerTypeInfo {
/// For now, sema will catch these as invalid.
/// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic.
+ LLVM_PREFERRED_TYPE(DeclSpec::TQ)
unsigned TypeQuals : 5;
void destroy() {
@@ -1575,6 +1608,7 @@ struct DeclaratorChunk {
struct MemberPointerTypeInfo {
/// The type qualifiers: const/volatile/restrict/__unaligned/_Atomic.
+ LLVM_PREFERRED_TYPE(DeclSpec::TQ)
unsigned TypeQuals : 5;
/// Location of the '*' token.
SourceLocation StarLoc;
@@ -1767,6 +1801,7 @@ private:
/// The bindings.
Binding *Bindings;
unsigned NumBindings : 31;
+ LLVM_PREFERRED_TYPE(bool)
unsigned DeleteBindings : 1;
friend class Declarator;
@@ -1883,33 +1918,42 @@ private:
SmallVector<DeclaratorChunk, 8> DeclTypeInfo;
/// InvalidType - Set by Sema::GetTypeForDeclarator().
+ LLVM_PREFERRED_TYPE(bool)
unsigned InvalidType : 1;
/// GroupingParens - Set by Parser::ParseParenDeclarator().
+ LLVM_PREFERRED_TYPE(bool)
unsigned GroupingParens : 1;
/// FunctionDefinition - Is this Declarator for a function or member
/// definition and, if so, what kind?
///
/// Actually a FunctionDefinitionKind.
+ LLVM_PREFERRED_TYPE(FunctionDefinitionKind)
unsigned FunctionDefinition : 2;
/// Is this Declarator a redeclaration?
+ LLVM_PREFERRED_TYPE(bool)
unsigned Redeclaration : 1;
/// true if the declaration is preceded by \c __extension__.
+ LLVM_PREFERRED_TYPE(bool)
unsigned Extension : 1;
/// Indicates whether this is an Objective-C instance variable.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ObjCIvar : 1;
/// Indicates whether this is an Objective-C 'weak' property.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ObjCWeakProperty : 1;
/// Indicates whether the InlineParams / InlineBindings storage has been used.
+ LLVM_PREFERRED_TYPE(bool)
unsigned InlineStorageUsed : 1;
/// Indicates whether this declarator has an initializer.
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasInitializer : 1;
/// Attributes attached to the declarator.
diff --git a/clang/include/clang/Sema/DelayedDiagnostic.h b/clang/include/clang/Sema/DelayedDiagnostic.h
index 9de7131f74c7..0105089a393f 100644
--- a/clang/include/clang/Sema/DelayedDiagnostic.h
+++ b/clang/include/clang/Sema/DelayedDiagnostic.h
@@ -111,7 +111,9 @@ public:
}
private:
+ LLVM_PREFERRED_TYPE(AccessSpecifier)
unsigned Access : 2;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsMember : 1;
NamedDecl *Target;
CXXRecordDecl *NamingClass;
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 6ccabad3af54..9b342c091684 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -278,40 +278,50 @@ class Sema;
/// Whether this is the deprecated conversion of a
/// string literal to a pointer to non-const character data
/// (C++ 4.2p2).
+ LLVM_PREFERRED_TYPE(bool)
unsigned DeprecatedStringLiteralToCharPtr : 1;
/// Whether the qualification conversion involves a change in the
/// Objective-C lifetime (for automatic reference counting).
+ LLVM_PREFERRED_TYPE(bool)
unsigned QualificationIncludesObjCLifetime : 1;
/// IncompatibleObjC - Whether this is an Objective-C conversion
/// that we should warn about (if we actually use it).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncompatibleObjC : 1;
/// ReferenceBinding - True when this is a reference binding
/// (C++ [over.ics.ref]).
+ LLVM_PREFERRED_TYPE(bool)
unsigned ReferenceBinding : 1;
/// DirectBinding - True when this is a reference binding that is a
/// direct binding (C++ [dcl.init.ref]).
+ LLVM_PREFERRED_TYPE(bool)
unsigned DirectBinding : 1;
/// Whether this is an lvalue reference binding (otherwise, it's
/// an rvalue reference binding).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsLvalueReference : 1;
/// Whether we're binding to a function lvalue.
+ LLVM_PREFERRED_TYPE(bool)
unsigned BindsToFunctionLvalue : 1;
/// Whether we're binding to an rvalue.
+ LLVM_PREFERRED_TYPE(bool)
unsigned BindsToRvalue : 1;
/// Whether this binds an implicit object argument to a
/// non-static member function without a ref-qualifier.
+ LLVM_PREFERRED_TYPE(bool)
unsigned BindsImplicitObjectArgumentWithoutRefQualifier : 1;
/// Whether this binds a reference to an object with a different
/// Objective-C lifetime qualifier.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ObjCLifetimeConversionBinding : 1;
/// FromType - The type that this conversion is converting
@@ -541,9 +551,11 @@ class Sema;
};
/// ConversionKind - The kind of implicit conversion sequence.
+ LLVM_PREFERRED_TYPE(Kind)
unsigned ConversionKind : 31;
// Whether the initializer list was of an incomplete array.
+ LLVM_PREFERRED_TYPE(bool)
unsigned InitializerListOfIncompleteArray : 1;
/// When initializing an array or std::initializer_list from an
@@ -878,6 +890,7 @@ class Sema;
CallExpr::ADLCallKind IsADLCandidate : 1;
/// Whether this is a rewritten candidate, and if so, of what kind?
+ LLVM_PREFERRED_TYPE(OverloadCandidateRewriteKind)
unsigned RewriteKind : 2;
/// FailureKind - The reason why this candidate is not viable.
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 8c0edca1ebc5..8c3ba39031aa 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -82,7 +82,9 @@ struct AvailabilityData {
struct TypeTagForDatatypeData {
ParsedType MatchingCType;
+ LLVM_PREFERRED_TYPE(bool)
unsigned LayoutCompatible : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned MustBeNull : 1;
};
struct PropertyData {
@@ -149,33 +151,41 @@ private:
unsigned NumArgs : 16;
/// True if already diagnosed as invalid.
+ LLVM_PREFERRED_TYPE(bool)
mutable unsigned Invalid : 1;
/// True if this attribute was used as a type attribute.
+ LLVM_PREFERRED_TYPE(bool)
mutable unsigned UsedAsTypeAttr : 1;
/// True if this has the extra information associated with an
/// availability attribute.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsAvailability : 1;
/// True if this has extra information associated with a
/// type_tag_for_datatype attribute.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsTypeTagForDatatype : 1;
/// True if this has extra information associated with a
/// Microsoft __delcspec(property) attribute.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsProperty : 1;
/// True if this has a ParsedType
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasParsedType : 1;
/// True if the processing cache is valid.
+ LLVM_PREFERRED_TYPE(bool)
mutable unsigned HasProcessingCache : 1;
/// A cached value.
mutable unsigned ProcessingCache : 8;
/// True if the attribute is specified using '#pragma clang attribute'.
+ LLVM_PREFERRED_TYPE(bool)
mutable unsigned IsPragmaClangAttribute : 1;
/// The location of the 'unavailable' keyword in an
diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h
index 6eaa74382685..700e361ef83f 100644
--- a/clang/include/clang/Sema/ScopeInfo.h
+++ b/clang/include/clang/Sema/ScopeInfo.h
@@ -97,6 +97,8 @@ public:
: PD(PD), Loc(Loc), Stmts(Stmts) {}
};
+enum class FirstCoroutineStmtKind { CoReturn, CoAwait, CoYield };
+
/// Retains information about a function, method, or block that is
/// currently being parsed.
class FunctionScopeInfo {
@@ -170,6 +172,7 @@ public:
/// An enumeration representing the kind of the first coroutine statement
/// in the function. One of co_return, co_await, or co_yield.
+ LLVM_PREFERRED_TYPE(FirstCoroutineStmtKind)
unsigned char FirstCoroutineStmtKind : 2;
/// Whether we found an immediate-escalating expression.
@@ -502,22 +505,30 @@ public:
assert(FirstCoroutineStmtLoc.isInvalid() &&
"first coroutine statement location already set");
FirstCoroutineStmtLoc = Loc;
- FirstCoroutineStmtKind = llvm::StringSwitch<unsigned char>(Keyword)
- .Case("co_return", 0)
- .Case("co_await", 1)
- .Case("co_yield", 2);
+ FirstCoroutineStmtKind =
+ llvm::StringSwitch<unsigned char>(Keyword)
+ .Case("co_return",
+ llvm::to_underlying(FirstCoroutineStmtKind::CoReturn))
+ .Case("co_await",
+ llvm::to_underlying(FirstCoroutineStmtKind::CoAwait))
+ .Case("co_yield",
+ llvm::to_underlying(FirstCoroutineStmtKind::CoYield));
}
StringRef getFirstCoroutineStmtKeyword() const {
assert(FirstCoroutineStmtLoc.isValid()
&& "no coroutine statement available");
- switch (FirstCoroutineStmtKind) {
- case 0: return "co_return";
- case 1: return "co_await";
- case 2: return "co_yield";
- default:
- llvm_unreachable("FirstCoroutineStmtKind has an invalid value");
+ auto Value =
+ static_cast<enum FirstCoroutineStmtKind>(FirstCoroutineStmtKind);
+ switch (Value) {
+ case FirstCoroutineStmtKind::CoReturn:
+ return "co_return";
+ case FirstCoroutineStmtKind::CoAwait:
+ return "co_await";
+ case FirstCoroutineStmtKind::CoYield:
+ return "co_yield";
};
+ llvm_unreachable("FirstCoroutineStmtKind has an invalid value");
}
void setNeedsCoroutineSuspends(bool value = true) {
@@ -582,25 +593,31 @@ class Capture {
QualType CaptureType;
/// The CaptureKind of this capture.
+ LLVM_PREFERRED_TYPE(CaptureKind)
unsigned Kind : 2;
/// Whether this is a nested capture (a capture of an enclosing capturing
/// scope's capture).
+ LLVM_PREFERRED_TYPE(bool)
unsigned Nested : 1;
/// Whether this is a capture of '*this'.
+ LLVM_PREFERRED_TYPE(bool)
unsigned CapturesThis : 1;
/// Whether an explicit capture has been odr-used in the body of the
/// lambda.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ODRUsed : 1;
/// Whether an explicit capture has been non-odr-used in the body of
/// the lambda.
+ LLVM_PREFERRED_TYPE(bool)
unsigned NonODRUsed : 1;
/// Whether the capture is invalid (a capture was required but the entity is
/// non-capturable).
+ LLVM_PREFERRED_TYPE(bool)
unsigned Invalid : 1;
public:
@@ -925,8 +942,8 @@ public:
/// that were defined in parent contexts. Used to avoid warnings when the
/// shadowed variables are uncaptured by this lambda.
struct ShadowedOuterDecl {
- const VarDecl *VD;
- const VarDecl *ShadowedDecl;
+ const NamedDecl *VD;
+ const NamedDecl *ShadowedDecl;
};
llvm::SmallVector<ShadowedOuterDecl, 4> ShadowingDecls;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 3c26003b5bda..ed933f27f8df 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -353,6 +353,72 @@ private:
llvm::function_ref<QualType()> ComputeType;
};
+/// Describes the result of template argument deduction.
+///
+/// The TemplateDeductionResult enumeration describes the result of
+/// template argument deduction, as returned from
+/// DeduceTemplateArguments(). The separate TemplateDeductionInfo
+/// structure provides additional information about the results of
+/// template argument deduction, e.g., the deduced template argument
+/// list (if successful) or the specific template parameters or
+/// deduced arguments that were involved in the failure.
+enum class TemplateDeductionResult {
+ /// Template argument deduction was successful.
+ Success = 0,
+ /// The declaration was invalid; do nothing.
+ Invalid,
+ /// Template argument deduction exceeded the maximum template
+ /// instantiation depth (which has already been diagnosed).
+ InstantiationDepth,
+ /// Template argument deduction did not deduce a value
+ /// for every template parameter.
+ Incomplete,
+ /// Template argument deduction did not deduce a value for every
+ /// expansion of an expanded template parameter pack.
+ IncompletePack,
+ /// Template argument deduction produced inconsistent
+ /// deduced values for the given template parameter.
+ Inconsistent,
+ /// Template argument deduction failed due to inconsistent
+ /// cv-qualifiers on a template parameter type that would
+ /// otherwise be deduced, e.g., we tried to deduce T in "const T"
+ /// but were given a non-const "X".
+ Underqualified,
+ /// Substitution of the deduced template argument values
+ /// resulted in an error.
+ SubstitutionFailure,
+ /// After substituting deduced template arguments, a dependent
+ /// parameter type did not match the corresponding argument.
+ DeducedMismatch,
+ /// After substituting deduced template arguments, an element of
+ /// a dependent parameter type did not match the corresponding element
+ /// of the corresponding argument (when deducing from an initializer list).
+ DeducedMismatchNested,
+ /// A non-dependent component of the parameter did not match the
+ /// corresponding component of the argument.
+ NonDeducedMismatch,
+ /// When performing template argument deduction for a function
+ /// template, there were too many call arguments.
+ TooManyArguments,
+ /// When performing template argument deduction for a function
+ /// template, there were too few call arguments.
+ TooFewArguments,
+ /// The explicitly-specified template arguments were not valid
+ /// template arguments for the given template.
+ InvalidExplicitArguments,
+ /// Checking non-dependent argument conversions failed.
+ NonDependentConversionFailure,
+ /// The deduced arguments did not satisfy the constraints associated
+ /// with the template.
+ ConstraintsNotSatisfied,
+ /// Deduction failed; that's all we know.
+ MiscellaneousDeductionFailure,
+ /// CUDA Target attributes do not match.
+ CUDATargetMismatch,
+ /// Some error which was already diagnosed.
+ AlreadyDiagnosed
+};
+
/// Sema - This implements semantic analysis and AST building for C.
class Sema final {
Sema(const Sema &) = delete;
@@ -3501,29 +3567,29 @@ public:
/// For a defaulted function, the kind of defaulted function that it is.
class DefaultedFunctionKind {
- CXXSpecialMember SpecialMember : 8;
- DefaultedComparisonKind Comparison : 8;
+ unsigned SpecialMember : 8;
+ unsigned Comparison : 8;
public:
DefaultedFunctionKind()
- : SpecialMember(CXXInvalid), Comparison(DefaultedComparisonKind::None) {
+ : SpecialMember(CXXInvalid), Comparison(llvm::to_underlying(DefaultedComparisonKind::None)) {
}
DefaultedFunctionKind(CXXSpecialMember CSM)
- : SpecialMember(CSM), Comparison(DefaultedComparisonKind::None) {}
+ : SpecialMember(CSM), Comparison(llvm::to_underlying(DefaultedComparisonKind::None)) {}
DefaultedFunctionKind(DefaultedComparisonKind Comp)
- : SpecialMember(CXXInvalid), Comparison(Comp) {}
+ : SpecialMember(CXXInvalid), Comparison(llvm::to_underlying(Comp)) {}
bool isSpecialMember() const { return SpecialMember != CXXInvalid; }
bool isComparison() const {
- return Comparison != DefaultedComparisonKind::None;
+ return static_cast<DefaultedComparisonKind>(Comparison) != DefaultedComparisonKind::None;
}
explicit operator bool() const {
return isSpecialMember() || isComparison();
}
- CXXSpecialMember asSpecialMember() const { return SpecialMember; }
- DefaultedComparisonKind asComparison() const { return Comparison; }
+ CXXSpecialMember asSpecialMember() const { return static_cast<CXXSpecialMember>(SpecialMember); }
+ DefaultedComparisonKind asComparison() const { return static_cast<DefaultedComparisonKind>(Comparison); }
/// Get the index of this function kind for use in diagnostics.
unsigned getDiagnosticIndex() const {
@@ -3531,7 +3597,7 @@ public:
"invalid should have highest index");
static_assert((unsigned)DefaultedComparisonKind::None == 0,
"none should be equal to zero");
- return SpecialMember + (unsigned)Comparison;
+ return SpecialMember + Comparison;
}
};
@@ -4838,13 +4904,12 @@ public:
llvm::Error isValidSectionSpecifier(StringRef Str);
bool checkSectionName(SourceLocation LiteralLoc, StringRef Str);
bool checkTargetAttr(SourceLocation LiteralLoc, StringRef Str);
- bool checkTargetVersionAttr(SourceLocation LiteralLoc, StringRef &Str,
- bool &isDefault);
- bool
- checkTargetClonesAttrString(SourceLocation LiteralLoc, StringRef Str,
- const StringLiteral *Literal, bool &HasDefault,
- bool &HasCommas, bool &HasNotDefault,
- SmallVectorImpl<SmallString<64>> &StringsBuffer);
+ bool checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
+ StringRef &Str, bool &isDefault);
+ bool checkTargetClonesAttrString(
+ SourceLocation LiteralLoc, StringRef Str, const StringLiteral *Literal,
+ Decl *D, bool &HasDefault, bool &HasCommas, bool &HasNotDefault,
+ SmallVectorImpl<SmallString<64>> &StringsBuffer);
bool checkMSInheritanceAttrOnDefinition(
CXXRecordDecl *RD, SourceRange Range, bool BestCase,
MSInheritanceModel SemanticSpelling);
@@ -9262,72 +9327,6 @@ public:
QualType adjustCCAndNoReturn(QualType ArgFunctionType, QualType FunctionType,
bool AdjustExceptionSpec = false);
- /// Describes the result of template argument deduction.
- ///
- /// The TemplateDeductionResult enumeration describes the result of
- /// template argument deduction, as returned from
- /// DeduceTemplateArguments(). The separate TemplateDeductionInfo
- /// structure provides additional information about the results of
- /// template argument deduction, e.g., the deduced template argument
- /// list (if successful) or the specific template parameters or
- /// deduced arguments that were involved in the failure.
- enum TemplateDeductionResult {
- /// Template argument deduction was successful.
- TDK_Success = 0,
- /// The declaration was invalid; do nothing.
- TDK_Invalid,
- /// Template argument deduction exceeded the maximum template
- /// instantiation depth (which has already been diagnosed).
- TDK_InstantiationDepth,
- /// Template argument deduction did not deduce a value
- /// for every template parameter.
- TDK_Incomplete,
- /// Template argument deduction did not deduce a value for every
- /// expansion of an expanded template parameter pack.
- TDK_IncompletePack,
- /// Template argument deduction produced inconsistent
- /// deduced values for the given template parameter.
- TDK_Inconsistent,
- /// Template argument deduction failed due to inconsistent
- /// cv-qualifiers on a template parameter type that would
- /// otherwise be deduced, e.g., we tried to deduce T in "const T"
- /// but were given a non-const "X".
- TDK_Underqualified,
- /// Substitution of the deduced template argument values
- /// resulted in an error.
- TDK_SubstitutionFailure,
- /// After substituting deduced template arguments, a dependent
- /// parameter type did not match the corresponding argument.
- TDK_DeducedMismatch,
- /// After substituting deduced template arguments, an element of
- /// a dependent parameter type did not match the corresponding element
- /// of the corresponding argument (when deducing from an initializer list).
- TDK_DeducedMismatchNested,
- /// A non-depnedent component of the parameter did not match the
- /// corresponding component of the argument.
- TDK_NonDeducedMismatch,
- /// When performing template argument deduction for a function
- /// template, there were too many call arguments.
- TDK_TooManyArguments,
- /// When performing template argument deduction for a function
- /// template, there were too few call arguments.
- TDK_TooFewArguments,
- /// The explicitly-specified template arguments were not valid
- /// template arguments for the given template.
- TDK_InvalidExplicitArguments,
- /// Checking non-dependent argument conversions failed.
- TDK_NonDependentConversionFailure,
- /// The deduced arguments did not satisfy the constraints associated
- /// with the template.
- TDK_ConstraintsNotSatisfied,
- /// Deduction failed; that's all we know.
- TDK_MiscellaneousDeductionFailure,
- /// CUDA Target attributes do not match.
- TDK_CUDATargetMismatch,
- /// Some error which was already diagnosed.
- TDK_AlreadyDiagnosed
- };
-
TemplateDeductionResult
DeduceTemplateArguments(ClassTemplatePartialSpecializationDecl *Partial,
ArrayRef<TemplateArgument> TemplateArgs,
@@ -14445,7 +14444,7 @@ public:
};
DeductionFailureInfo
-MakeDeductionFailureInfo(ASTContext &Context, Sema::TemplateDeductionResult TDK,
+MakeDeductionFailureInfo(ASTContext &Context, TemplateDeductionResult TDK,
sema::TemplateDeductionInfo &Info);
/// Contains a late templated function.
diff --git a/clang/include/clang/Sema/TemplateDeduction.h b/clang/include/clang/Sema/TemplateDeduction.h
index 85691c66a044..28b014fd84e4 100644
--- a/clang/include/clang/Sema/TemplateDeduction.h
+++ b/clang/include/clang/Sema/TemplateDeduction.h
@@ -33,6 +33,7 @@ namespace clang {
class Decl;
struct DeducedPack;
class Sema;
+enum class TemplateDeductionResult;
namespace sema {
@@ -295,6 +296,10 @@ struct DeductionFailureInfo {
/// Free any memory associated with this deduction failure.
void Destroy();
+
+ TemplateDeductionResult getResult() const {
+ return static_cast<TemplateDeductionResult>(Result);
+ }
};
/// TemplateSpecCandidate - This is a generalization of OverloadCandidate
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 9de925163599..f31efa5117f0 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -2018,6 +2018,9 @@ enum StmtCode {
// SYCLUniqueStableNameExpr
EXPR_SYCL_UNIQUE_STABLE_NAME,
+
+ // OpenACC Constructs
+ STMT_OPENACC_COMPUTE_CONSTRUCT,
};
/// The kinds of designators that can occur in a
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index a4c7f54ab9e8..2002bf23c959 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -721,6 +721,7 @@ private:
unsigned ID;
/// Whether this is a wildcard export.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsWildcard : 1;
/// String data.
diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h
index 30bf36edb7bf..ef9d6c15724b 100644
--- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h
+++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h
@@ -554,7 +554,9 @@ struct RVVIntrinsicRecord {
bool HasMaskPolicy : 1;
bool HasFRMRoundModeOp : 1;
bool IsTuple : 1;
+ LLVM_PREFERRED_TYPE(PolicyScheme)
uint8_t UnMaskedPolicyScheme : 2;
+ LLVM_PREFERRED_TYPE(PolicyScheme)
uint8_t MaskedPolicyScheme : 2;
};
diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
index 051363b075de..13ad25308649 100644
--- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
+++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
@@ -166,7 +166,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path,
const Module *Imported) override;
diff --git a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h
index c91e4a6b0ac5..d167b7e45467 100644
--- a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h
+++ b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h
@@ -164,7 +164,7 @@ struct IncludeStyle {
/// When guessing whether a #include is the "main" include, only the include
/// directives that use the specified character are considered.
- /// \version 18
+ /// \version 19
MainIncludeCharDiscriminator MainIncludeChar;
};
diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index 794526bc289c..acd960c9c932 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -54,7 +54,6 @@ module Clang_Basic {
textual header "clang/Basic/BuiltinsNEON.def"
textual header "clang/Basic/BuiltinsNVPTX.def"
textual header "clang/Basic/BuiltinsPPC.def"
- textual header "clang/Basic/BuiltinsRISCV.def"
textual header "clang/Basic/BuiltinsRISCVVector.def"
textual header "clang/Basic/BuiltinsSME.def"
textual header "clang/Basic/BuiltinsSVE.def"
@@ -81,6 +80,7 @@ module Clang_Basic {
textual header "clang/Basic/RISCVVTypes.def"
textual header "clang/Basic/Sanitizers.def"
textual header "clang/Basic/TargetCXXABI.def"
+ textual header "clang/Basic/TargetOSMacros.def"
textual header "clang/Basic/TransformTypeTraits.def"
textual header "clang/Basic/TokenKinds.def"
textual header "clang/Basic/WebAssemblyReferenceTypes.def"
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index 3b7ebbbd89ea..fe6e03ce174e 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -74,6 +74,7 @@
#include "clang/AST/ExprOpenMP.h"
#include "clang/AST/NestedNameSpecifier.h"
#include "clang/AST/StmtObjC.h"
+#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/TemplateBase.h"
#include "clang/AST/TemplateName.h"
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index ebcb3952198a..49dcf2e4da3e 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -112,6 +112,7 @@ add_clang_library(clangAST
StmtCXX.cpp
StmtIterator.cpp
StmtObjC.cpp
+ StmtOpenACC.cpp
StmtOpenMP.cpp
StmtPrinter.cpp
StmtProfile.cpp
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 26fdfa040796..5d6bb72a208a 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3537,10 +3537,23 @@ bool FunctionDecl::isTargetMultiVersion() const {
(hasAttr<TargetAttr>() || hasAttr<TargetVersionAttr>());
}
+bool FunctionDecl::isTargetMultiVersionDefault() const {
+ if (!isMultiVersion())
+ return false;
+ if (hasAttr<TargetAttr>())
+ return getAttr<TargetAttr>()->isDefaultVersion();
+ return hasAttr<TargetVersionAttr>() &&
+ getAttr<TargetVersionAttr>()->isDefaultVersion();
+}
+
bool FunctionDecl::isTargetClonesMultiVersion() const {
return isMultiVersion() && hasAttr<TargetClonesAttr>();
}
+bool FunctionDecl::isTargetVersionMultiVersion() const {
+ return isMultiVersion() && hasAttr<TargetVersionAttr>();
+}
+
void
FunctionDecl::setPreviousDeclaration(FunctionDecl *PrevDecl) {
redeclarable_base::setPreviousDecl(PrevDecl);
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 822ac12c4c7d..43d221968ea3 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1215,6 +1215,10 @@ void DeclPrinter::printTemplateParameters(const TemplateParameterList *Params,
bool OmitTemplateKW) {
assert(Params);
+ // Don't print invented template parameter lists.
+ if (!Params->empty() && Params->getParam(0)->isImplicit())
+ return;
+
if (!OmitTemplateKW)
Out << "template ";
Out << '<';
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index d665a08deb47..8b10e2895832 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -3328,6 +3328,12 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef,
DIUE->getUpdater()->isConstantInitializer(Ctx, false, Culprit);
}
case InitListExprClass: {
+ // C++ [dcl.init.aggr]p2:
+ // The elements of an aggregate are:
+ // - for an array, the array elements in increasing subscript order, or
+ // - for a class, the direct base classes in declaration order, followed
+ // by the direct non-static data members (11.4) that are not members of
+ // an anonymous union, in declaration order.
const InitListExpr *ILE = cast<InitListExpr>(this);
assert(ILE->isSemanticForm() && "InitListExpr must be in semantic form");
if (ILE->getType()->isArrayType()) {
@@ -3342,6 +3348,19 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef,
if (ILE->getType()->isRecordType()) {
unsigned ElementNo = 0;
RecordDecl *RD = ILE->getType()->castAs<RecordType>()->getDecl();
+
+ // In C++17, bases were added to the list of members used by aggregate
+ // initialization.
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ for (unsigned i = 0, e = CXXRD->getNumBases(); i < e; i++) {
+ if (ElementNo < ILE->getNumInits()) {
+ const Expr *Elt = ILE->getInit(ElementNo++);
+ if (!Elt->isConstantInitializer(Ctx, false, Culprit))
+ return false;
+ }
+ }
+ }
+
for (const auto *Field : RD->fields()) {
// If this is a union, skip all the fields that aren't being initialized.
if (RD->isUnion() && ILE->getInitializedFieldInUnion() != Field)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 089bc2094567..33ad94e6795c 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -240,15 +240,19 @@ namespace {
/// True if the subobject was named in a manner not supported by C++11. Such
/// lvalues can still be folded, but they are not core constant expressions
/// and we cannot perform lvalue-to-rvalue conversions on them.
+ LLVM_PREFERRED_TYPE(bool)
unsigned Invalid : 1;
/// Is this a pointer one past the end of an object?
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsOnePastTheEnd : 1;
/// Indicator of whether the first entry is an unsized array.
+ LLVM_PREFERRED_TYPE(bool)
unsigned FirstEntryIsAnUnsizedArray : 1;
/// Indicator of whether the most-derived object is an array element.
+ LLVM_PREFERRED_TYPE(bool)
unsigned MostDerivedIsArrayElement : 1;
/// The length of the path to the most-derived object of which this is a
@@ -8006,7 +8010,8 @@ public:
assert(CorrespondingCallOpSpecialization &&
"We must always have a function call operator specialization "
"that corresponds to our static invoker specialization");
- FD = cast<CXXMethodDecl>(CorrespondingCallOpSpecialization);
+ assert(isa<CXXMethodDecl>(CorrespondingCallOpSpecialization));
+ FD = CorrespondingCallOpSpecialization;
} else
FD = LambdaCallOp;
} else if (FD->isReplaceableGlobalAllocationFunction()) {
diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
index 8bbfa928bd64..e697e24fb341 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
@@ -23,6 +23,34 @@ using namespace clang;
using namespace clang::interp;
Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
+ bool IsLambdaStaticInvoker = false;
+ if (const auto *MD = dyn_cast<CXXMethodDecl>(FuncDecl);
+ MD && MD->isLambdaStaticInvoker()) {
+ // For a lambda static invoker, we might have to pick a specialized
+ // version if the lambda is generic. In that case, the picked function
+ // will *NOT* be a static invoker anymore. However, it will still
+ // be a non-static member function, thus (usually) requiring an
+ // instance pointer. We suppress that later in this function.
+ IsLambdaStaticInvoker = true;
+
+ const CXXRecordDecl *ClosureClass = MD->getParent();
+ assert(ClosureClass->captures_begin() == ClosureClass->captures_end());
+ if (ClosureClass->isGenericLambda()) {
+ const CXXMethodDecl *LambdaCallOp = ClosureClass->getLambdaCallOperator();
+ assert(MD->isFunctionTemplateSpecialization() &&
+ "A generic lambda's static-invoker function must be a "
+ "template specialization");
+ const TemplateArgumentList *TAL = MD->getTemplateSpecializationArgs();
+ FunctionTemplateDecl *CallOpTemplate =
+ LambdaCallOp->getDescribedFunctionTemplate();
+ void *InsertPos = nullptr;
+ const FunctionDecl *CorrespondingCallOpSpecialization =
+ CallOpTemplate->findSpecialization(TAL->asArray(), InsertPos);
+ assert(CorrespondingCallOpSpecialization);
+ FuncDecl = cast<CXXMethodDecl>(CorrespondingCallOpSpecialization);
+ }
+ }
+
// Set up argument indices.
unsigned ParamOffset = 0;
SmallVector<PrimType, 8> ParamTypes;
@@ -46,7 +74,7 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
// InterpStack when calling the function.
bool HasThisPointer = false;
if (const auto *MD = dyn_cast<CXXMethodDecl>(FuncDecl)) {
- if (MD->isImplicitObjectMemberFunction()) {
+ if (MD->isImplicitObjectMemberFunction() && !IsLambdaStaticInvoker) {
HasThisPointer = true;
ParamTypes.push_back(PT_Ptr);
ParamOffsets.push_back(ParamOffset);
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 59fddfc2da19..91b9985eefbd 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -167,7 +167,9 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
return this->emitNull(classifyPrim(CE->getType()), CE);
case CK_PointerToIntegral: {
- // TODO: Discard handling.
+ if (DiscardResult)
+ return this->discard(SubExpr);
+
if (!this->visit(SubExpr))
return false;
@@ -464,7 +466,7 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) {
// Special case for C++'s three-way/spaceship operator <=>, which
// returns a std::{strong,weak,partial}_ordering (which is a class, so doesn't
// have a PrimType).
- if (!T && Ctx.getLangOpts().CPlusPlus) {
+ if (!T && BO->getOpcode() == BO_Cmp) {
if (DiscardResult)
return true;
const ComparisonCategoryInfo *CmpInfo =
@@ -1750,8 +1752,7 @@ bool ByteCodeExprGen<Emitter>::VisitPredefinedExpr(const PredefinedExpr *E) {
if (DiscardResult)
return true;
- assert(!Initializing);
- return this->visit(E->getFunctionName());
+ return this->delegate(E->getFunctionName());
}
template <class Emitter>
@@ -2020,6 +2021,117 @@ bool ByteCodeExprGen<Emitter>::VisitObjCBoolLiteralExpr(
return this->emitConst(E->getValue(), E);
}
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitCXXInheritedCtorInitExpr(
+ const CXXInheritedCtorInitExpr *E) {
+ const CXXConstructorDecl *Ctor = E->getConstructor();
+ assert(!Ctor->isTrivial() &&
+ "Trivial CXXInheritedCtorInitExpr, implement. (possible?)");
+ const Function *F = this->getFunction(Ctor);
+ assert(F);
+ assert(!F->hasRVO());
+ assert(F->hasThisPointer());
+
+ if (!this->emitDupPtr(SourceInfo{}))
+ return false;
+
+ // Forward all arguments of the current function (which should be a
+ // constructor itself) to the inherited ctor.
+ // This is necessary because the calling code has pushed the pointer
+ // of the correct base for us already, but the arguments need
+ // to come after.
+ unsigned Offset = align(primSize(PT_Ptr)); // instance pointer.
+ for (const ParmVarDecl *PD : Ctor->parameters()) {
+ PrimType PT = this->classify(PD->getType()).value_or(PT_Ptr);
+
+ if (!this->emitGetParam(PT, Offset, E))
+ return false;
+ Offset += align(primSize(PT));
+ }
+
+ return this->emitCall(F, E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitExpressionTraitExpr(
+ const ExpressionTraitExpr *E) {
+ assert(Ctx.getLangOpts().CPlusPlus);
+ return this->emitConstBool(E->getValue(), E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
+ if (DiscardResult)
+ return true;
+ assert(!Initializing);
+
+ std::optional<unsigned> GlobalIndex = P.getOrCreateGlobal(E->getGuidDecl());
+ if (!GlobalIndex)
+ return false;
+ if (!this->emitGetPtrGlobal(*GlobalIndex, E))
+ return false;
+
+ const Record *R = this->getRecord(E->getType());
+ assert(R);
+
+ const APValue &V = E->getGuidDecl()->getAsAPValue();
+ if (V.getKind() == APValue::None)
+ return true;
+
+ assert(V.isStruct());
+ assert(V.getStructNumBases() == 0);
+ // FIXME: This could be useful in visitAPValue, too.
+ for (unsigned I = 0, N = V.getStructNumFields(); I != N; ++I) {
+ const APValue &F = V.getStructField(I);
+ const Record::Field *RF = R->getField(I);
+
+ if (F.isInt()) {
+ PrimType T = classifyPrim(RF->Decl->getType());
+ if (!this->visitAPValue(F, T, E))
+ return false;
+ if (!this->emitInitField(T, RF->Offset, E))
+ return false;
+ } else if (F.isArray()) {
+ assert(RF->Desc->isPrimitiveArray());
+ const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
+ PrimType ElemT = classifyPrim(ArrType->getElementType());
+ assert(ArrType);
+
+ if (!this->emitDupPtr(E))
+ return false;
+ if (!this->emitGetPtrField(RF->Offset, E))
+ return false;
+
+ for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
+ if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
+ return false;
+ if (!this->emitInitElem(ElemT, A, E))
+ return false;
+ }
+
+ if (!this->emitPopPtr(E))
+ return false;
+ } else {
+ assert(false && "I don't think this should be possible");
+ }
+ }
+
+ return this->emitInitPtr(E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitRequiresExpr(const RequiresExpr *E) {
+ assert(classifyPrim(E->getType()) == PT_Bool);
+ return this->emitConstBool(E->isSatisfied(), E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitConceptSpecializationExpr(
+ const ConceptSpecializationExpr *E) {
+ assert(classifyPrim(E->getType()) == PT_Bool);
+ return this->emitConstBool(E->isSatisfied(), E);
+}
+
template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
if (E->containsErrors())
return false;
@@ -2246,8 +2358,7 @@ bool ByteCodeExprGen<Emitter>::dereferenceParam(
const Expr *LV, PrimType T, const ParmVarDecl *PD, DerefKind AK,
llvm::function_ref<bool(PrimType)> Direct,
llvm::function_ref<bool(PrimType)> Indirect) {
- auto It = this->Params.find(PD);
- if (It != this->Params.end()) {
+ if (auto It = this->Params.find(PD); It != this->Params.end()) {
unsigned Idx = It->second.Offset;
switch (AK) {
case DerefKind::Read:
@@ -2518,10 +2629,13 @@ bool ByteCodeExprGen<Emitter>::visitExpr(const Expr *E) {
// For us, that means everything we don't
// have a PrimType for.
if (std::optional<unsigned> LocalOffset = this->allocateLocal(E)) {
- if (!this->visitLocalInitializer(E, *LocalOffset))
+ if (!this->emitGetPtrLocal(*LocalOffset, E))
return false;
- if (!this->emitGetPtrLocal(*LocalOffset, E))
+ if (!visitInitializer(E))
+ return false;
+
+ if (!this->emitInitPtr(E))
return false;
return this->emitRetValue(E);
}
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 2c9cca5082b1..eeb56dc84565 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -111,6 +111,11 @@ public:
bool VisitGenericSelectionExpr(const GenericSelectionExpr *E);
bool VisitChooseExpr(const ChooseExpr *E);
bool VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *E);
+ bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E);
+ bool VisitExpressionTraitExpr(const ExpressionTraitExpr *E);
+ bool VisitCXXUuidofExpr(const CXXUuidofExpr *E);
+ bool VisitRequiresExpr(const RequiresExpr *E);
+ bool VisitConceptSpecializationExpr(const ConceptSpecializationExpr *E);
protected:
bool visitExpr(const Expr *E) override;
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index b0ec90a1f285..bedcc78dc235 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -144,6 +144,10 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) {
auto emitFieldInitializer = [&](const Record::Field *F, unsigned FieldOffset,
const Expr *InitExpr) -> bool {
+ // We don't know what to do with these, so just return false.
+ if (InitExpr->getType().isNull())
+ return false;
+
if (std::optional<PrimType> T = this->classify(InitExpr)) {
if (!this->visit(InitExpr))
return false;
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index 6cca9d5feede..6a53205af599 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -59,17 +59,22 @@ struct InlineDescriptor {
/// Flag indicating if the storage is constant or not.
/// Relevant for primitive fields.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsConst : 1;
/// For primitive fields, it indicates if the field was initialized.
/// Primitive fields in static storage are always initialized.
/// Arrays are always initialized, even though their elements might not be.
/// Base classes are initialized after the constructor is invoked.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsInitialized : 1;
/// Flag indicating if the field is an embedded base class.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsBase : 1;
/// Flag indicating if the field is the active member of a union.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsActive : 1;
/// Flag indicating if the field is mutable (if in a record).
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsFieldMutable : 1;
const Descriptor *Desc;
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index a60f893de8bd..945b78d7a609 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -36,7 +36,7 @@ EvalEmitter::~EvalEmitter() {
EvaluationResult EvalEmitter::interpretExpr(const Expr *E) {
EvalResult.setSource(E);
- if (!this->visitExpr(E))
+ if (!this->visitExpr(E) && EvalResult.empty())
EvalResult.setInvalid();
return std::move(this->EvalResult);
@@ -45,7 +45,7 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E) {
EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD) {
EvalResult.setSource(VD);
- if (!this->visitDecl(VD))
+ if (!this->visitDecl(VD) && EvalResult.empty())
EvalResult.setInvalid();
return std::move(this->EvalResult);
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index a76e63395157..e2fda18e3f44 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -811,7 +811,7 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) {
const auto *CmpValueInfo = CmpInfo->getValueInfo(CmpResult);
assert(CmpValueInfo);
assert(CmpValueInfo->hasValidIntValue());
- APSInt IntValue = CmpValueInfo->getIntValue();
+ const APSInt &IntValue = CmpValueInfo->getIntValue();
return SetThreeWayComparisonField(S, OpPC, P, IntValue);
}
@@ -1187,15 +1187,18 @@ inline bool GetPtrGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
/// 2) Pushes Pointer.atField(Off) on the stack
inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
+
if (S.inConstantContext() && !CheckNull(S, OpPC, Ptr, CSK_Field))
return false;
- if (!CheckExtern(S, OpPC, Ptr))
- return false;
- if (!CheckRange(S, OpPC, Ptr, CSK_Field))
- return false;
- if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
- return false;
+ if (CheckDummy(S, OpPC, Ptr)) {
+ if (!CheckExtern(S, OpPC, Ptr))
+ return false;
+ if (!CheckRange(S, OpPC, Ptr, CSK_Field))
+ return false;
+ if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
+ return false;
+ }
S.Stk.push<Pointer>(Ptr.atField(Off));
return true;
}
@@ -1278,13 +1281,16 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
inline bool InitPtrPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- Ptr.initialize();
+ if (Ptr.canBeInitialized())
+ Ptr.initialize();
return true;
}
inline bool InitPtr(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.peek<Pointer>();
- Ptr.initialize();
+
+ if (Ptr.canBeInitialized())
+ Ptr.initialize();
return true;
}
@@ -1856,7 +1862,7 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.peek<Pointer>();
if (!CheckDummy(S, OpPC, Ptr))
- return false;
+ return true;
if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
return false;
@@ -1869,8 +1875,10 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) {
const T &Offset = S.Stk.pop<T>();
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
- return false;
+ if (!CheckDummy(S, OpPC, Ptr)) {
+ S.Stk.push<Pointer>(Ptr);
+ return true;
+ }
if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
return false;
@@ -1891,6 +1899,11 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) {
inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
+ if (Ptr.isDummy()) {
+ S.Stk.push<Pointer>(Ptr);
+ return true;
+ }
+
if (!Ptr.isUnknownSizeArray()) {
S.Stk.push<Pointer>(Ptr.atIndex(0));
return true;
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index 316a7ed5fa1b..3f85635f4367 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -83,63 +83,48 @@ void Pointer::operator=(Pointer &&P) {
}
APValue Pointer::toAPValue() const {
- APValue::LValueBase Base;
llvm::SmallVector<APValue::LValuePathEntry, 5> Path;
- CharUnits Offset;
- bool IsNullPtr;
- bool IsOnePastEnd;
-
- if (isZero()) {
- Base = static_cast<const Expr *>(nullptr);
- IsNullPtr = true;
- IsOnePastEnd = false;
- Offset = CharUnits::Zero();
- } else {
- // Build the lvalue base from the block.
- const Descriptor *Desc = getDeclDesc();
- if (auto *VD = Desc->asValueDecl())
- Base = VD;
- else if (auto *E = Desc->asExpr())
- Base = E;
- else
- llvm_unreachable("Invalid allocation type");
-
- // Not a null pointer.
- IsNullPtr = false;
-
- if (isUnknownSizeArray()) {
- IsOnePastEnd = false;
- Offset = CharUnits::Zero();
- } else if (Desc->asExpr()) {
- // Pointer pointing to a an expression.
- IsOnePastEnd = false;
- Offset = CharUnits::Zero();
+
+ if (isZero())
+ return APValue(static_cast<const Expr *>(nullptr), CharUnits::Zero(), Path,
+ /*IsOnePastEnd=*/false, /*IsNullPtr=*/true);
+
+ // Build the lvalue base from the block.
+ const Descriptor *Desc = getDeclDesc();
+ APValue::LValueBase Base;
+ if (const auto *VD = Desc->asValueDecl())
+ Base = VD;
+ else if (const auto *E = Desc->asExpr())
+ Base = E;
+ else
+ llvm_unreachable("Invalid allocation type");
+
+ if (isDummy() || isUnknownSizeArray() || Desc->asExpr())
+ return APValue(Base, CharUnits::Zero(), Path,
+ /*IsOnePastEnd=*/false, /*IsNullPtr=*/false);
+
+ // TODO: compute the offset into the object.
+ CharUnits Offset = CharUnits::Zero();
+ bool IsOnePastEnd = isOnePastEnd();
+
+ // Build the path into the object.
+ Pointer Ptr = *this;
+ while (Ptr.isField() || Ptr.isArrayElement()) {
+ if (Ptr.isArrayElement()) {
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
+ Ptr = Ptr.getArray();
} else {
- // TODO: compute the offset into the object.
- Offset = CharUnits::Zero();
-
- // Build the path into the object.
- Pointer Ptr = *this;
- while (Ptr.isField() || Ptr.isArrayElement()) {
- if (Ptr.isArrayElement()) {
- Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
- Ptr = Ptr.getArray();
- } else {
- // TODO: figure out if base is virtual
- bool IsVirtual = false;
-
- // Create a path entry for the field.
- const Descriptor *Desc = Ptr.getFieldDesc();
- if (const auto *BaseOrMember = Desc->asDecl()) {
- Path.push_back(APValue::LValuePathEntry({BaseOrMember, IsVirtual}));
- Ptr = Ptr.getBase();
- continue;
- }
- llvm_unreachable("Invalid field type");
- }
+ // TODO: figure out if base is virtual
+ bool IsVirtual = false;
+
+ // Create a path entry for the field.
+ const Descriptor *Desc = Ptr.getFieldDesc();
+ if (const auto *BaseOrMember = Desc->asDecl()) {
+ Path.push_back(APValue::LValuePathEntry({BaseOrMember, IsVirtual}));
+ Ptr = Ptr.getBase();
+ continue;
}
-
- IsOnePastEnd = isOnePastEnd();
+ llvm_unreachable("Invalid field type");
}
}
@@ -149,7 +134,7 @@ APValue Pointer::toAPValue() const {
// Just invert the order of the elements.
std::reverse(Path.begin(), Path.end());
- return APValue(Base, Offset, Path, IsOnePastEnd, IsNullPtr);
+ return APValue(Base, Offset, Path, IsOnePastEnd, /*IsNullPtr=*/false);
}
std::string Pointer::toDiagnosticString(const ASTContext &Ctx) const {
@@ -247,11 +232,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx) const {
// Primitive values.
if (std::optional<PrimType> T = Ctx.classify(Ty)) {
- if (T == PT_Ptr || T == PT_FnPtr) {
- R = Ptr.toAPValue();
- } else {
- TYPE_SWITCH(*T, R = Ptr.deref<T>().toAPValue());
- }
+ TYPE_SWITCH(*T, R = Ptr.deref<T>().toAPValue());
return true;
}
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index b2b478af2e73..964c0377c6dc 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -169,7 +169,7 @@ std::optional<unsigned> Program::createGlobal(const ValueDecl *VD,
if (const auto *Var = dyn_cast<VarDecl>(VD)) {
IsStatic = Context::shouldBeGloballyIndexed(VD);
IsExtern = !Var->getAnyInitializer();
- } else if (isa<UnnamedGlobalConstantDecl>(VD)) {
+ } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl>(VD)) {
IsStatic = true;
IsExtern = false;
} else {
diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h
index 17342680102c..364a63dbf477 100644
--- a/clang/lib/AST/Interp/Program.h
+++ b/clang/lib/AST/Interp/Program.h
@@ -86,7 +86,7 @@ public:
std::optional<unsigned> getOrCreateDummy(const ValueDecl *VD);
/// Creates a global and returns its index.
- std::optional<unsigned> createGlobal(const ValueDecl *VD, const Expr *E);
+ std::optional<unsigned> createGlobal(const ValueDecl *VD, const Expr *Init);
/// Creates a global from a lifetime-extended temporary.
std::optional<unsigned> createGlobal(const Expr *E);
diff --git a/clang/lib/AST/Linkage.h b/clang/lib/AST/Linkage.h
index 31f384eb75d0..e4dcb5e53261 100644
--- a/clang/lib/AST/Linkage.h
+++ b/clang/lib/AST/Linkage.h
@@ -29,12 +29,15 @@ namespace clang {
struct LVComputationKind {
/// The kind of entity whose visibility is ultimately being computed;
/// visibility computations for types and non-types follow different rules.
+ LLVM_PREFERRED_TYPE(bool)
unsigned ExplicitKind : 1;
/// Whether explicit visibility attributes should be ignored. When set,
/// visibility may only be restricted by the visibility of template arguments.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IgnoreExplicitVisibility : 1;
/// Whether all visibility should be ignored. When set, we're only interested
/// in computing linkage.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IgnoreAllVisibility : 1;
enum { NumLVComputationKindBits = 3 };
diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp
index 6dfaadd92e79..a3b7431f7ffd 100644
--- a/clang/lib/AST/RecordLayoutBuilder.cpp
+++ b/clang/lib/AST/RecordLayoutBuilder.cpp
@@ -602,21 +602,28 @@ protected:
/// Whether the external AST source has provided a layout for this
/// record.
+ LLVM_PREFERRED_TYPE(bool)
unsigned UseExternalLayout : 1;
/// Whether we need to infer alignment, even when we have an
/// externally-provided layout.
+ LLVM_PREFERRED_TYPE(bool)
unsigned InferAlignment : 1;
/// Packed - Whether the record is packed or not.
+ LLVM_PREFERRED_TYPE(bool)
unsigned Packed : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsUnion : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsMac68kAlign : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsNaturalAlign : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsMsStruct : 1;
/// UnfilledBitsInLastUnit - If the last field laid out was a bitfield,
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index afd05881cb16..fe59d6070b3e 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -23,6 +23,7 @@
#include "clang/AST/ExprOpenMP.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtObjC.h"
+#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/Type.h"
#include "clang/Basic/CharInfo.h"
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
new file mode 100644
index 000000000000..1a99c2463818
--- /dev/null
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -0,0 +1,33 @@
+//===--- StmtOpenACC.cpp - Classes for OpenACC Constructs -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the subclasses of the Stmt class declared in StmtOpenACC.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/StmtOpenACC.h"
+#include "clang/AST/ASTContext.h"
+using namespace clang;
+
+OpenACCComputeConstruct *
+OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, EmptyShell) {
+ void *Mem = C.Allocate(sizeof(OpenACCComputeConstruct),
+ alignof(OpenACCComputeConstruct));
+ auto *Inst = new (Mem) OpenACCComputeConstruct;
+ return Inst;
+}
+
+OpenACCComputeConstruct *
+OpenACCComputeConstruct::Create(const ASTContext &C, OpenACCDirectiveKind K,
+ SourceLocation BeginLoc,
+ SourceLocation EndLoc) {
+ void *Mem = C.Allocate(sizeof(OpenACCComputeConstruct),
+ alignof(OpenACCComputeConstruct));
+ auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, EndLoc);
+ return Inst;
+}
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 1df040e06db3..d66c3ccce209 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1138,6 +1138,15 @@ void StmtPrinter::VisitOMPTargetParallelGenericLoopDirective(
}
//===----------------------------------------------------------------------===//
+// OpenACC construct printing methods
+//===----------------------------------------------------------------------===//
+void StmtPrinter::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) {
+ Indent() << "#pragma acc " << S->getDirectiveKind();
+ // TODO OpenACC: Print Clauses.
+ PrintStmt(S->getStructuredBlock());
+}
+
+//===----------------------------------------------------------------------===//
// Expr printing methods.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 1b817cf58b99..b545ff472e5a 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2441,6 +2441,13 @@ void StmtProfiler::VisitTemplateArgument(const TemplateArgument &Arg) {
}
}
+void StmtProfiler::VisitOpenACCComputeConstruct(
+ const OpenACCComputeConstruct *S) {
+ // VisitStmt handles children, so the AssociatedStmt is handled.
+ VisitStmt(S);
+ // TODO OpenACC: Visit Clauses.
+}
+
void Stmt::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context,
bool Canonical, bool ProfileLambdaExpr) const {
StmtProfilerWithPointers Profiler(ID, Context, Canonical, ProfileLambdaExpr);
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 0000d26dd49e..b683eb1edd8f 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -2668,3 +2668,8 @@ void TextNodeDumper::VisitHLSLBufferDecl(const HLSLBufferDecl *D) {
OS << " tbuffer";
dumpName(D);
}
+
+void TextNodeDumper::VisitOpenACCConstructStmt(const OpenACCConstructStmt *S) {
+ OS << " " << S->getDirectiveKind();
+ // TODO OpenACC: Dump clauses as well.
+}
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 24811ded970e..d487944ce921 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -887,34 +887,10 @@ Value *Environment::createValueUnlessSelfReferential(
if (Type->isRecordType()) {
CreatedValuesCount++;
- llvm::DenseMap<const ValueDecl *, StorageLocation *> FieldLocs;
- for (const FieldDecl *Field : DACtx->getModeledFields(Type)) {
- assert(Field != nullptr);
+ auto &Loc = cast<RecordStorageLocation>(createStorageLocation(Type));
+ initializeFieldsWithValues(Loc, Visited, Depth, CreatedValuesCount);
- QualType FieldType = Field->getType();
-
- FieldLocs.insert(
- {Field, &createLocAndMaybeValue(FieldType, Visited, Depth + 1,
- CreatedValuesCount)});
- }
-
- RecordStorageLocation::SyntheticFieldMap SyntheticFieldLocs;
- for (const auto &Entry : DACtx->getSyntheticFields(Type)) {
- SyntheticFieldLocs.insert(
- {Entry.getKey(),
- &createLocAndMaybeValue(Entry.getValue(), Visited, Depth + 1,
- CreatedValuesCount)});
- }
-
- RecordStorageLocation &Loc = DACtx->createRecordStorageLocation(
- Type, std::move(FieldLocs), std::move(SyntheticFieldLocs));
- RecordValue &RecordVal = create<RecordValue>(Loc);
-
- // As we already have a storage location for the `RecordValue`, we can and
- // should associate them in the environment.
- setValue(Loc, RecordVal);
-
- return &RecordVal;
+ return &refreshRecordValue(Loc, *this);
}
return nullptr;
@@ -943,6 +919,50 @@ Environment::createLocAndMaybeValue(QualType Ty,
return Loc;
}
+void Environment::initializeFieldsWithValues(RecordStorageLocation &Loc,
+ llvm::DenseSet<QualType> &Visited,
+ int Depth,
+ int &CreatedValuesCount) {
+ auto initField = [&](QualType FieldType, StorageLocation &FieldLoc) {
+ if (FieldType->isRecordType()) {
+ auto &FieldRecordLoc = cast<RecordStorageLocation>(FieldLoc);
+ setValue(FieldRecordLoc, create<RecordValue>(FieldRecordLoc));
+ initializeFieldsWithValues(FieldRecordLoc, Visited, Depth + 1,
+ CreatedValuesCount);
+ } else {
+ if (!Visited.insert(FieldType.getCanonicalType()).second)
+ return;
+ if (Value *Val = createValueUnlessSelfReferential(
+ FieldType, Visited, Depth + 1, CreatedValuesCount))
+ setValue(FieldLoc, *Val);
+ Visited.erase(FieldType.getCanonicalType());
+ }
+ };
+
+ for (const auto &[Field, FieldLoc] : Loc.children()) {
+ assert(Field != nullptr);
+ QualType FieldType = Field->getType();
+
+ if (FieldType->isReferenceType()) {
+ Loc.setChild(*Field,
+ &createLocAndMaybeValue(FieldType, Visited, Depth + 1,
+ CreatedValuesCount));
+ } else {
+ assert(FieldLoc != nullptr);
+ initField(FieldType, *FieldLoc);
+ }
+ }
+ for (const auto &[FieldName, FieldLoc] : Loc.synthetic_fields()) {
+ assert(FieldLoc != nullptr);
+ QualType FieldType = FieldLoc->getType();
+
+ // Synthetic fields cannot have reference type, so we don't need to deal
+ // with this case.
+ assert(!FieldType->isReferenceType());
+ initField(FieldType, Loc.getSyntheticField(FieldName));
+ }
+}
+
StorageLocation &Environment::createObjectInternal(const ValueDecl *D,
QualType Ty,
const Expr *InitExpr) {
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index bb3aec763c29..f0b15f43b1f4 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -535,7 +535,19 @@ public:
return;
copyRecord(*LocSrc, *LocDst, Env);
- Env.setStorageLocation(*S, *LocDst);
+
+ // If the expr is a glvalue, we can reasonably assume the operator is
+ // returning T& and thus we can assign it `LocDst`.
+ if (S->isGLValue()) {
+ Env.setStorageLocation(*S, *LocDst);
+ } else if (S->getType()->isRecordType()) {
+ // Make sure that we have a `RecordValue` for this expression so that
+ // `Environment::getResultObjectLocation()` is able to return a location
+ // for it.
+ if (Env.getValue(*S) == nullptr)
+ refreshRecordValue(*S, Env);
+ }
+
return;
}
@@ -545,6 +557,10 @@ public:
VisitCallExpr(S);
}
+ void VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *RBO) {
+ propagateValue(*RBO->getSemanticForm(), *RBO, Env);
+ }
+
void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *S) {
if (S->getCastKind() == CK_ConstructorConversion) {
const Expr *SubExpr = S->getSubExpr();
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index a6dcf16b928e..3c2a6fd81b1d 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -12,10 +12,14 @@
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Basic/CharInfo.h"
+#include "clang/Basic/SourceLocation.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
#include <memory>
#include <optional>
#include <queue>
@@ -407,9 +411,6 @@ using DeclUseList = SmallVector<const DeclRefExpr *, 1>;
// Convenience typedef.
using FixItList = SmallVector<FixItHint, 4>;
-
-// Defined below.
-class Strategy;
} // namespace
namespace {
@@ -486,7 +487,7 @@ public:
/// Returns a fixit that would fix the current gadget according to
/// the current strategy. Returns std::nullopt if the fix cannot be produced;
/// returns an empty list if no fixes are necessary.
- virtual std::optional<FixItList> getFixits(const Strategy &) const {
+ virtual std::optional<FixItList> getFixits(const FixitStrategy &) const {
return std::nullopt;
}
@@ -737,7 +738,8 @@ public:
return stmt(PtrInitStmt);
}
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
virtual const Stmt *getBaseStmt() const override {
// FIXME: This needs to be the entire DeclStmt, assuming that this method
@@ -789,7 +791,8 @@ public:
return stmt(isInUnspecifiedUntypedContext(PtrAssignExpr));
}
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
virtual const Stmt *getBaseStmt() const override {
// FIXME: This should be the binary operator, assuming that this method
@@ -892,7 +895,8 @@ public:
return expr(isInUnspecifiedLvalueContext(Target));
}
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
virtual const Stmt *getBaseStmt() const override { return Node; }
@@ -932,7 +936,8 @@ public:
return stmt(isInUnspecifiedPointerContext(target));
}
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
virtual const Stmt *getBaseStmt() const override { return Node; }
@@ -976,7 +981,8 @@ public:
virtual const Stmt *getBaseStmt() const final { return Op; }
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
};
// Represents expressions of the form `&DRE[any]` in the Unspecified Pointer
@@ -1009,7 +1015,8 @@ public:
.bind(UPCAddressofArraySubscriptTag)))));
}
- virtual std::optional<FixItList> getFixits(const Strategy &) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &) const override;
virtual const Stmt *getBaseStmt() const override { return Node; }
@@ -1088,46 +1095,6 @@ public:
};
} // namespace
-namespace {
-// Strategy is a map from variables to the way we plan to emit fixes for
-// these variables. It is figured out gradually by trying different fixes
-// for different variables depending on gadgets in which these variables
-// participate.
-class Strategy {
-public:
- enum class Kind {
- Wontfix, // We don't plan to emit a fixit for this variable.
- Span, // We recommend replacing the variable with std::span.
- Iterator, // We recommend replacing the variable with std::span::iterator.
- Array, // We recommend replacing the variable with std::array.
- Vector // We recommend replacing the variable with std::vector.
- };
-
-private:
- using MapTy = llvm::DenseMap<const VarDecl *, Kind>;
-
- MapTy Map;
-
-public:
- Strategy() = default;
- Strategy(const Strategy &) = delete; // Let's avoid copies.
- Strategy &operator=(const Strategy &) = delete;
- Strategy(Strategy &&) = default;
- Strategy &operator=(Strategy &&) = default;
-
- void set(const VarDecl *VD, Kind K) { Map[VD] = K; }
-
- Kind lookup(const VarDecl *VD) const {
- auto I = Map.find(VD);
- if (I == Map.end())
- return Kind::Wontfix;
-
- return I->second;
- }
-};
-} // namespace
-
-
// Representing a pointer type expression of the form `++Ptr` in an Unspecified
// Pointer Context (UPC):
class UPCPreIncrementGadget : public FixableGadget {
@@ -1159,7 +1126,8 @@ public:
).bind(UPCPreIncrementTag)))));
}
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
virtual const Stmt *getBaseStmt() const override { return Node; }
@@ -1204,7 +1172,8 @@ public:
// clang-format on
}
- virtual std::optional<FixItList> getFixits(const Strategy &S) const override;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &S) const override;
virtual const Stmt *getBaseStmt() const override { return Node; }
@@ -1254,7 +1223,8 @@ public:
// clang-format on
}
- virtual std::optional<FixItList> getFixits(const Strategy &s) const final;
+ virtual std::optional<FixItList>
+ getFixits(const FixitStrategy &s) const final;
// TODO remove this method from FixableGadget interface
virtual const Stmt *getBaseStmt() const final { return nullptr; }
@@ -1464,38 +1434,40 @@ bool clang::internal::anyConflict(const SmallVectorImpl<FixItHint> &FixIts,
}
std::optional<FixItList>
-PointerAssignmentGadget::getFixits(const Strategy &S) const {
+PointerAssignmentGadget::getFixits(const FixitStrategy &S) const {
const auto *LeftVD = cast<VarDecl>(PtrLHS->getDecl());
const auto *RightVD = cast<VarDecl>(PtrRHS->getDecl());
switch (S.lookup(LeftVD)) {
- case Strategy::Kind::Span:
- if (S.lookup(RightVD) == Strategy::Kind::Span)
- return FixItList{};
- return std::nullopt;
- case Strategy::Kind::Wontfix:
- return std::nullopt;
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
- llvm_unreachable("unsupported strategies for FixableGadgets");
+ case FixitStrategy::Kind::Span:
+ if (S.lookup(RightVD) == FixitStrategy::Kind::Span)
+ return FixItList{};
+ return std::nullopt;
+ case FixitStrategy::Kind::Wontfix:
+ return std::nullopt;
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Array:
+ return std::nullopt;
+ case FixitStrategy::Kind::Vector:
+ llvm_unreachable("unsupported strategies for FixableGadgets");
}
return std::nullopt;
}
std::optional<FixItList>
-PointerInitGadget::getFixits(const Strategy &S) const {
+PointerInitGadget::getFixits(const FixitStrategy &S) const {
const auto *LeftVD = PtrInitLHS;
const auto *RightVD = cast<VarDecl>(PtrInitRHS->getDecl());
switch (S.lookup(LeftVD)) {
- case Strategy::Kind::Span:
- if (S.lookup(RightVD) == Strategy::Kind::Span)
- return FixItList{};
- return std::nullopt;
- case Strategy::Kind::Wontfix:
- return std::nullopt;
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
+ case FixitStrategy::Kind::Span:
+ if (S.lookup(RightVD) == FixitStrategy::Kind::Span)
+ return FixItList{};
+ return std::nullopt;
+ case FixitStrategy::Kind::Wontfix:
+ return std::nullopt;
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Array:
+ return std::nullopt;
+ case FixitStrategy::Kind::Vector:
llvm_unreachable("unsupported strategies for FixableGadgets");
}
return std::nullopt;
@@ -1512,12 +1484,12 @@ static bool isNonNegativeIntegerExpr(const Expr *Expr, const VarDecl *VD,
}
std::optional<FixItList>
-ULCArraySubscriptGadget::getFixits(const Strategy &S) const {
+ULCArraySubscriptGadget::getFixits(const FixitStrategy &S) const {
if (const auto *DRE =
dyn_cast<DeclRefExpr>(Node->getBase()->IgnoreImpCasts()))
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
switch (S.lookup(VD)) {
- case Strategy::Kind::Span: {
+ case FixitStrategy::Kind::Span: {
// If the index has a negative constant value, we give up as no valid
// fix-it can be generated:
@@ -1528,10 +1500,11 @@ ULCArraySubscriptGadget::getFixits(const Strategy &S) const {
// no-op is a good fix-it, otherwise
return FixItList{};
}
- case Strategy::Kind::Wontfix:
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
+ case FixitStrategy::Kind::Array:
+ return FixItList{};
+ case FixitStrategy::Kind::Wontfix:
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Vector:
llvm_unreachable("unsupported strategies for FixableGadgets");
}
}
@@ -1542,17 +1515,18 @@ static std::optional<FixItList> // forward declaration
fixUPCAddressofArraySubscriptWithSpan(const UnaryOperator *Node);
std::optional<FixItList>
-UPCAddressofArraySubscriptGadget::getFixits(const Strategy &S) const {
+UPCAddressofArraySubscriptGadget::getFixits(const FixitStrategy &S) const {
auto DREs = getClaimedVarUseSites();
const auto *VD = cast<VarDecl>(DREs.front()->getDecl());
switch (S.lookup(VD)) {
- case Strategy::Kind::Span:
+ case FixitStrategy::Kind::Span:
return fixUPCAddressofArraySubscriptWithSpan(Node);
- case Strategy::Kind::Wontfix:
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
+ case FixitStrategy::Kind::Wontfix:
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Array:
+ return std::nullopt;
+ case FixitStrategy::Kind::Vector:
llvm_unreachable("unsupported strategies for FixableGadgets");
}
return std::nullopt; // something went wrong, no fix-it
@@ -1803,10 +1777,10 @@ getSpanTypeText(StringRef EltTyText,
}
std::optional<FixItList>
-DerefSimplePtrArithFixableGadget::getFixits(const Strategy &s) const {
+DerefSimplePtrArithFixableGadget::getFixits(const FixitStrategy &s) const {
const VarDecl *VD = dyn_cast<VarDecl>(BaseDeclRefExpr->getDecl());
- if (VD && s.lookup(VD) == Strategy::Kind::Span) {
+ if (VD && s.lookup(VD) == FixitStrategy::Kind::Span) {
ASTContext &Ctx = VD->getASTContext();
// std::span can't represent elements before its begin()
if (auto ConstVal = Offset->getIntegerConstantExpr(Ctx))
@@ -1866,10 +1840,10 @@ DerefSimplePtrArithFixableGadget::getFixits(const Strategy &s) const {
}
std::optional<FixItList>
-PointerDereferenceGadget::getFixits(const Strategy &S) const {
+PointerDereferenceGadget::getFixits(const FixitStrategy &S) const {
const VarDecl *VD = cast<VarDecl>(BaseDeclRefExpr->getDecl());
switch (S.lookup(VD)) {
- case Strategy::Kind::Span: {
+ case FixitStrategy::Kind::Span: {
ASTContext &Ctx = VD->getASTContext();
SourceManager &SM = Ctx.getSourceManager();
// Required changes: *(ptr); => (ptr[0]); and *ptr; => ptr[0]
@@ -1884,11 +1858,12 @@ PointerDereferenceGadget::getFixits(const Strategy &S) const {
}
break;
}
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
- llvm_unreachable("Strategy not implemented yet!");
- case Strategy::Kind::Wontfix:
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Array:
+ return std::nullopt;
+ case FixitStrategy::Kind::Vector:
+ llvm_unreachable("FixitStrategy not implemented yet!");
+ case FixitStrategy::Kind::Wontfix:
llvm_unreachable("Invalid strategy!");
}
@@ -1897,28 +1872,28 @@ PointerDereferenceGadget::getFixits(const Strategy &S) const {
// Generates fix-its replacing an expression of the form UPC(DRE) with
// `DRE.data()`
-std::optional<FixItList> UPCStandalonePointerGadget::getFixits(const Strategy &S)
- const {
+std::optional<FixItList>
+UPCStandalonePointerGadget::getFixits(const FixitStrategy &S) const {
const auto VD = cast<VarDecl>(Node->getDecl());
switch (S.lookup(VD)) {
- case Strategy::Kind::Span: {
- ASTContext &Ctx = VD->getASTContext();
- SourceManager &SM = Ctx.getSourceManager();
- // Inserts the .data() after the DRE
- std::optional<SourceLocation> EndOfOperand =
- getPastLoc(Node, SM, Ctx.getLangOpts());
-
- if (EndOfOperand)
- return FixItList{{FixItHint::CreateInsertion(
- *EndOfOperand, ".data()")}};
- // FIXME: Points inside a macro expansion.
- break;
- }
- case Strategy::Kind::Wontfix:
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
- llvm_unreachable("unsupported strategies for FixableGadgets");
+ case FixitStrategy::Kind::Span: {
+ ASTContext &Ctx = VD->getASTContext();
+ SourceManager &SM = Ctx.getSourceManager();
+ // Inserts the .data() after the DRE
+ std::optional<SourceLocation> EndOfOperand =
+ getPastLoc(Node, SM, Ctx.getLangOpts());
+
+ if (EndOfOperand)
+ return FixItList{{FixItHint::CreateInsertion(*EndOfOperand, ".data()")}};
+ // FIXME: Points inside a macro expansion.
+ break;
+ }
+ case FixitStrategy::Kind::Wontfix:
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Array:
+ return std::nullopt;
+ case FixitStrategy::Kind::Vector:
+ llvm_unreachable("unsupported strategies for FixableGadgets");
}
return std::nullopt;
@@ -1962,14 +1937,14 @@ fixUPCAddressofArraySubscriptWithSpan(const UnaryOperator *Node) {
}
std::optional<FixItList>
-UUCAddAssignGadget::getFixits(const Strategy &S) const {
+UUCAddAssignGadget::getFixits(const FixitStrategy &S) const {
DeclUseList DREs = getClaimedVarUseSites();
if (DREs.size() != 1)
return std::nullopt; // In cases of `Ptr += n` where `Ptr` is not a DRE, we
// give up
if (const VarDecl *VD = dyn_cast<VarDecl>(DREs.front()->getDecl())) {
- if (S.lookup(VD) == Strategy::Kind::Span) {
+ if (S.lookup(VD) == FixitStrategy::Kind::Span) {
FixItList Fixes;
const Stmt *AddAssignNode = getBaseStmt();
@@ -2003,14 +1978,15 @@ UUCAddAssignGadget::getFixits(const Strategy &S) const {
return std::nullopt; // Not in the cases that we can handle for now, give up.
}
-std::optional<FixItList> UPCPreIncrementGadget::getFixits(const Strategy &S) const {
+std::optional<FixItList>
+UPCPreIncrementGadget::getFixits(const FixitStrategy &S) const {
DeclUseList DREs = getClaimedVarUseSites();
if (DREs.size() != 1)
return std::nullopt; // In cases of `++Ptr` where `Ptr` is not a DRE, we
// give up
if (const VarDecl *VD = dyn_cast<VarDecl>(DREs.front()->getDecl())) {
- if (S.lookup(VD) == Strategy::Kind::Span) {
+ if (S.lookup(VD) == FixitStrategy::Kind::Span) {
FixItList Fixes;
std::stringstream SS;
const Stmt *PreIncNode = getBaseStmt();
@@ -2033,7 +2009,6 @@ std::optional<FixItList> UPCPreIncrementGadget::getFixits(const Strategy &S) con
return std::nullopt; // Not in the cases that we can handle for now, give up.
}
-
// For a non-null initializer `Init` of `T *` type, this function returns
// `FixItHint`s producing a list initializer `{Init, S}` as a part of a fix-it
// to output stream.
@@ -2261,7 +2236,7 @@ static bool hasConflictingOverload(const FunctionDecl *FD) {
// }
//
static std::optional<FixItList>
-createOverloadsForFixedParams(const Strategy &S, const FunctionDecl *FD,
+createOverloadsForFixedParams(const FixitStrategy &S, const FunctionDecl *FD,
const ASTContext &Ctx,
UnsafeBufferUsageHandler &Handler) {
// FIXME: need to make this conflict checking better:
@@ -2278,9 +2253,9 @@ createOverloadsForFixedParams(const Strategy &S, const FunctionDecl *FD,
for (unsigned i = 0; i < NumParms; i++) {
const ParmVarDecl *PVD = FD->getParamDecl(i);
- if (S.lookup(PVD) == Strategy::Kind::Wontfix)
+ if (S.lookup(PVD) == FixitStrategy::Kind::Wontfix)
continue;
- if (S.lookup(PVD) != Strategy::Kind::Span)
+ if (S.lookup(PVD) != FixitStrategy::Kind::Span)
// Not supported, not suppose to happen:
return std::nullopt;
@@ -2291,7 +2266,8 @@ createOverloadsForFixedParams(const Strategy &S, const FunctionDecl *FD,
if (!PteTyText)
// something wrong in obtaining the text of the pointee type, give up
return std::nullopt;
- // FIXME: whether we should create std::span type depends on the Strategy.
+ // FIXME: whether we should create std::span type depends on the
+ // FixitStrategy.
NewTysTexts[i] = getSpanTypeText(*PteTyText, PteTyQuals);
ParmsMask[i] = true;
AtLeastOneParmToFix = true;
@@ -2495,10 +2471,103 @@ static FixItList fixVariableWithSpan(const VarDecl *VD,
return fixLocalVarDeclWithSpan(VD, Ctx, getUserFillPlaceHolder(), Handler);
}
+static FixItList fixVarDeclWithArray(const VarDecl *D, const ASTContext &Ctx,
+ UnsafeBufferUsageHandler &Handler) {
+ FixItList FixIts{};
+
+ // Note: the code below expects the declaration to not use any type sugar like
+ // typedef.
+ if (auto CAT = dyn_cast<clang::ConstantArrayType>(D->getType())) {
+ const QualType &ArrayEltT = CAT->getElementType();
+ assert(!ArrayEltT.isNull() && "Trying to fix a non-array type variable!");
+ // FIXME: support multi-dimensional arrays
+ if (isa<clang::ArrayType>(ArrayEltT.getCanonicalType()))
+ return {};
+
+ const SourceLocation IdentifierLoc = getVarDeclIdentifierLoc(D);
+
+ // Get the spelling of the element type as written in the source file
+ // (including macros, etc.).
+ auto MaybeElemTypeTxt =
+ getRangeText({D->getBeginLoc(), IdentifierLoc}, Ctx.getSourceManager(),
+ Ctx.getLangOpts());
+ if (!MaybeElemTypeTxt)
+ return {};
+ const llvm::StringRef ElemTypeTxt = MaybeElemTypeTxt->trim();
+
+ // Find the '[' token.
+ std::optional<Token> NextTok = Lexer::findNextToken(
+ IdentifierLoc, Ctx.getSourceManager(), Ctx.getLangOpts());
+ while (NextTok && !NextTok->is(tok::l_square) &&
+ NextTok->getLocation() <= D->getSourceRange().getEnd())
+ NextTok = Lexer::findNextToken(NextTok->getLocation(),
+ Ctx.getSourceManager(), Ctx.getLangOpts());
+ if (!NextTok)
+ return {};
+ const SourceLocation LSqBracketLoc = NextTok->getLocation();
+
+ // Get the spelling of the array size as written in the source file
+ // (including macros, etc.).
+ auto MaybeArraySizeTxt = getRangeText(
+ {LSqBracketLoc.getLocWithOffset(1), D->getTypeSpecEndLoc()},
+ Ctx.getSourceManager(), Ctx.getLangOpts());
+ if (!MaybeArraySizeTxt)
+ return {};
+ const llvm::StringRef ArraySizeTxt = MaybeArraySizeTxt->trim();
+ if (ArraySizeTxt.empty()) {
+ // FIXME: Support array size getting determined from the initializer.
+ // Examples:
+ // int arr1[] = {0, 1, 2};
+ // int arr2{3, 4, 5};
+ // We might be able to preserve the non-specified size with `auto` and
+ // `std::to_array`:
+ // auto arr1 = std::to_array<int>({0, 1, 2});
+ return {};
+ }
+
+ std::optional<StringRef> IdentText =
+ getVarDeclIdentifierText(D, Ctx.getSourceManager(), Ctx.getLangOpts());
+
+ if (!IdentText) {
+ DEBUG_NOTE_DECL_FAIL(D, " : failed to locate the identifier");
+ return {};
+ }
+
+ SmallString<32> Replacement;
+ raw_svector_ostream OS(Replacement);
+ OS << "std::array<" << ElemTypeTxt << ", " << ArraySizeTxt << "> "
+ << IdentText->str();
+
+ FixIts.push_back(FixItHint::CreateReplacement(
+ SourceRange{D->getBeginLoc(), D->getTypeSpecEndLoc()}, OS.str()));
+ }
+
+ return FixIts;
+}
+
+static FixItList fixVariableWithArray(const VarDecl *VD,
+ const DeclUseTracker &Tracker,
+ const ASTContext &Ctx,
+ UnsafeBufferUsageHandler &Handler) {
+ const DeclStmt *DS = Tracker.lookupDecl(VD);
+ assert(DS && "Fixing non-local variables not implemented yet!");
+ if (!DS->isSingleDecl()) {
+ // FIXME: to support handling multiple `VarDecl`s in a single `DeclStmt`
+ return {};
+ }
+ // Currently DS is an unused variable but we'll need it when
+ // non-single decls are implemented, where the pointee type name
+ // and the '*' are spread around the place.
+ (void)DS;
+
+ // FIXME: handle cases where DS has multiple declarations
+ return fixVarDeclWithArray(VD, Ctx, Handler);
+}
+
// TODO: we should be consistent to use `std::nullopt` to represent no-fix due
// to any unexpected problem.
static FixItList
-fixVariable(const VarDecl *VD, Strategy::Kind K,
+fixVariable(const VarDecl *VD, FixitStrategy::Kind K,
/* The function decl under analysis */ const Decl *D,
const DeclUseTracker &Tracker, ASTContext &Ctx,
UnsafeBufferUsageHandler &Handler) {
@@ -2529,7 +2598,7 @@ fixVariable(const VarDecl *VD, Strategy::Kind K,
}
switch (K) {
- case Strategy::Kind::Span: {
+ case FixitStrategy::Kind::Span: {
if (VD->getType()->isPointerType()) {
if (const auto *PVD = dyn_cast<ParmVarDecl>(VD))
return fixParamWithSpan(PVD, Ctx, Handler);
@@ -2540,11 +2609,18 @@ fixVariable(const VarDecl *VD, Strategy::Kind K,
DEBUG_NOTE_DECL_FAIL(VD, " : not a pointer");
return {};
}
- case Strategy::Kind::Iterator:
- case Strategy::Kind::Array:
- case Strategy::Kind::Vector:
- llvm_unreachable("Strategy not implemented yet!");
- case Strategy::Kind::Wontfix:
+ case FixitStrategy::Kind::Array: {
+ if (VD->isLocalVarDecl() &&
+ isa<clang::ConstantArrayType>(VD->getType().getCanonicalType()))
+ return fixVariableWithArray(VD, Tracker, Ctx, Handler);
+
+ DEBUG_NOTE_DECL_FAIL(VD, " : not a local const-size array");
+ return {};
+ }
+ case FixitStrategy::Kind::Iterator:
+ case FixitStrategy::Kind::Vector:
+ llvm_unreachable("FixitStrategy not implemented yet!");
+ case FixitStrategy::Kind::Wontfix:
llvm_unreachable("Invalid strategy!");
}
llvm_unreachable("Unknown strategy!");
@@ -2605,7 +2681,8 @@ static void eraseVarsForUnfixableGroupMates(
static FixItList createFunctionOverloadsForParms(
std::map<const VarDecl *, FixItList> &FixItsForVariable /* mutable */,
const VariableGroupsManager &VarGrpMgr, const FunctionDecl *FD,
- const Strategy &S, ASTContext &Ctx, UnsafeBufferUsageHandler &Handler) {
+ const FixitStrategy &S, ASTContext &Ctx,
+ UnsafeBufferUsageHandler &Handler) {
FixItList FixItsSharedByParms{};
std::optional<FixItList> OverloadFixes =
@@ -2625,8 +2702,8 @@ static FixItList createFunctionOverloadsForParms(
// Constructs self-contained fix-its for each variable in `FixablesForAllVars`.
static std::map<const VarDecl *, FixItList>
-getFixIts(FixableGadgetSets &FixablesForAllVars, const Strategy &S,
- ASTContext &Ctx,
+getFixIts(FixableGadgetSets &FixablesForAllVars, const FixitStrategy &S,
+ ASTContext &Ctx,
/* The function decl under analysis */ const Decl *D,
const DeclUseTracker &Tracker, UnsafeBufferUsageHandler &Handler,
const VariableGroupsManager &VarGrpMgr) {
@@ -2724,11 +2801,14 @@ getFixIts(FixableGadgetSets &FixablesForAllVars, const Strategy &S,
}
template <typename VarDeclIterTy>
-static Strategy
+static FixitStrategy
getNaiveStrategy(llvm::iterator_range<VarDeclIterTy> UnsafeVars) {
- Strategy S;
+ FixitStrategy S;
for (const VarDecl *VD : UnsafeVars) {
- S.set(VD, Strategy::Kind::Span);
+ if (isa<ConstantArrayType>(VD->getType().getCanonicalType()))
+ S.set(VD, FixitStrategy::Kind::Array);
+ else
+ S.set(VD, FixitStrategy::Kind::Span);
}
return S;
}
@@ -3034,7 +3114,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
// We assign strategies to variables that are 1) in the graph and 2) can be
// fixed. Other variables have the default "Won't fix" strategy.
- Strategy NaiveStrategy = getNaiveStrategy(llvm::make_filter_range(
+ FixitStrategy NaiveStrategy = getNaiveStrategy(llvm::make_filter_range(
VisitedVars, [&FixablesForAllVars](const VarDecl *V) {
// If a warned variable has no "Fixable", it is considered unfixable:
return FixablesForAllVars.byVar.count(V);
@@ -3057,9 +3137,9 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
auto FixItsIt = FixItsForVariableGroup.find(VD);
Handler.handleUnsafeVariableGroup(VD, VarGrpMgr,
FixItsIt != FixItsForVariableGroup.end()
- ? std::move(FixItsIt->second)
- : FixItList{},
- D);
+ ? std::move(FixItsIt->second)
+ : FixItList{},
+ D, NaiveStrategy);
for (const auto &G : WarningGadgets) {
Handler.handleUnsafeOperation(G->getBaseStmt(), /*IsRelatedToDecl=*/true,
D->getASTContext());
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index 6c7bd50eefb7..b353a6627f29 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -100,7 +100,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = {
};
// Diagnostic classes.
-enum {
+enum DiagnosticClass {
CLASS_NOTE = 0x01,
CLASS_REMARK = 0x02,
CLASS_WARNING = 0x03,
@@ -110,15 +110,22 @@ enum {
struct StaticDiagInfoRec {
uint16_t DiagID;
+ LLVM_PREFERRED_TYPE(diag::Severity)
uint8_t DefaultSeverity : 3;
+ LLVM_PREFERRED_TYPE(DiagnosticClass)
uint8_t Class : 3;
+ LLVM_PREFERRED_TYPE(DiagnosticIDs::SFINAEResponse)
uint8_t SFINAE : 2;
uint8_t Category : 6;
+ LLVM_PREFERRED_TYPE(bool)
uint8_t WarnNoWerror : 1;
+ LLVM_PREFERRED_TYPE(bool)
uint8_t WarnShowInSystemHeader : 1;
+ LLVM_PREFERRED_TYPE(bool)
uint8_t WarnShowInSystemMacro : 1;
uint16_t OptionGroupIndex : 15;
+ LLVM_PREFERRED_TYPE(bool)
uint16_t Deferrable : 1;
uint16_t DescriptionLen;
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index 925217431d4d..1c5043a618ff 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -376,7 +376,7 @@ Module *Module::findOrInferSubmodule(StringRef Name) {
Module *Module::getGlobalModuleFragment() const {
assert(isNamedModuleUnit() && "We should only query the global module "
- "fragment from the C++ 20 Named modules");
+ "fragment from the C++20 Named modules");
for (auto *SubModule : SubModules)
if (SubModule->isExplicitGlobalModule())
@@ -387,7 +387,7 @@ Module *Module::getGlobalModuleFragment() const {
Module *Module::getPrivateModuleFragment() const {
assert(isNamedModuleUnit() && "We should only query the private module "
- "fragment from the C++ 20 Named modules");
+ "fragment from the C++20 Named modules");
for (auto *SubModule : SubModules)
if (SubModule->isPrivateModule())
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 68032961451d..dd0218e6ebed 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -367,8 +367,20 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
// ACLE predefines. Many can only have one possible value on v8 AArch64.
Builder.defineMacro("__ARM_ACLE", "200");
- Builder.defineMacro("__ARM_ARCH",
- std::to_string(ArchInfo->Version.getMajor()));
+
+ // __ARM_ARCH is defined as an integer value indicating the current ARM ISA.
+ // For ISAs up to and including v8, __ARM_ARCH is equal to the major version
+ // number. For ISAs from v8.1 onwards, __ARM_ARCH is scaled up to include the
+ // minor version number, e.g. for ARM architecture ARMvX.Y:
+ // __ARM_ARCH = X * 100 + Y.
+ if (ArchInfo->Version.getMajor() == 8 && ArchInfo->Version.getMinor() == 0)
+ Builder.defineMacro("__ARM_ARCH",
+ std::to_string(ArchInfo->Version.getMajor()));
+ else
+ Builder.defineMacro("__ARM_ARCH",
+ std::to_string(ArchInfo->Version.getMajor() * 100 +
+ ArchInfo->Version.getMinor().value()));
+
Builder.defineMacro("__ARM_ARCH_PROFILE",
std::string("'") + (char)ArchInfo->Profile + "'");
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 141501e8a4d9..10cba6b7eac5 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -17,6 +17,7 @@
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/MacroBuilder.h"
#include "clang/Basic/TargetBuiltins.h"
+#include "llvm/ADT/SmallString.h"
using namespace clang;
using namespace clang::targets;
@@ -279,13 +280,25 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts,
if (GPUKind == llvm::AMDGPU::GK_NONE && !IsHIPHost)
return;
- StringRef CanonName = isAMDGCN(getTriple()) ? getArchNameAMDGCN(GPUKind)
- : getArchNameR600(GPUKind);
+ llvm::SmallString<16> CanonName =
+ (isAMDGCN(getTriple()) ? getArchNameAMDGCN(GPUKind)
+ : getArchNameR600(GPUKind));
+
+ // Sanitize the name of generic targets.
+ // e.g. gfx10.1-generic -> gfx10_1_generic
+ if (GPUKind >= llvm::AMDGPU::GK_AMDGCN_GENERIC_FIRST &&
+ GPUKind <= llvm::AMDGPU::GK_AMDGCN_GENERIC_LAST) {
+ std::replace(CanonName.begin(), CanonName.end(), '.', '_');
+ std::replace(CanonName.begin(), CanonName.end(), '-', '_');
+ }
+
Builder.defineMacro(Twine("__") + Twine(CanonName) + Twine("__"));
// Emit macros for gfx family e.g. gfx906 -> __GFX9__, gfx1030 -> __GFX10___
if (isAMDGCN(getTriple()) && !IsHIPHost) {
- assert(CanonName.starts_with("gfx") && "Invalid amdgcn canonical name");
- Builder.defineMacro(Twine("__") + Twine(CanonName.drop_back(2).upper()) +
+ assert(StringRef(CanonName).starts_with("gfx") &&
+ "Invalid amdgcn canonical name");
+ StringRef CanonFamilyName = getArchFamilyNameAMDGCN(GPUKind);
+ Builder.defineMacro(Twine("__") + Twine(CanonFamilyName.upper()) +
Twine("__"));
Builder.defineMacro("__amdgcn_processor__",
Twine("\"") + Twine(CanonName) + Twine("\""));
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index 55b71557452f..cd7fb95259d9 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -130,6 +130,7 @@ void ARMTargetInfo::setArchInfo(llvm::ARM::ArchKind Kind) {
SubArch = llvm::ARM::getSubArch(ArchKind);
ArchProfile = llvm::ARM::parseArchProfile(SubArch);
ArchVersion = llvm::ARM::parseArchVersion(SubArch);
+ ArchMinorVersion = llvm::ARM::parseArchMinorVersion(SubArch);
// cache CPU related strings
CPUAttr = getCPUAttr();
@@ -736,9 +737,16 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
if (!CPUAttr.empty())
Builder.defineMacro("__ARM_ARCH_" + CPUAttr + "__");
- // ACLE 6.4.1 ARM/Thumb instruction set architecture
- // __ARM_ARCH is defined as an integer value indicating the current ARM ISA
- Builder.defineMacro("__ARM_ARCH", Twine(ArchVersion));
+ // __ARM_ARCH is defined as an integer value indicating the current ARM ISA.
+ // For ISAs up to and including v8, __ARM_ARCH is equal to the major version
+ // number. For ISAs from v8.1 onwards, __ARM_ARCH is scaled up to include the
+ // minor version number, e.g. for ARM architecture ARMvX.Y:
+ // __ARM_ARCH = X * 100 + Y.
+ if (ArchVersion >= 9 || ArchMinorVersion != 0)
+ Builder.defineMacro("__ARM_ARCH",
+ Twine(ArchVersion * 100 + ArchMinorVersion));
+ else
+ Builder.defineMacro("__ARM_ARCH", Twine(ArchVersion));
if (ArchVersion >= 8) {
// ACLE 6.5.7 Crypto Extension
diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h
index 9802eb01abf3..df06e4d12063 100644
--- a/clang/lib/Basic/Targets/ARM.h
+++ b/clang/lib/Basic/Targets/ARM.h
@@ -60,27 +60,45 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo {
llvm::ARM::ArchKind ArchKind = llvm::ARM::ArchKind::ARMV4T;
llvm::ARM::ProfileKind ArchProfile;
unsigned ArchVersion;
+ unsigned ArchMinorVersion;
+ LLVM_PREFERRED_TYPE(FPUMode)
unsigned FPU : 5;
+ LLVM_PREFERRED_TYPE(MVEMode)
unsigned MVE : 2;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsAAPCS : 1;
+ LLVM_PREFERRED_TYPE(HWDivMode)
unsigned HWDiv : 2;
// Initialized via features.
+ LLVM_PREFERRED_TYPE(bool)
unsigned SoftFloat : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SoftFloatABI : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned CRC : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned Crypto : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SHA2 : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned AES : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned DSP : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned Unaligned : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned DotProd : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasMatMul : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned FPRegsDisabled : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasPAC : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasBTI : 1;
enum {
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 837a6e799e3a..a6d4af2b8811 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -234,7 +234,7 @@ static constexpr Builtin::Info BuiltinInfo[] = {
{#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
{#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#include "clang/Basic/BuiltinsRISCV.def"
+#include "clang/Basic/BuiltinsRISCV.inc"
};
ArrayRef<Builtin::Info> RISCVTargetInfo::getTargetBuiltins() const {
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index e6235f394a6a..e25991e3dfe8 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -310,6 +310,7 @@ public:
assert(Triple.getEnvironment() >= llvm::Triple::Pixel &&
Triple.getEnvironment() <= llvm::Triple::Amplification &&
"Logical SPIR-V environment must be a valid shader stage.");
+ PointerWidth = PointerAlign = 64;
// SPIR-V IDs are represented with a single 32-bit word.
SizeType = TargetInfo::UnsignedInt;
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index e2511aa1b5c9..a31082524023 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -748,7 +748,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
CodeGenOpts.InstrProfileOutput.empty() ? getDefaultProfileGenName()
: CodeGenOpts.InstrProfileOutput,
"", "", CodeGenOpts.MemoryProfileUsePath, nullptr, PGOOptions::IRInstr,
- PGOOptions::NoCSAction, CodeGenOpts.DebugInfoForProfiling,
+ PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default,
+ CodeGenOpts.DebugInfoForProfiling,
/*PseudoProbeForProfiling=*/false, CodeGenOpts.AtomicProfileUpdate);
else if (CodeGenOpts.hasProfileIRUse()) {
// -fprofile-use.
@@ -757,28 +758,32 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
PGOOpt = PGOOptions(
CodeGenOpts.ProfileInstrumentUsePath, "",
CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS,
- PGOOptions::IRUse, CSAction, CodeGenOpts.DebugInfoForProfiling);
+ PGOOptions::IRUse, CSAction, PGOOptions::ColdFuncOpt::Default,
+ CodeGenOpts.DebugInfoForProfiling);
} else if (!CodeGenOpts.SampleProfileFile.empty())
// -fprofile-sample-use
PGOOpt = PGOOptions(
CodeGenOpts.SampleProfileFile, "", CodeGenOpts.ProfileRemappingFile,
CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::SampleUse,
- PGOOptions::NoCSAction, CodeGenOpts.DebugInfoForProfiling,
- CodeGenOpts.PseudoProbeForProfiling);
+ PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default,
+ CodeGenOpts.DebugInfoForProfiling, CodeGenOpts.PseudoProbeForProfiling);
else if (!CodeGenOpts.MemoryProfileUsePath.empty())
// -fmemory-profile-use (without any of the above options)
PGOOpt = PGOOptions("", "", "", CodeGenOpts.MemoryProfileUsePath, VFS,
PGOOptions::NoAction, PGOOptions::NoCSAction,
+ PGOOptions::ColdFuncOpt::Default,
CodeGenOpts.DebugInfoForProfiling);
else if (CodeGenOpts.PseudoProbeForProfiling)
// -fpseudo-probe-for-profiling
PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr,
PGOOptions::NoAction, PGOOptions::NoCSAction,
+ PGOOptions::ColdFuncOpt::Default,
CodeGenOpts.DebugInfoForProfiling, true);
else if (CodeGenOpts.DebugInfoForProfiling)
// -fdebug-info-for-profiling
PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr,
- PGOOptions::NoAction, PGOOptions::NoCSAction, true);
+ PGOOptions::NoAction, PGOOptions::NoCSAction,
+ PGOOptions::ColdFuncOpt::Default, true);
// Check to see if we want to generate a CS profile.
if (CodeGenOpts.hasProfileCSIRInstr()) {
@@ -801,7 +806,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
? getDefaultProfileGenName()
: CodeGenOpts.InstrProfileOutput,
"", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction,
- PGOOptions::CSIRInstr, CodeGenOpts.DebugInfoForProfiling);
+ PGOOptions::CSIRInstr, PGOOptions::ColdFuncOpt::Default,
+ CodeGenOpts.DebugInfoForProfiling);
}
if (TM)
TM->setPGOOption(PGOOpt);
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index 52e6ddb7d6af..a8d846b4f6a5 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -811,29 +811,6 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest,
Builder.SetInsertPoint(ContBB);
}
-static void
-AddDirectArgument(CodeGenFunction &CGF, CallArgList &Args,
- bool UseOptimizedLibcall, llvm::Value *Val, QualType ValTy,
- SourceLocation Loc, CharUnits SizeInChars) {
- if (UseOptimizedLibcall) {
- // Load value and pass it to the function directly.
- CharUnits Align = CGF.getContext().getTypeAlignInChars(ValTy);
- int64_t SizeInBits = CGF.getContext().toBits(SizeInChars);
- ValTy =
- CGF.getContext().getIntTypeForBitwidth(SizeInBits, /*Signed=*/false);
- llvm::Type *ITy = llvm::IntegerType::get(CGF.getLLVMContext(), SizeInBits);
- Address Ptr = Address(Val, ITy, Align);
- Val = CGF.EmitLoadOfScalar(Ptr, false,
- CGF.getContext().getPointerType(ValTy),
- Loc);
- // Coerce the value into an appropriately sized integer type.
- Args.add(RValue::get(Val), ValTy);
- } else {
- // Non-optimized functions always take a reference.
- Args.add(RValue::get(Val), CGF.getContext().VoidPtrTy);
- }
-}
-
RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
QualType AtomicTy = E->getPtr()->getType()->getPointeeType();
QualType MemTy = AtomicTy;
@@ -857,22 +834,16 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
uint64_t Size = TInfo.Width.getQuantity();
unsigned MaxInlineWidthInBits = getTarget().getMaxAtomicInlineWidth();
- bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits;
- bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0;
- bool UseLibcall = Misaligned | Oversized;
- bool ShouldCastToIntPtrTy = true;
-
CharUnits MaxInlineWidth =
getContext().toCharUnitsFromBits(MaxInlineWidthInBits);
-
DiagnosticsEngine &Diags = CGM.getDiags();
-
+ bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0;
+ bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits;
if (Misaligned) {
Diags.Report(E->getBeginLoc(), diag::warn_atomic_op_misaligned)
<< (int)TInfo.Width.getQuantity()
<< (int)Ptr.getAlignment().getQuantity();
}
-
if (Oversized) {
Diags.Report(E->getBeginLoc(), diag::warn_atomic_op_oversized)
<< (int)TInfo.Width.getQuantity() << (int)MaxInlineWidth.getQuantity();
@@ -881,6 +852,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
llvm::Value *Order = EmitScalarExpr(E->getOrder());
llvm::Value *Scope =
E->getScopeModel() ? EmitScalarExpr(E->getScope()) : nullptr;
+ bool ShouldCastToIntPtrTy = true;
switch (E->getOp()) {
case AtomicExpr::AO__c11_atomic_init:
@@ -1047,122 +1019,25 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
Dest = Atomics.castToAtomicIntPointer(Dest);
}
- // Use a library call. See: http://gcc.gnu.org/wiki/Atomic/GCCMM/LIbrary .
+ bool PowerOf2Size = (Size & (Size - 1)) == 0;
+ bool UseLibcall = !PowerOf2Size || (Size > 16);
+
+ // For atomics larger than 16 bytes, emit a libcall from the frontend. This
+ // avoids the overhead of dealing with excessively-large value types in IR.
+ // Non-power-of-2 values also lower to libcall here, as they are not currently
+ // permitted in IR instructions (although that constraint could be relaxed in
+ // the future). For other cases where a libcall is required on a given
+ // platform, we let the backend handle it (this includes handling for all of
+ // the size-optimized libcall variants, which are only valid up to 16 bytes.)
+ //
+ // See: https://llvm.org/docs/Atomics.html#libcalls-atomic
if (UseLibcall) {
- bool UseOptimizedLibcall = false;
- switch (E->getOp()) {
- case AtomicExpr::AO__c11_atomic_init:
- case AtomicExpr::AO__opencl_atomic_init:
- llvm_unreachable("Already handled above with EmitAtomicInit!");
-
- case AtomicExpr::AO__atomic_fetch_add:
- case AtomicExpr::AO__atomic_fetch_and:
- case AtomicExpr::AO__atomic_fetch_max:
- case AtomicExpr::AO__atomic_fetch_min:
- case AtomicExpr::AO__atomic_fetch_nand:
- case AtomicExpr::AO__atomic_fetch_or:
- case AtomicExpr::AO__atomic_fetch_sub:
- case AtomicExpr::AO__atomic_fetch_xor:
- case AtomicExpr::AO__atomic_add_fetch:
- case AtomicExpr::AO__atomic_and_fetch:
- case AtomicExpr::AO__atomic_max_fetch:
- case AtomicExpr::AO__atomic_min_fetch:
- case AtomicExpr::AO__atomic_nand_fetch:
- case AtomicExpr::AO__atomic_or_fetch:
- case AtomicExpr::AO__atomic_sub_fetch:
- case AtomicExpr::AO__atomic_xor_fetch:
- case AtomicExpr::AO__c11_atomic_fetch_add:
- case AtomicExpr::AO__c11_atomic_fetch_and:
- case AtomicExpr::AO__c11_atomic_fetch_max:
- case AtomicExpr::AO__c11_atomic_fetch_min:
- case AtomicExpr::AO__c11_atomic_fetch_nand:
- case AtomicExpr::AO__c11_atomic_fetch_or:
- case AtomicExpr::AO__c11_atomic_fetch_sub:
- case AtomicExpr::AO__c11_atomic_fetch_xor:
- case AtomicExpr::AO__hip_atomic_fetch_add:
- case AtomicExpr::AO__hip_atomic_fetch_and:
- case AtomicExpr::AO__hip_atomic_fetch_max:
- case AtomicExpr::AO__hip_atomic_fetch_min:
- case AtomicExpr::AO__hip_atomic_fetch_or:
- case AtomicExpr::AO__hip_atomic_fetch_sub:
- case AtomicExpr::AO__hip_atomic_fetch_xor:
- case AtomicExpr::AO__opencl_atomic_fetch_add:
- case AtomicExpr::AO__opencl_atomic_fetch_and:
- case AtomicExpr::AO__opencl_atomic_fetch_max:
- case AtomicExpr::AO__opencl_atomic_fetch_min:
- case AtomicExpr::AO__opencl_atomic_fetch_or:
- case AtomicExpr::AO__opencl_atomic_fetch_sub:
- case AtomicExpr::AO__opencl_atomic_fetch_xor:
- case AtomicExpr::AO__scoped_atomic_fetch_add:
- case AtomicExpr::AO__scoped_atomic_fetch_and:
- case AtomicExpr::AO__scoped_atomic_fetch_max:
- case AtomicExpr::AO__scoped_atomic_fetch_min:
- case AtomicExpr::AO__scoped_atomic_fetch_nand:
- case AtomicExpr::AO__scoped_atomic_fetch_or:
- case AtomicExpr::AO__scoped_atomic_fetch_sub:
- case AtomicExpr::AO__scoped_atomic_fetch_xor:
- case AtomicExpr::AO__scoped_atomic_add_fetch:
- case AtomicExpr::AO__scoped_atomic_and_fetch:
- case AtomicExpr::AO__scoped_atomic_max_fetch:
- case AtomicExpr::AO__scoped_atomic_min_fetch:
- case AtomicExpr::AO__scoped_atomic_nand_fetch:
- case AtomicExpr::AO__scoped_atomic_or_fetch:
- case AtomicExpr::AO__scoped_atomic_sub_fetch:
- case AtomicExpr::AO__scoped_atomic_xor_fetch:
- // For these, only library calls for certain sizes exist.
- UseOptimizedLibcall = true;
- break;
-
- case AtomicExpr::AO__atomic_load:
- case AtomicExpr::AO__atomic_store:
- case AtomicExpr::AO__atomic_exchange:
- case AtomicExpr::AO__atomic_compare_exchange:
- case AtomicExpr::AO__scoped_atomic_load:
- case AtomicExpr::AO__scoped_atomic_store:
- case AtomicExpr::AO__scoped_atomic_exchange:
- case AtomicExpr::AO__scoped_atomic_compare_exchange:
- // Use the generic version if we don't know that the operand will be
- // suitably aligned for the optimized version.
- if (Misaligned)
- break;
- [[fallthrough]];
- case AtomicExpr::AO__atomic_load_n:
- case AtomicExpr::AO__atomic_store_n:
- case AtomicExpr::AO__atomic_exchange_n:
- case AtomicExpr::AO__atomic_compare_exchange_n:
- case AtomicExpr::AO__c11_atomic_load:
- case AtomicExpr::AO__c11_atomic_store:
- case AtomicExpr::AO__c11_atomic_exchange:
- case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
- case AtomicExpr::AO__c11_atomic_compare_exchange_strong:
- case AtomicExpr::AO__hip_atomic_load:
- case AtomicExpr::AO__hip_atomic_store:
- case AtomicExpr::AO__hip_atomic_exchange:
- case AtomicExpr::AO__hip_atomic_compare_exchange_weak:
- case AtomicExpr::AO__hip_atomic_compare_exchange_strong:
- case AtomicExpr::AO__opencl_atomic_load:
- case AtomicExpr::AO__opencl_atomic_store:
- case AtomicExpr::AO__opencl_atomic_exchange:
- case AtomicExpr::AO__opencl_atomic_compare_exchange_weak:
- case AtomicExpr::AO__opencl_atomic_compare_exchange_strong:
- case AtomicExpr::AO__scoped_atomic_load_n:
- case AtomicExpr::AO__scoped_atomic_store_n:
- case AtomicExpr::AO__scoped_atomic_exchange_n:
- case AtomicExpr::AO__scoped_atomic_compare_exchange_n:
- // Only use optimized library calls for sizes for which they exist.
- // FIXME: Size == 16 optimized library functions exist too.
- if (Size == 1 || Size == 2 || Size == 4 || Size == 8)
- UseOptimizedLibcall = true;
- break;
- }
-
CallArgList Args;
- if (!UseOptimizedLibcall) {
- // For non-optimized library calls, the size is the first parameter
- Args.add(RValue::get(llvm::ConstantInt::get(SizeTy, Size)),
- getContext().getSizeType());
- }
- // Atomic address is the first or second parameter
+ // For non-optimized library calls, the size is the first parameter.
+ Args.add(RValue::get(llvm::ConstantInt::get(SizeTy, Size)),
+ getContext().getSizeType());
+
+ // The atomic address is the second parameter.
// The OpenCL atomic library functions only accept pointer arguments to
// generic address space.
auto CastToGenericAddrSpace = [&](llvm::Value *V, QualType PT) {
@@ -1177,18 +1052,14 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
return getTargetHooks().performAddrSpaceCast(
*this, V, AS, LangAS::opencl_generic, DestType, false);
};
-
Args.add(RValue::get(CastToGenericAddrSpace(Ptr.getPointer(),
E->getPtr()->getType())),
getContext().VoidPtrTy);
+ // The next 1-3 parameters are op-dependent.
std::string LibCallName;
- QualType LoweredMemTy =
- MemTy->isPointerType() ? getContext().getIntPtrType() : MemTy;
QualType RetTy;
bool HaveRetTy = false;
- llvm::Instruction::BinaryOps PostOp = (llvm::Instruction::BinaryOps)0;
- bool PostOpMinMax = false;
switch (E->getOp()) {
case AtomicExpr::AO__c11_atomic_init:
case AtomicExpr::AO__opencl_atomic_init:
@@ -1199,8 +1070,6 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
// and exchange.
// bool __atomic_compare_exchange(size_t size, void *mem, void *expected,
// void *desired, int success, int failure)
- // bool __atomic_compare_exchange_N(T *mem, T *expected, T desired,
- // int success, int failure)
case AtomicExpr::AO__atomic_compare_exchange:
case AtomicExpr::AO__atomic_compare_exchange_n:
case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
@@ -1217,14 +1086,14 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
Args.add(RValue::get(CastToGenericAddrSpace(Val1.getPointer(),
E->getVal1()->getType())),
getContext().VoidPtrTy);
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val2.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
+ Args.add(RValue::get(CastToGenericAddrSpace(Val2.getPointer(),
+ E->getVal2()->getType())),
+ getContext().VoidPtrTy);
Args.add(RValue::get(Order), getContext().IntTy);
Order = OrderFail;
break;
// void __atomic_exchange(size_t size, void *mem, void *val, void *return,
// int order)
- // T __atomic_exchange_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_exchange:
case AtomicExpr::AO__atomic_exchange_n:
case AtomicExpr::AO__c11_atomic_exchange:
@@ -1233,11 +1102,11 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
case AtomicExpr::AO__scoped_atomic_exchange:
case AtomicExpr::AO__scoped_atomic_exchange_n:
LibCallName = "__atomic_exchange";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
+ Args.add(RValue::get(CastToGenericAddrSpace(Val1.getPointer(),
+ E->getVal1()->getType())),
+ getContext().VoidPtrTy);
break;
// void __atomic_store(size_t size, void *mem, void *val, int order)
- // void __atomic_store_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_store:
case AtomicExpr::AO__atomic_store_n:
case AtomicExpr::AO__c11_atomic_store:
@@ -1248,11 +1117,11 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
LibCallName = "__atomic_store";
RetTy = getContext().VoidTy;
HaveRetTy = true;
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
+ Args.add(RValue::get(CastToGenericAddrSpace(Val1.getPointer(),
+ E->getVal1()->getType())),
+ getContext().VoidPtrTy);
break;
// void __atomic_load(size_t size, void *mem, void *return, int order)
- // T __atomic_load_N(T *mem, int order)
case AtomicExpr::AO__atomic_load:
case AtomicExpr::AO__atomic_load_n:
case AtomicExpr::AO__c11_atomic_load:
@@ -1262,183 +1131,85 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
case AtomicExpr::AO__scoped_atomic_load_n:
LibCallName = "__atomic_load";
break;
- // T __atomic_add_fetch_N(T *mem, T val, int order)
- // T __atomic_fetch_add_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_add_fetch:
case AtomicExpr::AO__scoped_atomic_add_fetch:
- PostOp = llvm::Instruction::Add;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_add:
case AtomicExpr::AO__c11_atomic_fetch_add:
case AtomicExpr::AO__hip_atomic_fetch_add:
case AtomicExpr::AO__opencl_atomic_fetch_add:
case AtomicExpr::AO__scoped_atomic_fetch_add:
- LibCallName = "__atomic_fetch_add";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- LoweredMemTy, E->getExprLoc(), TInfo.Width);
- break;
- // T __atomic_and_fetch_N(T *mem, T val, int order)
- // T __atomic_fetch_and_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_and_fetch:
case AtomicExpr::AO__scoped_atomic_and_fetch:
- PostOp = llvm::Instruction::And;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_and:
case AtomicExpr::AO__c11_atomic_fetch_and:
case AtomicExpr::AO__hip_atomic_fetch_and:
case AtomicExpr::AO__opencl_atomic_fetch_and:
case AtomicExpr::AO__scoped_atomic_fetch_and:
- LibCallName = "__atomic_fetch_and";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
- break;
- // T __atomic_or_fetch_N(T *mem, T val, int order)
- // T __atomic_fetch_or_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_or_fetch:
case AtomicExpr::AO__scoped_atomic_or_fetch:
- PostOp = llvm::Instruction::Or;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_or:
case AtomicExpr::AO__c11_atomic_fetch_or:
case AtomicExpr::AO__hip_atomic_fetch_or:
case AtomicExpr::AO__opencl_atomic_fetch_or:
case AtomicExpr::AO__scoped_atomic_fetch_or:
- LibCallName = "__atomic_fetch_or";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
- break;
- // T __atomic_sub_fetch_N(T *mem, T val, int order)
- // T __atomic_fetch_sub_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_sub_fetch:
case AtomicExpr::AO__scoped_atomic_sub_fetch:
- PostOp = llvm::Instruction::Sub;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_sub:
case AtomicExpr::AO__c11_atomic_fetch_sub:
case AtomicExpr::AO__hip_atomic_fetch_sub:
case AtomicExpr::AO__opencl_atomic_fetch_sub:
case AtomicExpr::AO__scoped_atomic_fetch_sub:
- LibCallName = "__atomic_fetch_sub";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- LoweredMemTy, E->getExprLoc(), TInfo.Width);
- break;
- // T __atomic_xor_fetch_N(T *mem, T val, int order)
- // T __atomic_fetch_xor_N(T *mem, T val, int order)
case AtomicExpr::AO__atomic_xor_fetch:
case AtomicExpr::AO__scoped_atomic_xor_fetch:
- PostOp = llvm::Instruction::Xor;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_xor:
case AtomicExpr::AO__c11_atomic_fetch_xor:
case AtomicExpr::AO__hip_atomic_fetch_xor:
case AtomicExpr::AO__opencl_atomic_fetch_xor:
case AtomicExpr::AO__scoped_atomic_fetch_xor:
- LibCallName = "__atomic_fetch_xor";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
- break;
+ case AtomicExpr::AO__atomic_nand_fetch:
+ case AtomicExpr::AO__atomic_fetch_nand:
+ case AtomicExpr::AO__c11_atomic_fetch_nand:
+ case AtomicExpr::AO__scoped_atomic_fetch_nand:
+ case AtomicExpr::AO__scoped_atomic_nand_fetch:
case AtomicExpr::AO__atomic_min_fetch:
- case AtomicExpr::AO__scoped_atomic_min_fetch:
- PostOpMinMax = true;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_min:
case AtomicExpr::AO__c11_atomic_fetch_min:
- case AtomicExpr::AO__scoped_atomic_fetch_min:
case AtomicExpr::AO__hip_atomic_fetch_min:
case AtomicExpr::AO__opencl_atomic_fetch_min:
- LibCallName = E->getValueType()->isSignedIntegerType()
- ? "__atomic_fetch_min"
- : "__atomic_fetch_umin";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- LoweredMemTy, E->getExprLoc(), TInfo.Width);
- break;
+ case AtomicExpr::AO__scoped_atomic_fetch_min:
+ case AtomicExpr::AO__scoped_atomic_min_fetch:
case AtomicExpr::AO__atomic_max_fetch:
- case AtomicExpr::AO__scoped_atomic_max_fetch:
- PostOpMinMax = true;
- [[fallthrough]];
case AtomicExpr::AO__atomic_fetch_max:
case AtomicExpr::AO__c11_atomic_fetch_max:
case AtomicExpr::AO__hip_atomic_fetch_max:
case AtomicExpr::AO__opencl_atomic_fetch_max:
case AtomicExpr::AO__scoped_atomic_fetch_max:
- LibCallName = E->getValueType()->isSignedIntegerType()
- ? "__atomic_fetch_max"
- : "__atomic_fetch_umax";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- LoweredMemTy, E->getExprLoc(), TInfo.Width);
- break;
- // T __atomic_nand_fetch_N(T *mem, T val, int order)
- // T __atomic_fetch_nand_N(T *mem, T val, int order)
- case AtomicExpr::AO__atomic_nand_fetch:
- case AtomicExpr::AO__scoped_atomic_nand_fetch:
- PostOp = llvm::Instruction::And; // the NOT is special cased below
- [[fallthrough]];
- case AtomicExpr::AO__atomic_fetch_nand:
- case AtomicExpr::AO__c11_atomic_fetch_nand:
- case AtomicExpr::AO__scoped_atomic_fetch_nand:
- LibCallName = "__atomic_fetch_nand";
- AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
- MemTy, E->getExprLoc(), TInfo.Width);
- break;
+ case AtomicExpr::AO__scoped_atomic_max_fetch:
+ llvm_unreachable("Integral atomic operations always become atomicrmw!");
}
if (E->isOpenCL()) {
- LibCallName = std::string("__opencl") +
- StringRef(LibCallName).drop_front(1).str();
-
+ LibCallName =
+ std::string("__opencl") + StringRef(LibCallName).drop_front(1).str();
}
- // Optimized functions have the size in their name.
- if (UseOptimizedLibcall)
- LibCallName += "_" + llvm::utostr(Size);
// By default, assume we return a value of the atomic type.
if (!HaveRetTy) {
- if (UseOptimizedLibcall) {
- // Value is returned directly.
- // The function returns an appropriately sized integer type.
- RetTy = getContext().getIntTypeForBitwidth(
- getContext().toBits(TInfo.Width), /*Signed=*/false);
- } else {
- // Value is returned through parameter before the order.
- RetTy = getContext().VoidTy;
- Args.add(RValue::get(Dest.getPointer()), getContext().VoidPtrTy);
- }
+ // Value is returned through parameter before the order.
+ RetTy = getContext().VoidTy;
+ Args.add(RValue::get(CastToGenericAddrSpace(Dest.getPointer(), RetTy)),
+ getContext().VoidPtrTy);
}
- // order is always the last parameter
+ // Order is always the last parameter.
Args.add(RValue::get(Order),
getContext().IntTy);
if (E->isOpenCL())
Args.add(RValue::get(Scope), getContext().IntTy);
- // PostOp is only needed for the atomic_*_fetch operations, and
- // thus is only needed for and implemented in the
- // UseOptimizedLibcall codepath.
- assert(UseOptimizedLibcall || (!PostOp && !PostOpMinMax));
-
RValue Res = emitAtomicLibcall(*this, LibCallName, RetTy, Args);
// The value is returned directly from the libcall.
if (E->isCmpXChg())
return Res;
- // The value is returned directly for optimized libcalls but the expr
- // provided an out-param.
- if (UseOptimizedLibcall && Res.getScalarVal()) {
- llvm::Value *ResVal = Res.getScalarVal();
- if (PostOpMinMax) {
- llvm::Value *LoadVal1 = Args[1].getRValue(*this).getScalarVal();
- ResVal = EmitPostAtomicMinMax(Builder, E->getOp(),
- E->getValueType()->isSignedIntegerType(),
- ResVal, LoadVal1);
- } else if (PostOp) {
- llvm::Value *LoadVal1 = Args[1].getRValue(*this).getScalarVal();
- ResVal = Builder.CreateBinOp(PostOp, ResVal, LoadVal1);
- }
- if (E->getOp() == AtomicExpr::AO__atomic_nand_fetch ||
- E->getOp() == AtomicExpr::AO__scoped_atomic_nand_fetch)
- ResVal = Builder.CreateNot(ResVal);
-
- Builder.CreateStore(ResVal, Dest.withElementType(ResVal->getType()));
- }
-
if (RValTy->isVoidType())
return RValue::get(nullptr);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e051cbc64863..ee0b75047696 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3443,6 +3443,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
return RValue::get(Builder.CreateCall(F));
}
+ case Builtin::BI__builtin_readsteadycounter: {
+ Function *F = CGM.getIntrinsic(Intrinsic::readsteadycounter);
+ return RValue::get(Builder.CreateCall(F));
+ }
case Builtin::BI__builtin___clear_cache: {
Value *Begin = EmitScalarExpr(E->getArg(0));
Value *End = EmitScalarExpr(E->getArg(1));
@@ -5908,7 +5912,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}
}
- assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
+ assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
"Must be able to losslessly bit cast to param");
// Cast vector type (e.g., v256i32) to x86_amx, this only happen
// in amx intrinsics.
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index c7af8f1cf0fe..8030d632cc3d 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -54,10 +54,15 @@ public:
};
private:
+ LLVM_PREFERRED_TYPE(DeviceVarKind)
unsigned Kind : 2;
+ LLVM_PREFERRED_TYPE(bool)
unsigned Extern : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned Constant : 1; // Constant variable.
+ LLVM_PREFERRED_TYPE(bool)
unsigned Managed : 1; // Managed variable.
+ LLVM_PREFERRED_TYPE(bool)
unsigned Normalized : 1; // Normalized texture.
int SurfTexType; // Type of surface/texutre.
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index cd26a3df7860..d05cf1c6e181 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1301,27 +1301,25 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
// If coercing a fixed vector to a scalable vector for ABI compatibility, and
// the types match, use the llvm.vector.insert intrinsic to perform the
// conversion.
- if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
- if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
- // If we are casting a fixed i8 vector to a scalable 16 x i1 predicate
+ if (auto *ScalableDstTy = dyn_cast<llvm::ScalableVectorType>(Ty)) {
+ if (auto *FixedSrcTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
+ // If we are casting a fixed i8 vector to a scalable i1 predicate
// vector, use a vector insert and bitcast the result.
- bool NeedsBitcast = false;
- auto PredType =
- llvm::ScalableVectorType::get(CGF.Builder.getInt1Ty(), 16);
- llvm::Type *OrigType = Ty;
- if (ScalableDst == PredType &&
- FixedSrc->getElementType() == CGF.Builder.getInt8Ty()) {
- ScalableDst = llvm::ScalableVectorType::get(CGF.Builder.getInt8Ty(), 2);
- NeedsBitcast = true;
+ if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
+ ScalableDstTy->getElementCount().isKnownMultipleOf(8) &&
+ FixedSrcTy->getElementType()->isIntegerTy(8)) {
+ ScalableDstTy = llvm::ScalableVectorType::get(
+ FixedSrcTy->getElementType(),
+ ScalableDstTy->getElementCount().getKnownMinValue() / 8);
}
- if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
+ if (ScalableDstTy->getElementType() == FixedSrcTy->getElementType()) {
auto *Load = CGF.Builder.CreateLoad(Src);
- auto *UndefVec = llvm::UndefValue::get(ScalableDst);
+ auto *UndefVec = llvm::UndefValue::get(ScalableDstTy);
auto *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
llvm::Value *Result = CGF.Builder.CreateInsertVector(
- ScalableDst, UndefVec, Load, Zero, "cast.scalable");
- if (NeedsBitcast)
- Result = CGF.Builder.CreateBitCast(Result, OrigType);
+ ScalableDstTy, UndefVec, Load, Zero, "cast.scalable");
+ if (ScalableDstTy != Ty)
+ Result = CGF.Builder.CreateBitCast(Result, Ty);
return Result;
}
}
@@ -3199,13 +3197,14 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
llvm::Value *Coerced = Fn->getArg(FirstIRArg);
if (auto *VecTyFrom =
dyn_cast<llvm::ScalableVectorType>(Coerced->getType())) {
- // If we are casting a scalable 16 x i1 predicate vector to a fixed i8
+ // If we are casting a scalable i1 predicate vector to a fixed i8
// vector, bitcast the source and use a vector extract.
- auto PredType =
- llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
- if (VecTyFrom == PredType &&
+ if (VecTyFrom->getElementType()->isIntegerTy(1) &&
+ VecTyFrom->getElementCount().isKnownMultipleOf(8) &&
VecTyTo->getElementType() == Builder.getInt8Ty()) {
- VecTyFrom = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
+ VecTyFrom = llvm::ScalableVectorType::get(
+ VecTyTo->getElementType(),
+ VecTyFrom->getElementCount().getKnownMinValue() / 8);
Coerced = Builder.CreateBitCast(Coerced, VecTyFrom);
}
if (VecTyFrom->getElementType() == VecTyTo->getElementType()) {
@@ -5877,12 +5876,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
// If coercing a fixed vector from a scalable vector for ABI
// compatibility, and the types match, use the llvm.vector.extract
// intrinsic to perform the conversion.
- if (auto *FixedDst = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
+ if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
llvm::Value *V = CI;
- if (auto *ScalableSrc = dyn_cast<llvm::ScalableVectorType>(V->getType())) {
- if (FixedDst->getElementType() == ScalableSrc->getElementType()) {
+ if (auto *ScalableSrcTy =
+ dyn_cast<llvm::ScalableVectorType>(V->getType())) {
+ if (FixedDstTy->getElementType() == ScalableSrcTy->getElementType()) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);
- V = Builder.CreateExtractVector(FixedDst, V, Zero, "cast.fixed");
+ V = Builder.CreateExtractVector(FixedDstTy, V, Zero, "cast.fixed");
return RValue::get(V);
}
}
diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h
index 1c0d15dc932a..1bd48a072593 100644
--- a/clang/lib/CodeGen/CGCall.h
+++ b/clang/lib/CodeGen/CGCall.h
@@ -357,8 +357,11 @@ class ReturnValueSlot {
Address Addr = Address::invalid();
// Return value slot flags
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsVolatile : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsUnused : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsExternallyDestructed : 1;
public:
diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h
index fcfbf41b0eaf..7a7344c07160 100644
--- a/clang/lib/CodeGen/CGCleanup.h
+++ b/clang/lib/CodeGen/CGCleanup.h
@@ -40,6 +40,10 @@ struct CatchTypeInfo {
/// A protected scope for zero-cost EH handling.
class EHScope {
+public:
+ enum Kind { Cleanup, Catch, Terminate, Filter };
+
+private:
llvm::BasicBlock *CachedLandingPad;
llvm::BasicBlock *CachedEHDispatchBlock;
@@ -47,6 +51,7 @@ class EHScope {
class CommonBitFields {
friend class EHScope;
+ LLVM_PREFERRED_TYPE(Kind)
unsigned Kind : 3;
};
enum { NumCommonBits = 3 };
@@ -64,21 +69,27 @@ protected:
unsigned : NumCommonBits;
/// Whether this cleanup needs to be run along normal edges.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsNormalCleanup : 1;
/// Whether this cleanup needs to be run along exception edges.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsEHCleanup : 1;
/// Whether this cleanup is currently active.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsActive : 1;
/// Whether this cleanup is a lifetime marker
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsLifetimeMarker : 1;
/// Whether the normal cleanup should test the activation flag.
+ LLVM_PREFERRED_TYPE(bool)
unsigned TestFlagInNormalCleanup : 1;
/// Whether the EH cleanup should test the activation flag.
+ LLVM_PREFERRED_TYPE(bool)
unsigned TestFlagInEHCleanup : 1;
/// The amount of extra storage needed by the Cleanup.
@@ -101,8 +112,6 @@ protected:
};
public:
- enum Kind { Cleanup, Catch, Terminate, Filter };
-
EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope)
: CachedLandingPad(nullptr), CachedEHDispatchBlock(nullptr),
EnclosingEHScope(enclosingEHScope) {
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index d136bfc37278..2adbef6d5512 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1423,6 +1423,7 @@ namespace {
};
unsigned NumPlacementArgs : 31;
+ LLVM_PREFERRED_TYPE(bool)
unsigned PassAlignmentToPlacementDelete : 1;
const FunctionDecl *OperatorDelete;
ValueTy Ptr;
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index df8f71cf1d90..aa805f291d17 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2137,26 +2137,24 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
// If Src is a fixed vector and Dst is a scalable vector, and both have the
// same element type, use the llvm.vector.insert intrinsic to perform the
// bitcast.
- if (const auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
- if (const auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(DstTy)) {
- // If we are casting a fixed i8 vector to a scalable 16 x i1 predicate
+ if (auto *FixedSrcTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
+ if (auto *ScalableDstTy = dyn_cast<llvm::ScalableVectorType>(DstTy)) {
+ // If we are casting a fixed i8 vector to a scalable i1 predicate
// vector, use a vector insert and bitcast the result.
- bool NeedsBitCast = false;
- auto PredType = llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
- llvm::Type *OrigType = DstTy;
- if (ScalableDst == PredType &&
- FixedSrc->getElementType() == Builder.getInt8Ty()) {
- DstTy = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
- ScalableDst = cast<llvm::ScalableVectorType>(DstTy);
- NeedsBitCast = true;
+ if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
+ ScalableDstTy->getElementCount().isKnownMultipleOf(8) &&
+ FixedSrcTy->getElementType()->isIntegerTy(8)) {
+ ScalableDstTy = llvm::ScalableVectorType::get(
+ FixedSrcTy->getElementType(),
+ ScalableDstTy->getElementCount().getKnownMinValue() / 8);
}
- if (FixedSrc->getElementType() == ScalableDst->getElementType()) {
- llvm::Value *UndefVec = llvm::UndefValue::get(DstTy);
+ if (FixedSrcTy->getElementType() == ScalableDstTy->getElementType()) {
+ llvm::Value *UndefVec = llvm::UndefValue::get(ScalableDstTy);
llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
llvm::Value *Result = Builder.CreateInsertVector(
- DstTy, UndefVec, Src, Zero, "cast.scalable");
- if (NeedsBitCast)
- Result = Builder.CreateBitCast(Result, OrigType);
+ ScalableDstTy, UndefVec, Src, Zero, "cast.scalable");
+ if (Result->getType() != DstTy)
+ Result = Builder.CreateBitCast(Result, DstTy);
return Result;
}
}
@@ -2165,18 +2163,19 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
// If Src is a scalable vector and Dst is a fixed vector, and both have the
// same element type, use the llvm.vector.extract intrinsic to perform the
// bitcast.
- if (const auto *ScalableSrc = dyn_cast<llvm::ScalableVectorType>(SrcTy)) {
- if (const auto *FixedDst = dyn_cast<llvm::FixedVectorType>(DstTy)) {
- // If we are casting a scalable 16 x i1 predicate vector to a fixed i8
+ if (auto *ScalableSrcTy = dyn_cast<llvm::ScalableVectorType>(SrcTy)) {
+ if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(DstTy)) {
+ // If we are casting a scalable i1 predicate vector to a fixed i8
// vector, bitcast the source and use a vector extract.
- auto PredType = llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
- if (ScalableSrc == PredType &&
- FixedDst->getElementType() == Builder.getInt8Ty()) {
- SrcTy = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
- ScalableSrc = cast<llvm::ScalableVectorType>(SrcTy);
- Src = Builder.CreateBitCast(Src, SrcTy);
+ if (ScalableSrcTy->getElementType()->isIntegerTy(1) &&
+ ScalableSrcTy->getElementCount().isKnownMultipleOf(8) &&
+ FixedDstTy->getElementType()->isIntegerTy(8)) {
+ ScalableSrcTy = llvm::ScalableVectorType::get(
+ FixedDstTy->getElementType(),
+ ScalableSrcTy->getElementCount().getKnownMinValue() / 8);
+ Src = Builder.CreateBitCast(Src, ScalableSrcTy);
}
- if (ScalableSrc->getElementType() == FixedDst->getElementType()) {
+ if (ScalableSrcTy->getElementType() == FixedDstTy->getElementType()) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
return Builder.CreateExtractVector(DstTy, Src, Zero, "cast.fixed");
}
@@ -4168,7 +4167,7 @@ Value *ScalarExprEmitter::EmitShl(const BinOpInfo &Ops) {
bool SanitizeBase = SanitizeSignedBase || SanitizeUnsignedBase;
bool SanitizeExponent = CGF.SanOpts.has(SanitizerKind::ShiftExponent);
// OpenCL 6.3j: shift values are effectively % word size of LHS.
- if (CGF.getLangOpts().OpenCL)
+ if (CGF.getLangOpts().OpenCL || CGF.getLangOpts().HLSL)
RHS = ConstrainShiftValue(Ops.LHS, RHS, "shl.mask");
else if ((SanitizeBase || SanitizeExponent) &&
isa<llvm::IntegerType>(Ops.LHS->getType())) {
@@ -4237,7 +4236,7 @@ Value *ScalarExprEmitter::EmitShr(const BinOpInfo &Ops) {
RHS = Builder.CreateIntCast(RHS, Ops.LHS->getType(), false, "sh_prom");
// OpenCL 6.3j: shift values are effectively % word size of LHS.
- if (CGF.getLangOpts().OpenCL)
+ if (CGF.getLangOpts().OpenCL || CGF.getLangOpts().HLSL)
RHS = ConstrainShiftValue(Ops.LHS, RHS, "shr.mask");
else if (CGF.SanOpts.has(SanitizerKind::ShiftExponent) &&
isa<llvm::IntegerType>(Ops.LHS->getType())) {
diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp
index 03fc0ec7ff54..f3a948cf13f9 100644
--- a/clang/lib/CodeGen/CGObjC.cpp
+++ b/clang/lib/CodeGen/CGObjC.cpp
@@ -899,9 +899,13 @@ namespace {
const ObjCPropertyImplDecl *propImpl);
private:
+ LLVM_PREFERRED_TYPE(StrategyKind)
unsigned Kind : 8;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsAtomic : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsCopy : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned HasStrong : 1;
CharUnits IvarSize;
diff --git a/clang/lib/CodeGen/CGRecordLayout.h b/clang/lib/CodeGen/CGRecordLayout.h
index d5ea74922603..6c06ad20fbe5 100644
--- a/clang/lib/CodeGen/CGRecordLayout.h
+++ b/clang/lib/CodeGen/CGRecordLayout.h
@@ -71,6 +71,7 @@ struct CGBitFieldInfo {
unsigned Size : 15;
/// Whether the bit-field is signed.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsSigned : 1;
/// The storage size in bits which should be used when accessing this
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index beff0ad9da27..af51875782c9 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -435,6 +435,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
case Stmt::OMPParallelMaskedDirectiveClass:
EmitOMPParallelMaskedDirective(cast<OMPParallelMaskedDirective>(*S));
break;
+ case Stmt::OpenACCComputeConstructClass:
+ EmitOpenACCComputeConstruct(cast<OpenACCComputeConstruct>(*S));
+ break;
}
}
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 143ad64e8816..caa6a327550b 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -26,6 +26,7 @@
#include "clang/AST/ExprCXX.h"
#include "clang/AST/ExprObjC.h"
#include "clang/AST/ExprOpenMP.h"
+#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/Type.h"
#include "clang/Basic/ABI.h"
@@ -203,6 +204,7 @@ template <> struct DominatingValue<RValue> {
llvm::Value *Value;
llvm::Type *ElementType;
+ LLVM_PREFERRED_TYPE(Kind)
unsigned K : 3;
unsigned Align : 29;
saved_type(llvm::Value *v, llvm::Type *e, Kind k, unsigned a = 0)
@@ -650,9 +652,11 @@ public:
struct LifetimeExtendedCleanupHeader {
/// The size of the following cleanup object.
unsigned Size;
- /// The kind of cleanup to push: a value from the CleanupKind enumeration.
+ /// The kind of cleanup to push.
+ LLVM_PREFERRED_TYPE(CleanupKind)
unsigned Kind : 31;
/// Whether this is a conditional cleanup.
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsConditional : 1;
size_t getSize() const { return Size; }
@@ -3837,6 +3841,15 @@ private:
void EmitSections(const OMPExecutableDirective &S);
public:
+ //===--------------------------------------------------------------------===//
+ // OpenACC Emission
+ //===--------------------------------------------------------------------===//
+ void EmitOpenACCComputeConstruct(const OpenACCComputeConstruct &S) {
+ // TODO OpenACC: Implement this. It is currently implemented as a 'no-op',
+ // simply emitting its structured block, but in the future we will implement
+ // some sort of IR.
+ EmitStmt(S.getStructuredBlock());
+ }
//===--------------------------------------------------------------------===//
// LValue Expression Emission
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 36b63d78b06f..2f923d5457f9 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -30,6 +30,7 @@
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTLambda.h"
#include "clang/AST/CharUnits.h"
+#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclObjC.h"
#include "clang/AST/DeclTemplate.h"
@@ -4212,7 +4213,8 @@ void CodeGenModule::emitMultiVersionFunctions() {
llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD);
if (auto *IFunc = dyn_cast<llvm::GlobalIFunc>(ResolverConstant)) {
ResolverConstant = IFunc->getResolver();
- if (FD->isTargetClonesMultiVersion()) {
+ if (FD->isTargetClonesMultiVersion() ||
+ FD->isTargetVersionMultiVersion()) {
const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
llvm::FunctionType *DeclTy = getTypes().GetFunctionType(FI);
std::string MangledName = getMangledNameImpl(
@@ -4393,8 +4395,18 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) {
// a separate resolver).
std::string ResolverName = MangledName;
if (getTarget().supportsIFunc()) {
- if (!FD->isTargetClonesMultiVersion())
+ switch (FD->getMultiVersionKind()) {
+ case MultiVersionKind::None:
+ llvm_unreachable("unexpected MultiVersionKind::None for resolver");
+ case MultiVersionKind::Target:
+ case MultiVersionKind::CPUSpecific:
+ case MultiVersionKind::CPUDispatch:
ResolverName += ".ifunc";
+ break;
+ case MultiVersionKind::TargetClones:
+ case MultiVersionKind::TargetVersion:
+ break;
+ }
} else if (FD->isTargetMultiVersion()) {
ResolverName += ".resolver";
}
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 0c43317642bc..93fe76eb9903 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -95,9 +95,6 @@ void CoverageSourceInfo::updateNextTokLoc(SourceLocation Loc) {
}
namespace {
-using MCDCConditionID = CounterMappingRegion::MCDCConditionID;
-using MCDCParameters = CounterMappingRegion::MCDCParameters;
-
/// A region of source code that can be mapped to a counter.
class SourceMappingRegion {
/// Primary Counter that is also used for Branch Regions for "True" branches.
@@ -107,7 +104,7 @@ class SourceMappingRegion {
std::optional<Counter> FalseCount;
/// Parameters used for Modified Condition/Decision Coverage
- MCDCParameters MCDCParams;
+ mcdc::Parameters MCDCParams;
/// The region's starting location.
std::optional<SourceLocation> LocStart;
@@ -131,7 +128,7 @@ public:
SkippedRegion(false) {}
SourceMappingRegion(Counter Count, std::optional<Counter> FalseCount,
- MCDCParameters MCDCParams,
+ mcdc::Parameters MCDCParams,
std::optional<SourceLocation> LocStart,
std::optional<SourceLocation> LocEnd,
bool GapRegion = false)
@@ -139,7 +136,7 @@ public:
LocStart(LocStart), LocEnd(LocEnd), GapRegion(GapRegion),
SkippedRegion(false) {}
- SourceMappingRegion(MCDCParameters MCDCParams,
+ SourceMappingRegion(mcdc::Parameters MCDCParams,
std::optional<SourceLocation> LocStart,
std::optional<SourceLocation> LocEnd)
: MCDCParams(MCDCParams), LocStart(LocStart), LocEnd(LocEnd),
@@ -185,9 +182,19 @@ public:
bool isBranch() const { return FalseCount.has_value(); }
- bool isMCDCDecision() const { return MCDCParams.NumConditions != 0; }
+ bool isMCDCDecision() const {
+ const auto *DecisionParams =
+ std::get_if<mcdc::DecisionParameters>(&MCDCParams);
+ assert(!DecisionParams || DecisionParams->NumConditions > 0);
+ return DecisionParams;
+ }
+
+ const auto &getMCDCDecisionParams() const {
+ return CounterMappingRegion::getParams<const mcdc::DecisionParameters>(
+ MCDCParams);
+ }
- const MCDCParameters &getMCDCParams() const { return MCDCParams; }
+ const mcdc::Parameters &getMCDCParams() const { return MCDCParams; }
};
/// Spelling locations for the start and end of a source region.
@@ -483,13 +490,13 @@ public:
SR.ColumnEnd));
} else if (Region.isBranch()) {
MappingRegions.push_back(CounterMappingRegion::makeBranchRegion(
- Region.getCounter(), Region.getFalseCounter(),
- Region.getMCDCParams(), *CovFileID, SR.LineStart, SR.ColumnStart,
- SR.LineEnd, SR.ColumnEnd));
+ Region.getCounter(), Region.getFalseCounter(), *CovFileID,
+ SR.LineStart, SR.ColumnStart, SR.LineEnd, SR.ColumnEnd,
+ Region.getMCDCParams()));
} else if (Region.isMCDCDecision()) {
MappingRegions.push_back(CounterMappingRegion::makeDecisionRegion(
- Region.getMCDCParams(), *CovFileID, SR.LineStart, SR.ColumnStart,
- SR.LineEnd, SR.ColumnEnd));
+ Region.getMCDCDecisionParams(), *CovFileID, SR.LineStart,
+ SR.ColumnStart, SR.LineEnd, SR.ColumnEnd));
} else {
MappingRegions.push_back(CounterMappingRegion::makeRegion(
Region.getCounter(), *CovFileID, SR.LineStart, SR.ColumnStart,
@@ -587,8 +594,8 @@ struct EmptyCoverageMappingBuilder : public CoverageMappingBuilder {
struct MCDCCoverageBuilder {
struct DecisionIDPair {
- MCDCConditionID TrueID = 0;
- MCDCConditionID FalseID = 0;
+ mcdc::ConditionID TrueID = 0;
+ mcdc::ConditionID FalseID = 0;
};
/// The AST walk recursively visits nested logical-AND or logical-OR binary
@@ -682,9 +689,9 @@ private:
CodeGenModule &CGM;
llvm::SmallVector<DecisionIDPair> DecisionStack;
- llvm::DenseMap<const Stmt *, MCDCConditionID> &CondIDs;
+ llvm::DenseMap<const Stmt *, mcdc::ConditionID> &CondIDs;
llvm::DenseMap<const Stmt *, unsigned> &MCDCBitmapMap;
- MCDCConditionID NextID = 1;
+ mcdc::ConditionID NextID = 1;
bool NotMapped = false;
/// Represent a sentinel value of [0,0] for the bottom of DecisionStack.
@@ -696,9 +703,10 @@ private:
}
public:
- MCDCCoverageBuilder(CodeGenModule &CGM,
- llvm::DenseMap<const Stmt *, MCDCConditionID> &CondIDMap,
- llvm::DenseMap<const Stmt *, unsigned> &MCDCBitmapMap)
+ MCDCCoverageBuilder(
+ CodeGenModule &CGM,
+ llvm::DenseMap<const Stmt *, mcdc::ConditionID> &CondIDMap,
+ llvm::DenseMap<const Stmt *, unsigned> &MCDCBitmapMap)
: CGM(CGM), DecisionStack(1, DecisionStackSentinel), CondIDs(CondIDMap),
MCDCBitmapMap(MCDCBitmapMap) {}
@@ -713,12 +721,12 @@ public:
bool isBuilding() const { return (NextID > 1); }
/// Set the given condition's ID.
- void setCondID(const Expr *Cond, MCDCConditionID ID) {
+ void setCondID(const Expr *Cond, mcdc::ConditionID ID) {
CondIDs[CodeGenFunction::stripCond(Cond)] = ID;
}
/// Return the ID of a given condition.
- MCDCConditionID getCondID(const Expr *Cond) const {
+ mcdc::ConditionID getCondID(const Expr *Cond) const {
auto I = CondIDs.find(CodeGenFunction::stripCond(Cond));
if (I == CondIDs.end())
return 0;
@@ -755,7 +763,7 @@ public:
setCondID(E->getLHS(), NextID++);
// Assign a ID+1 for the RHS.
- MCDCConditionID RHSid = NextID++;
+ mcdc::ConditionID RHSid = NextID++;
setCondID(E->getRHS(), RHSid);
// Push the LHS decision IDs onto the DecisionStack.
@@ -865,8 +873,7 @@ struct CounterCoverageMappingBuilder
std::optional<SourceLocation> StartLoc = std::nullopt,
std::optional<SourceLocation> EndLoc = std::nullopt,
std::optional<Counter> FalseCount = std::nullopt,
- MCDCConditionID ID = 0, MCDCConditionID TrueID = 0,
- MCDCConditionID FalseID = 0) {
+ const mcdc::Parameters &BranchParams = std::monostate()) {
if (StartLoc && !FalseCount) {
MostRecentLocation = *StartLoc;
@@ -885,9 +892,7 @@ struct CounterCoverageMappingBuilder
StartLoc = std::nullopt;
if (EndLoc && EndLoc->isInvalid())
EndLoc = std::nullopt;
- RegionStack.emplace_back(Count, FalseCount,
- MCDCParameters{0, 0, ID, TrueID, FalseID},
- StartLoc, EndLoc);
+ RegionStack.emplace_back(Count, FalseCount, BranchParams, StartLoc, EndLoc);
return RegionStack.size() - 1;
}
@@ -896,8 +901,8 @@ struct CounterCoverageMappingBuilder
std::optional<SourceLocation> StartLoc = std::nullopt,
std::optional<SourceLocation> EndLoc = std::nullopt) {
- RegionStack.emplace_back(MCDCParameters{BitmapIdx, Conditions}, StartLoc,
- EndLoc);
+ RegionStack.emplace_back(mcdc::DecisionParameters{BitmapIdx, Conditions},
+ StartLoc, EndLoc);
return RegionStack.size() - 1;
}
@@ -1042,9 +1047,11 @@ struct CounterCoverageMappingBuilder
// function's SourceRegions) because it doesn't apply to any other source
// code other than the Condition.
if (CodeGenFunction::isInstrumentedCondition(C)) {
- MCDCConditionID ID = MCDCBuilder.getCondID(C);
- MCDCConditionID TrueID = IDPair.TrueID;
- MCDCConditionID FalseID = IDPair.FalseID;
+ mcdc::Parameters BranchParams;
+ mcdc::ConditionID ID = MCDCBuilder.getCondID(C);
+ if (ID > 0)
+ BranchParams =
+ mcdc::BranchParameters{ID, IDPair.TrueID, IDPair.FalseID};
// If a condition can fold to true or false, the corresponding branch
// will be removed. Create a region with both counters hard-coded to
@@ -1054,11 +1061,11 @@ struct CounterCoverageMappingBuilder
// CodeGenFunction.c always returns false, but that is very heavy-handed.
if (ConditionFoldsToBool(C))
popRegions(pushRegion(Counter::getZero(), getStart(C), getEnd(C),
- Counter::getZero(), ID, TrueID, FalseID));
+ Counter::getZero(), BranchParams));
else
// Otherwise, create a region with the True counter and False counter.
- popRegions(pushRegion(TrueCnt, getStart(C), getEnd(C), FalseCnt, ID,
- TrueID, FalseID));
+ popRegions(pushRegion(TrueCnt, getStart(C), getEnd(C), FalseCnt,
+ BranchParams));
}
}
@@ -1149,12 +1156,9 @@ struct CounterCoverageMappingBuilder
// we've seen this region.
if (StartLocs.insert(Loc).second) {
if (I.isBranch())
- SourceRegions.emplace_back(
- I.getCounter(), I.getFalseCounter(),
- MCDCParameters{0, 0, I.getMCDCParams().ID,
- I.getMCDCParams().TrueID,
- I.getMCDCParams().FalseID},
- Loc, getEndOfFileOrMacro(Loc), I.isBranch());
+ SourceRegions.emplace_back(I.getCounter(), I.getFalseCounter(),
+ I.getMCDCParams(), Loc,
+ getEndOfFileOrMacro(Loc), I.isBranch());
else
SourceRegions.emplace_back(I.getCounter(), Loc,
getEndOfFileOrMacro(Loc));
@@ -1338,7 +1342,7 @@ struct CounterCoverageMappingBuilder
CoverageMappingModuleGen &CVM,
llvm::DenseMap<const Stmt *, unsigned> &CounterMap,
llvm::DenseMap<const Stmt *, unsigned> &MCDCBitmapMap,
- llvm::DenseMap<const Stmt *, MCDCConditionID> &CondIDMap,
+ llvm::DenseMap<const Stmt *, mcdc::ConditionID> &CondIDMap,
SourceManager &SM, const LangOptions &LangOpts)
: CoverageMappingBuilder(CVM, SM, LangOpts), CounterMap(CounterMap),
MCDCBitmapMap(MCDCBitmapMap),
@@ -2120,9 +2124,10 @@ static void dump(llvm::raw_ostream &OS, StringRef FunctionName,
OS << "File " << R.FileID << ", " << R.LineStart << ":" << R.ColumnStart
<< " -> " << R.LineEnd << ":" << R.ColumnEnd << " = ";
- if (R.Kind == CounterMappingRegion::MCDCDecisionRegion) {
- OS << "M:" << R.MCDCParams.BitmapIdx;
- OS << ", C:" << R.MCDCParams.NumConditions;
+ if (const auto *DecisionParams =
+ std::get_if<mcdc::DecisionParameters>(&R.MCDCParams)) {
+ OS << "M:" << DecisionParams->BitmapIdx;
+ OS << ", C:" << DecisionParams->NumConditions;
} else {
Ctx.dump(R.Count, OS);
@@ -2133,9 +2138,10 @@ static void dump(llvm::raw_ostream &OS, StringRef FunctionName,
}
}
- if (R.Kind == CounterMappingRegion::MCDCBranchRegion) {
- OS << " [" << R.MCDCParams.ID << "," << R.MCDCParams.TrueID;
- OS << "," << R.MCDCParams.FalseID << "] ";
+ if (const auto *BranchParams =
+ std::get_if<mcdc::BranchParameters>(&R.MCDCParams)) {
+ OS << " [" << BranchParams->ID << "," << BranchParams->TrueID;
+ OS << "," << BranchParams->FalseID << "] ";
}
if (R.Kind == CounterMappingRegion::ExpansionRegion)
diff --git a/clang/lib/CodeGen/MacroPPCallbacks.cpp b/clang/lib/CodeGen/MacroPPCallbacks.cpp
index 8589869f6e2f..c5d1e3ad5a20 100644
--- a/clang/lib/CodeGen/MacroPPCallbacks.cpp
+++ b/clang/lib/CodeGen/MacroPPCallbacks.cpp
@@ -168,8 +168,8 @@ void MacroPPCallbacks::FileChanged(SourceLocation Loc, FileChangeReason Reason,
void MacroPPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
// Record the line location of the current included file.
LastHashLoc = HashLoc;
diff --git a/clang/lib/CodeGen/MacroPPCallbacks.h b/clang/lib/CodeGen/MacroPPCallbacks.h
index 5af177d0c3fa..5f468648da04 100644
--- a/clang/lib/CodeGen/MacroPPCallbacks.h
+++ b/clang/lib/CodeGen/MacroPPCallbacks.h
@@ -102,7 +102,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
/// Hook called whenever a macro definition is seen.
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
index 22e583021515..ae1a4ba78826 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
@@ -178,4 +178,85 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
else
Features.push_back("-hard-quad-float");
}
+
+ if (Args.hasArg(options::OPT_ffixed_g1))
+ Features.push_back("+reserve-g1");
+
+ if (Args.hasArg(options::OPT_ffixed_g2))
+ Features.push_back("+reserve-g2");
+
+ if (Args.hasArg(options::OPT_ffixed_g3))
+ Features.push_back("+reserve-g3");
+
+ if (Args.hasArg(options::OPT_ffixed_g4))
+ Features.push_back("+reserve-g4");
+
+ if (Args.hasArg(options::OPT_ffixed_g5))
+ Features.push_back("+reserve-g5");
+
+ if (Args.hasArg(options::OPT_ffixed_g6))
+ Features.push_back("+reserve-g6");
+
+ if (Args.hasArg(options::OPT_ffixed_g7))
+ Features.push_back("+reserve-g7");
+
+ if (Args.hasArg(options::OPT_ffixed_o0))
+ Features.push_back("+reserve-o0");
+
+ if (Args.hasArg(options::OPT_ffixed_o1))
+ Features.push_back("+reserve-o1");
+
+ if (Args.hasArg(options::OPT_ffixed_o2))
+ Features.push_back("+reserve-o2");
+
+ if (Args.hasArg(options::OPT_ffixed_o3))
+ Features.push_back("+reserve-o3");
+
+ if (Args.hasArg(options::OPT_ffixed_o4))
+ Features.push_back("+reserve-o4");
+
+ if (Args.hasArg(options::OPT_ffixed_o5))
+ Features.push_back("+reserve-o5");
+
+ if (Args.hasArg(options::OPT_ffixed_l0))
+ Features.push_back("+reserve-l0");
+
+ if (Args.hasArg(options::OPT_ffixed_l1))
+ Features.push_back("+reserve-l1");
+
+ if (Args.hasArg(options::OPT_ffixed_l2))
+ Features.push_back("+reserve-l2");
+
+ if (Args.hasArg(options::OPT_ffixed_l3))
+ Features.push_back("+reserve-l3");
+
+ if (Args.hasArg(options::OPT_ffixed_l4))
+ Features.push_back("+reserve-l4");
+
+ if (Args.hasArg(options::OPT_ffixed_l5))
+ Features.push_back("+reserve-l5");
+
+ if (Args.hasArg(options::OPT_ffixed_l6))
+ Features.push_back("+reserve-l6");
+
+ if (Args.hasArg(options::OPT_ffixed_l7))
+ Features.push_back("+reserve-l7");
+
+ if (Args.hasArg(options::OPT_ffixed_i0))
+ Features.push_back("+reserve-i0");
+
+ if (Args.hasArg(options::OPT_ffixed_i1))
+ Features.push_back("+reserve-i1");
+
+ if (Args.hasArg(options::OPT_ffixed_i2))
+ Features.push_back("+reserve-i2");
+
+ if (Args.hasArg(options::OPT_ffixed_i3))
+ Features.push_back("+reserve-i3");
+
+ if (Args.hasArg(options::OPT_ffixed_i4))
+ Features.push_back("+reserve-i4");
+
+ if (Args.hasArg(options::OPT_ffixed_i5))
+ Features.push_back("+reserve-i5");
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 942ebbc41060..4459d86e77d5 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2778,6 +2778,26 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None;
std::string ComplexRangeStr = "";
+ // Lambda to set fast-math options. This is also used by -ffp-model=fast
+ auto applyFastMath = [&]() {
+ HonorINFs = false;
+ HonorNaNs = false;
+ MathErrno = false;
+ AssociativeMath = true;
+ ReciprocalMath = true;
+ ApproxFunc = true;
+ SignedZeros = false;
+ TrappingMath = false;
+ RoundingFPMath = false;
+ FPExceptionBehavior = "";
+ // If fast-math is set then set the fp-contract mode to fast.
+ FPContract = "fast";
+ // ffast-math enables limited range rules for complex multiplication and
+ // division.
+ Range = LangOptions::ComplexRangeKind::CX_Limited;
+ SeenUnsafeMathModeOption = true;
+ };
+
if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
CmdArgs.push_back("-mlimit-float-precision");
CmdArgs.push_back(A->getValue());
@@ -2842,9 +2862,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
<< Args.MakeArgString("-ffp-model=" + FPModel)
<< Args.MakeArgString("-ffp-model=" + Val);
if (Val.equals("fast")) {
- optID = options::OPT_ffast_math;
FPModel = Val;
- FPContract = "fast";
+ applyFastMath();
} else if (Val.equals("precise")) {
optID = options::OPT_ffp_contract;
FPModel = Val;
@@ -3061,22 +3080,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
continue;
[[fallthrough]];
case options::OPT_ffast_math: {
- HonorINFs = false;
- HonorNaNs = false;
- MathErrno = false;
- AssociativeMath = true;
- ReciprocalMath = true;
- ApproxFunc = true;
- SignedZeros = false;
- TrappingMath = false;
- RoundingFPMath = false;
- FPExceptionBehavior = "";
- // If fast-math is set then set the fp-contract mode to fast.
- FPContract = "fast";
- SeenUnsafeMathModeOption = true;
- // ffast-math enables fortran rules for complex multiplication and
- // division.
- Range = LangOptions::ComplexRangeKind::CX_Limited;
+ applyFastMath();
break;
}
case options::OPT_fno_fast_math:
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index fae8ad1a958a..cc1219d69d99 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1902,6 +1902,7 @@ getDeploymentTargetFromEnvironmentVariables(const Driver &TheDriver,
"TVOS_DEPLOYMENT_TARGET",
"WATCHOS_DEPLOYMENT_TARGET",
"DRIVERKIT_DEPLOYMENT_TARGET",
+ "XROS_DEPLOYMENT_TARGET"
};
static_assert(std::size(EnvVars) == Darwin::LastDarwinPlatform + 1,
"Missing platform");
@@ -1914,14 +1915,15 @@ getDeploymentTargetFromEnvironmentVariables(const Driver &TheDriver,
// default platform.
if (!Targets[Darwin::MacOS].empty() &&
(!Targets[Darwin::IPhoneOS].empty() ||
- !Targets[Darwin::WatchOS].empty() || !Targets[Darwin::TvOS].empty())) {
+ !Targets[Darwin::WatchOS].empty() || !Targets[Darwin::TvOS].empty() ||
+ !Targets[Darwin::XROS].empty())) {
if (Triple.getArch() == llvm::Triple::arm ||
Triple.getArch() == llvm::Triple::aarch64 ||
Triple.getArch() == llvm::Triple::thumb)
Targets[Darwin::MacOS] = "";
else
Targets[Darwin::IPhoneOS] = Targets[Darwin::WatchOS] =
- Targets[Darwin::TvOS] = "";
+ Targets[Darwin::TvOS] = Targets[Darwin::XROS] = "";
} else {
// Don't allow conflicts in any other platform.
unsigned FirstTarget = std::size(Targets);
diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h
index 5e60b0841d6d..10d4b69e5d5f 100644
--- a/clang/lib/Driver/ToolChains/Darwin.h
+++ b/clang/lib/Driver/ToolChains/Darwin.h
@@ -300,7 +300,7 @@ public:
WatchOS,
DriverKit,
XROS,
- LastDarwinPlatform = DriverKit
+ LastDarwinPlatform = XROS
};
enum DarwinEnvironmentKind {
NativeEnvironment,
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 23da08aa593f..6168b42dc782 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -249,6 +249,20 @@ void Flang::AddRISCVTargetArgs(const ArgList &Args,
}
}
+void Flang::AddX86_64TargetArgs(const ArgList &Args,
+ ArgStringList &CmdArgs) const {
+ if (Arg *A = Args.getLastArg(options::OPT_masm_EQ)) {
+ StringRef Value = A->getValue();
+ if (Value == "intel" || Value == "att") {
+ CmdArgs.push_back(Args.MakeArgString("-mllvm"));
+ CmdArgs.push_back(Args.MakeArgString("-x86-asm-syntax=" + Value));
+ } else {
+ getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument)
+ << A->getSpelling() << Value;
+ }
+ }
+}
+
static void addVSDefines(const ToolChain &TC, const ArgList &Args,
ArgStringList &CmdArgs) {
@@ -374,6 +388,7 @@ void Flang::addTargetOptions(const ArgList &Args,
break;
case llvm::Triple::x86_64:
getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
+ AddX86_64TargetArgs(Args, CmdArgs);
break;
}
diff --git a/clang/lib/Driver/ToolChains/Flang.h b/clang/lib/Driver/ToolChains/Flang.h
index ec2e545a1d0b..9f5e26b86083 100644
--- a/clang/lib/Driver/ToolChains/Flang.h
+++ b/clang/lib/Driver/ToolChains/Flang.h
@@ -77,6 +77,13 @@ private:
void AddRISCVTargetArgs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
+ /// Add specific options for X86_64 target.
+ ///
+ /// \param [in] Args The list of input driver arguments
+ /// \param [out] CmdArgs The list of output command arguments
+ void AddX86_64TargetArgs(const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) const;
+
/// Extract offload options from the driver arguments and add them to
/// the command arguments.
/// \param [in] C The current compilation for the driver invocation
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 7fd04b23abdc..0b2ef97af44d 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -569,9 +569,8 @@ bool ContinuationIndenter::mustBreak(const LineState &State) {
return true;
}
}
- return Style.AlwaysBreakTemplateDeclarations != FormatStyle::BTDS_No &&
- (Style.AlwaysBreakTemplateDeclarations !=
- FormatStyle::BTDS_Leave ||
+ return Style.BreakTemplateDeclarations != FormatStyle::BTDS_No &&
+ (Style.BreakTemplateDeclarations != FormatStyle::BTDS_Leave ||
Current.NewlinesBefore > 0);
}
if (Previous.is(TT_FunctionAnnotationRParen) &&
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index c5714af155f0..8efc42e0576c 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -877,6 +877,10 @@ template <> struct MappingTraits<FormatStyle> {
if (!IO.outputting()) {
IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines);
IO.mapOptional("AllowAllConstructorInitializersOnNextLine", OnNextLine);
+ IO.mapOptional("AlwaysBreakAfterReturnType",
+ Style.AlwaysBreakAfterReturnType);
+ IO.mapOptional("AlwaysBreakTemplateDeclarations",
+ Style.BreakTemplateDeclarations);
IO.mapOptional("BreakBeforeInheritanceComma",
BreakBeforeInheritanceComma);
IO.mapOptional("BreakConstructorInitializersBeforeComma",
@@ -939,12 +943,8 @@ template <> struct MappingTraits<FormatStyle> {
Style.AllowShortLoopsOnASingleLine);
IO.mapOptional("AlwaysBreakAfterDefinitionReturnType",
Style.AlwaysBreakAfterDefinitionReturnType);
- IO.mapOptional("AlwaysBreakAfterReturnType",
- Style.AlwaysBreakAfterReturnType);
IO.mapOptional("AlwaysBreakBeforeMultilineStrings",
Style.AlwaysBreakBeforeMultilineStrings);
- IO.mapOptional("AlwaysBreakTemplateDeclarations",
- Style.AlwaysBreakTemplateDeclarations);
IO.mapOptional("AttributeMacros", Style.AttributeMacros);
IO.mapOptional("BinPackArguments", Style.BinPackArguments);
IO.mapOptional("BinPackParameters", Style.BinPackParameters);
@@ -957,6 +957,7 @@ template <> struct MappingTraits<FormatStyle> {
IO.mapOptional("BreakAfterAttributes", Style.BreakAfterAttributes);
IO.mapOptional("BreakAfterJavaFieldAnnotations",
Style.BreakAfterJavaFieldAnnotations);
+ IO.mapOptional("BreakAfterReturnType", Style.AlwaysBreakAfterReturnType);
IO.mapOptional("BreakArrays", Style.BreakArrays);
IO.mapOptional("BreakBeforeBinaryOperators",
Style.BreakBeforeBinaryOperators);
@@ -971,6 +972,8 @@ template <> struct MappingTraits<FormatStyle> {
Style.BreakConstructorInitializers);
IO.mapOptional("BreakInheritanceList", Style.BreakInheritanceList);
IO.mapOptional("BreakStringLiterals", Style.BreakStringLiterals);
+ IO.mapOptional("BreakTemplateDeclarations",
+ Style.BreakTemplateDeclarations);
IO.mapOptional("ColumnLimit", Style.ColumnLimit);
IO.mapOptional("CommentPragmas", Style.CommentPragmas);
IO.mapOptional("CompactNamespaces", Style.CompactNamespaces);
@@ -1439,7 +1442,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
LLVMStyle.AlwaysBreakAfterReturnType = FormatStyle::RTBS_None;
LLVMStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_None;
LLVMStyle.AlwaysBreakBeforeMultilineStrings = false;
- LLVMStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_MultiLine;
+ LLVMStyle.BreakTemplateDeclarations = FormatStyle::BTDS_MultiLine;
LLVMStyle.AttributeMacros.push_back("__capability");
LLVMStyle.BitFieldColonSpacing = FormatStyle::BFCS_Both;
LLVMStyle.BinPackArguments = true;
@@ -1629,7 +1632,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
FormatStyle::SIS_WithoutElse;
GoogleStyle.AllowShortLoopsOnASingleLine = true;
GoogleStyle.AlwaysBreakBeforeMultilineStrings = true;
- GoogleStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes;
+ GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
GoogleStyle.DerivePointerAlignment = true;
GoogleStyle.IncludeStyle.IncludeCategories = {{"^<ext/.*\\.h>", 2, 0, false},
{"^<.*\\.h>", 1, 0, false},
@@ -1822,7 +1825,7 @@ FormatStyle getMozillaStyle() {
MozillaStyle.AlwaysBreakAfterReturnType = FormatStyle::RTBS_TopLevel;
MozillaStyle.AlwaysBreakAfterDefinitionReturnType =
FormatStyle::DRTBS_TopLevel;
- MozillaStyle.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes;
+ MozillaStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
MozillaStyle.BinPackParameters = false;
MozillaStyle.BinPackArguments = false;
MozillaStyle.BreakBeforeBraces = FormatStyle::BS_Mozilla;
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index bace91b5f99b..0c1dce7a2940 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -150,7 +150,17 @@ namespace format {
TYPE(StructuredBindingLSquare) \
TYPE(TableGenBangOperator) \
TYPE(TableGenCondOperator) \
+ TYPE(TableGenCondOperatorColon) \
+ TYPE(TableGenCondOperatorComma) \
+ TYPE(TableGenDAGArgCloser) \
+ TYPE(TableGenDAGArgListColon) \
+ TYPE(TableGenDAGArgListComma) \
+ TYPE(TableGenDAGArgOpener) \
+ TYPE(TableGenListCloser) \
+ TYPE(TableGenListOpener) \
TYPE(TableGenMultiLineString) \
+ TYPE(TableGenTrailingPasteOperator) \
+ TYPE(TableGenValueSuffix) \
TYPE(TemplateCloser) \
TYPE(TemplateOpener) \
TYPE(TemplateString) \
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index a87d0ba3dbbf..492e7e96dd22 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -13,11 +13,7 @@
//===----------------------------------------------------------------------===//
#include "FormatTokenLexer.h"
-#include "FormatToken.h"
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
-#include "llvm/Support/Regex.h"
+#include "TokenAnalyzer.h"
namespace clang {
namespace format {
@@ -28,12 +24,12 @@ FormatTokenLexer::FormatTokenLexer(
llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
IdentifierTable &IdentTable)
: FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
- Column(Column), TrailingWhitespace(0),
- LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
+ Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
MacroBlockEndRegex(Style.MacroBlockEnd) {
+ assert(LangOpts.CPlusPlus);
Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
Lex->SetKeepWhitespaceMode(true);
@@ -816,7 +812,7 @@ void FormatTokenLexer::handleTableGenMultilineString() {
auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
if (CloseOffset == StringRef::npos)
return;
- auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset + 2);
+ auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
MultiLineString->TokenText = Text;
resetLexer(SourceMgr.getFileOffset(
Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
@@ -1442,7 +1438,7 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) {
void FormatTokenLexer::resetLexer(unsigned Offset) {
StringRef Buffer = SourceMgr.getBufferData(ID);
- LangOpts = getFormattingLangOpts(Style);
+ assert(LangOpts.CPlusPlus);
Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
Lex->SetKeepWhitespaceMode(true);
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 65dd733bd533..ca91c5b7d20d 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -17,14 +17,9 @@
#include "Encoding.h"
#include "FormatToken.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Regex.h"
#include <stack>
@@ -120,7 +115,6 @@ private:
unsigned Column;
unsigned TrailingWhitespace;
std::unique_ptr<Lexer> Lex;
- LangOptions LangOpts;
const SourceManager &SourceMgr;
FileID ID;
const FormatStyle &Style;
diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
index 87823ae32b11..3c2ceddd5599 100644
--- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
+++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
@@ -79,7 +79,7 @@ IntegerLiteralSeparatorFixer::process(const Environment &Env,
AffectedRangeManager AffectedRangeMgr(SourceMgr, Env.getCharRanges());
const auto ID = Env.getFileID();
- const auto LangOpts = getFormattingLangOpts(Style);
+ assert(LangOpts.CPlusPlus);
Lexer Lex(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts);
Lex.SetCommentRetentionState(true);
diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp
index bd648c430f9b..f9d1fdb86f1a 100644
--- a/clang/lib/Format/TokenAnalyzer.cpp
+++ b/clang/lib/Format/TokenAnalyzer.cpp
@@ -35,6 +35,8 @@
namespace clang {
namespace format {
+LangOptions LangOpts;
+
// FIXME: Instead of printing the diagnostic we should store it and have a
// better way to return errors through the format APIs.
class FatalDiagnosticConsumer : public DiagnosticConsumer {
@@ -99,9 +101,11 @@ TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
std::pair<tooling::Replacements, unsigned>
TokenAnalyzer::process(bool SkipAnnotation) {
+ LangOpts = getFormattingLangOpts(Style);
+
tooling::Replacements Result;
llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
- IdentifierTable IdentTable(getFormattingLangOpts(Style));
+ IdentifierTable IdentTable(LangOpts);
FormatTokenLexer Lex(Env.getSourceManager(), Env.getFileID(),
Env.getFirstStartColumn(), Style, Encoding, Allocator,
IdentTable);
diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h
index 4086dab1c94c..18c1431eb376 100644
--- a/clang/lib/Format/TokenAnalyzer.h
+++ b/clang/lib/Format/TokenAnalyzer.h
@@ -34,6 +34,8 @@
namespace clang {
namespace format {
+extern LangOptions LangOpts;
+
class Environment {
public:
// This sets up an virtual file system with file \p FileName containing the
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index cec56fad5315..b9a000faae7c 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -256,6 +256,18 @@ private:
}
}
}
+ if (Style.isTableGen()) {
+ if (CurrentToken->isOneOf(tok::comma, tok::equal)) {
+ // They appear as separators. Unless they are not in class definition.
+ next();
+ continue;
+ }
+ // In angle, there must be Value like tokens. Types are also able to be
+ // parsed in the same way with Values.
+ if (!parseTableGenValue())
+ return false;
+ continue;
+ }
if (!consumeToken())
return false;
}
@@ -388,6 +400,28 @@ private:
Contexts.back().IsExpression = !IsForOrCatch;
}
+ if (Style.isTableGen()) {
+ if (FormatToken *Prev = OpeningParen.Previous) {
+ if (Prev->is(TT_TableGenCondOperator)) {
+ Contexts.back().IsTableGenCondOpe = true;
+ Contexts.back().IsExpression = true;
+ } else if (Contexts.size() > 1 &&
+ Contexts[Contexts.size() - 2].IsTableGenBangOpe) {
+ // Hack to handle bang operators. The parent context's flag
+ // was set by parseTableGenSimpleValue().
+ // We have to specify the context outside because the prev of "(" may
+ // be ">", not the bang operator in this case.
+ Contexts.back().IsTableGenBangOpe = true;
+ Contexts.back().IsExpression = true;
+ } else {
+ // Otherwise, this paren seems DAGArg.
+ if (!parseTableGenDAGArg())
+ return false;
+ return parseTableGenDAGArgAndList(&OpeningParen);
+ }
+ }
+ }
+
// Infer the role of the l_paren based on the previous token if we haven't
// detected one yet.
if (PrevNonComment && OpeningParen.is(TT_Unknown)) {
@@ -549,6 +583,22 @@ private:
if (CurrentToken->is(tok::comma))
Contexts.back().CanBeExpression = true;
+ if (Style.isTableGen()) {
+ if (CurrentToken->is(tok::comma)) {
+ if (Contexts.back().IsTableGenCondOpe)
+ CurrentToken->setType(TT_TableGenCondOperatorComma);
+ next();
+ } else if (CurrentToken->is(tok::colon)) {
+ if (Contexts.back().IsTableGenCondOpe)
+ CurrentToken->setType(TT_TableGenCondOperatorColon);
+ next();
+ }
+ // In TableGen there must be Values in parens.
+ if (!parseTableGenValue())
+ return false;
+ continue;
+ }
+
FormatToken *Tok = CurrentToken;
if (!consumeToken())
return false;
@@ -803,6 +853,8 @@ private:
if (Left->BlockParameterCount > 1)
Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName = 0;
}
+ if (Style.isTableGen() && Left->is(TT_TableGenListOpener))
+ CurrentToken->setType(TT_TableGenListCloser);
next();
return true;
}
@@ -833,6 +885,19 @@ private:
Left->setType(TT_ArrayInitializerLSquare);
}
FormatToken *Tok = CurrentToken;
+ if (Style.isTableGen()) {
+ if (CurrentToken->isOneOf(tok::comma, tok::minus, tok::ellipsis)) {
+ // '-' and '...' appears as a separator in slice.
+ next();
+ } else {
+ // In TableGen there must be a list of Values in square brackets.
+ // It must be ValueList or SliceElements.
+ if (!parseTableGenValue())
+ return false;
+ }
+ updateParameterCount(Left, Tok);
+ continue;
+ }
if (!consumeToken())
return false;
updateParameterCount(Left, Tok);
@@ -840,6 +905,193 @@ private:
return false;
}
+ void skipToNextNonComment() {
+ next();
+ while (CurrentToken && CurrentToken->is(tok::comment))
+ next();
+ }
+
+ // Simplified parser for TableGen Value. Returns true on success.
+ // It consists of SimpleValues, SimpleValues with Suffixes, and Value followed
+ // by '#', paste operator.
+ // There also exists the case the Value is parsed as NameValue.
+ // In this case, the Value ends if '{' is found.
+ bool parseTableGenValue(bool ParseNameMode = false) {
+ if (!CurrentToken)
+ return false;
+ while (CurrentToken->is(tok::comment))
+ next();
+ if (!parseTableGenSimpleValue())
+ return false;
+ if (!CurrentToken)
+ return true;
+ // Value "#" [Value]
+ if (CurrentToken->is(tok::hash)) {
+ if (CurrentToken->Next &&
+ CurrentToken->Next->isOneOf(tok::colon, tok::semi, tok::l_brace)) {
+ // Trailing paste operator.
+ // These are only the allowed cases in TGParser::ParseValue().
+ CurrentToken->setType(TT_TableGenTrailingPasteOperator);
+ next();
+ return true;
+ }
+ FormatToken *HashTok = CurrentToken;
+ skipToNextNonComment();
+ HashTok->setType(TT_Unknown);
+ if (!parseTableGenValue(ParseNameMode))
+ return false;
+ }
+ // In name mode, '{' is regarded as the end of the value.
+ // See TGParser::ParseValue in TGParser.cpp
+ if (ParseNameMode && CurrentToken->is(tok::l_brace))
+ return true;
+ // These tokens indicates this is a value with suffixes.
+ if (CurrentToken->isOneOf(tok::l_brace, tok::l_square, tok::period)) {
+ CurrentToken->setType(TT_TableGenValueSuffix);
+ FormatToken *Suffix = CurrentToken;
+ skipToNextNonComment();
+ if (Suffix->is(tok::l_square))
+ return parseSquare();
+ if (Suffix->is(tok::l_brace)) {
+ Scopes.push_back(getScopeType(*Suffix));
+ return parseBrace();
+ }
+ }
+ return true;
+ }
+
+ // TokVarName ::= "$" ualpha (ualpha | "0"..."9")*
+ // Appears as a part of DagArg.
+ // This does not change the current token on fail.
+ bool tryToParseTableGenTokVar() {
+ if (!CurrentToken)
+ return false;
+ if (CurrentToken->is(tok::identifier) &&
+ CurrentToken->TokenText.front() == '$') {
+ skipToNextNonComment();
+ return true;
+ }
+ return false;
+ }
+
+ // DagArg ::= Value [":" TokVarName] | TokVarName
+ // Appears as a part of SimpleValue6.
+ bool parseTableGenDAGArg() {
+ if (tryToParseTableGenTokVar())
+ return true;
+ if (parseTableGenValue()) {
+ if (CurrentToken && CurrentToken->is(tok::colon)) {
+ CurrentToken->setType(TT_TableGenDAGArgListColon);
+ skipToNextNonComment();
+ return tryToParseTableGenTokVar();
+ }
+ return true;
+ }
+ return false;
+ }
+
+ // SimpleValue6 ::= "(" DagArg [DagArgList] ")"
+ // This parses SimpleValue 6's inside part of "(" ")"
+ bool parseTableGenDAGArgAndList(FormatToken *Opener) {
+ if (!parseTableGenDAGArg())
+ return false;
+ // Parse the [DagArgList] part
+ bool FirstDAGArgListElm = true;
+ while (CurrentToken) {
+ if (!FirstDAGArgListElm && CurrentToken->is(tok::comma)) {
+ CurrentToken->setType(TT_TableGenDAGArgListComma);
+ skipToNextNonComment();
+ }
+ if (CurrentToken && CurrentToken->is(tok::r_paren)) {
+ CurrentToken->setType(TT_TableGenDAGArgCloser);
+ Opener->MatchingParen = CurrentToken;
+ CurrentToken->MatchingParen = Opener;
+ skipToNextNonComment();
+ return true;
+ }
+ if (!parseTableGenDAGArg())
+ return false;
+ FirstDAGArgListElm = false;
+ }
+ return false;
+ }
+
+ bool parseTableGenSimpleValue() {
+ assert(Style.isTableGen());
+ if (!CurrentToken)
+ return false;
+ FormatToken *Tok = CurrentToken;
+ skipToNextNonComment();
+ // SimpleValue 1, 2, 3: Literals
+ if (Tok->isOneOf(tok::numeric_constant, tok::string_literal,
+ TT_TableGenMultiLineString, tok::kw_true, tok::kw_false,
+ tok::question, tok::kw_int)) {
+ return true;
+ }
+ // SimpleValue 4: ValueList, Type
+ if (Tok->is(tok::l_brace)) {
+ Scopes.push_back(getScopeType(*Tok));
+ return parseBrace();
+ }
+ // SimpleValue 5: List initializer
+ if (Tok->is(tok::l_square)) {
+ Tok->setType(TT_TableGenListOpener);
+ if (!parseSquare())
+ return false;
+ if (Tok->is(tok::less)) {
+ CurrentToken->setType(TT_TemplateOpener);
+ return parseAngle();
+ }
+ return true;
+ }
+ // SimpleValue 6: DAGArg [DAGArgList]
+ // SimpleValue6 ::= "(" DagArg [DagArgList] ")"
+ if (Tok->is(tok::l_paren)) {
+ Tok->setType(TT_TableGenDAGArgOpener);
+ return parseTableGenDAGArgAndList(Tok);
+ }
+ // SimpleValue 9: Bang operator
+ if (Tok->is(TT_TableGenBangOperator)) {
+ if (CurrentToken && CurrentToken->is(tok::less)) {
+ CurrentToken->setType(TT_TemplateOpener);
+ skipToNextNonComment();
+ if (!parseAngle())
+ return false;
+ }
+ if (!CurrentToken || CurrentToken->isNot(tok::l_paren))
+ return false;
+ skipToNextNonComment();
+ // FIXME: Hack using inheritance to child context
+ Contexts.back().IsTableGenBangOpe = true;
+ bool Result = parseParens();
+ Contexts.back().IsTableGenBangOpe = false;
+ return Result;
+ }
+ // SimpleValue 9: Cond operator
+ if (Tok->is(TT_TableGenCondOperator)) {
+ Tok = CurrentToken;
+ skipToNextNonComment();
+ if (!Tok || Tok->isNot(tok::l_paren))
+ return false;
+ bool Result = parseParens();
+ return Result;
+ }
+ // We have to check identifier at the last because the kind of bang/cond
+ // operators are also identifier.
+ // SimpleValue 7: Identifiers
+ if (Tok->is(tok::identifier)) {
+ // SimpleValue 8: Anonymous record
+ if (CurrentToken && CurrentToken->is(tok::less)) {
+ CurrentToken->setType(TT_TemplateOpener);
+ skipToNextNonComment();
+ return parseAngle();
+ }
+ return true;
+ }
+
+ return false;
+ }
+
bool couldBeInStructArrayInitializer() const {
if (Contexts.size() < 2)
return false;
@@ -880,6 +1132,8 @@ private:
OpeningBrace.getPreviousNonComment()->isNot(Keywords.kw_apostrophe))) {
Contexts.back().VerilogMayBeConcatenation = true;
}
+ if (Style.isTableGen())
+ Contexts.back().ColonIsDictLiteral = false;
unsigned CommaCount = 0;
while (CurrentToken) {
@@ -906,7 +1160,7 @@ private:
FormatToken *Previous = CurrentToken->getPreviousNonComment();
if (Previous->is(TT_JsTypeOptionalQuestion))
Previous = Previous->getPreviousNonComment();
- if ((CurrentToken->is(tok::colon) &&
+ if ((CurrentToken->is(tok::colon) && !Style.isTableGen() &&
(!Contexts.back().ColonIsDictLiteral || !Style.isCpp())) ||
Style.isProto()) {
OpeningBrace.setType(TT_DictLiteral);
@@ -915,10 +1169,12 @@ private:
Previous->setType(TT_SelectorName);
}
}
- if (CurrentToken->is(tok::colon) && OpeningBrace.is(TT_Unknown))
+ if (CurrentToken->is(tok::colon) && OpeningBrace.is(TT_Unknown) &&
+ !Style.isTableGen()) {
OpeningBrace.setType(TT_DictLiteral);
- else if (Style.isJavaScript())
+ } else if (Style.isJavaScript()) {
OpeningBrace.overwriteFixedType(TT_DictLiteral);
+ }
}
if (CurrentToken->is(tok::comma)) {
if (Style.isJavaScript())
@@ -989,6 +1245,9 @@ private:
// operators.
if (Tok->is(TT_VerilogTableItem))
return true;
+ // Multi-line string itself is a single annotated token.
+ if (Tok->is(TT_TableGenMultiLineString))
+ return true;
switch (Tok->Tok.getKind()) {
case tok::plus:
case tok::minus:
@@ -1119,6 +1378,10 @@ private:
Tok->setType(TT_ObjCMethodExpr);
} else if (Contexts.back().ContextKind == tok::l_paren &&
!Line.InPragmaDirective) {
+ if (Style.isTableGen() && Contexts.back().IsTableGenDAGArg) {
+ Tok->setType(TT_TableGenDAGArgListColon);
+ break;
+ }
Tok->setType(TT_InlineASMColon);
}
break;
@@ -1130,6 +1393,14 @@ private:
Tok->setType(TT_JsTypeOperator);
break;
case tok::kw_if:
+ if (Style.isTableGen()) {
+ // In TableGen it has the form 'if' <value> 'then'.
+ if (!parseTableGenValue())
+ return false;
+ if (CurrentToken && CurrentToken->is(Keywords.kw_then))
+ next(); // skip then
+ break;
+ }
if (CurrentToken &&
CurrentToken->isOneOf(tok::kw_constexpr, tok::identifier)) {
next();
@@ -1235,6 +1506,8 @@ private:
}
break;
case tok::l_square:
+ if (Style.isTableGen())
+ Tok->setType(TT_TableGenListOpener);
if (!parseSquare())
return false;
break;
@@ -1264,6 +1537,8 @@ private:
if (Previous && Previous->getType() != TT_DictLiteral)
Previous->setType(TT_SelectorName);
}
+ if (Style.isTableGen())
+ Tok->setType(TT_TemplateOpener);
} else {
Tok->setType(TT_BinaryOperator);
NonTemplateLess.insert(Tok);
@@ -1423,11 +1698,28 @@ private:
if (!Tok->getPreviousNonComment())
Line.IsContinuation = true;
}
+ if (Style.isTableGen()) {
+ if (Tok->is(Keywords.kw_assert)) {
+ if (!parseTableGenValue())
+ return false;
+ } else if (Tok->isOneOf(Keywords.kw_def, Keywords.kw_defm) &&
+ (!Tok->Next ||
+ !Tok->Next->isOneOf(tok::colon, tok::l_brace))) {
+ // The case NameValue appears.
+ if (!parseTableGenValue(true))
+ return false;
+ }
+ }
break;
case tok::arrow:
if (Tok->Previous && Tok->Previous->is(tok::kw_noexcept))
Tok->setType(TT_TrailingReturnArrow);
break;
+ case tok::equal:
+ // In TableGen, there must be a value after "=";
+ if (Style.isTableGen() && !parseTableGenValue())
+ return false;
+ break;
default:
break;
}
@@ -1757,6 +2049,9 @@ private:
// Whether the braces may mean concatenation instead of structure or array
// literal.
bool VerilogMayBeConcatenation = false;
+ bool IsTableGenDAGArg = false;
+ bool IsTableGenBangOpe = false;
+ bool IsTableGenCondOpe = false;
enum {
Unknown,
// Like the part after `:` in a constructor.
@@ -2061,6 +2356,9 @@ private:
// In JavaScript, `interface X { foo?(): bar; }` is an optional method
// on the interface, not a ternary expression.
Current.setType(TT_JsTypeOptionalQuestion);
+ } else if (Style.isTableGen()) {
+ // In TableGen, '?' is just an identifier like token.
+ Current.setType(TT_Unknown);
} else {
Current.setType(TT_ConditionalExpr);
}
@@ -2239,6 +2537,9 @@ private:
// keywords such as let and def* defines names.
if (Keywords.isTableGenDefinition(*PreviousNotConst))
return true;
+ // Otherwise C++ style declarations is available only inside the brace.
+ if (Contexts.back().ContextKind != tok::l_brace)
+ return false;
}
bool IsPPKeyword = PreviousNotConst->is(tok::identifier) &&
@@ -5184,8 +5485,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line,
// concept ...
if (Right.is(tok::kw_concept))
return Style.BreakBeforeConceptDeclarations == FormatStyle::BBCDS_Always;
- return Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Yes ||
- (Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Leave &&
+ return Style.BreakTemplateDeclarations == FormatStyle::BTDS_Yes ||
+ (Style.BreakTemplateDeclarations == FormatStyle::BTDS_Leave &&
Right.NewlinesBefore > 0);
}
if (Left.ClosesRequiresClause && Right.isNot(tok::semi)) {
@@ -5620,7 +5921,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
if (Right.is(TT_RequiresClause))
return true;
if (Left.ClosesTemplateDeclaration) {
- return Style.AlwaysBreakTemplateDeclarations != FormatStyle::BTDS_Leave ||
+ return Style.BreakTemplateDeclarations != FormatStyle::BTDS_Leave ||
Right.NewlinesBefore > 0;
}
if (Left.is(TT_FunctionAnnotationRParen))
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index d4f9b3f9df52..8f6453a25d9d 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -495,12 +495,15 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
do {
NextTok = Tokens->getNextToken();
} while (NextTok->is(tok::comment));
- while (NextTok->is(tok::hash) && !Line->InMacroBody) {
- NextTok = Tokens->getNextToken();
- do {
+ if (!Style.isTableGen()) {
+ // InTableGen, '#' is like binary operator. Not a preprocessor directive.
+ while (NextTok->is(tok::hash) && !Line->InMacroBody) {
NextTok = Tokens->getNextToken();
- } while (NextTok->is(tok::comment) ||
- (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof)));
+ do {
+ NextTok = Tokens->getNextToken();
+ } while (NextTok->is(tok::comment) ||
+ (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof)));
+ }
}
switch (Tok->Tok.getKind()) {
@@ -2515,7 +2518,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
parseChildBlock();
break;
case tok::r_paren:
- if (!MightBeStmtExpr &&
+ if (!MightBeStmtExpr && !Line->InMacroBody &&
Style.RemoveParentheses > FormatStyle::RPS_Leave) {
const auto *Prev = LeftParen->Previous;
const auto *Next = Tokens->peekNextToken();
diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp
index 19abcac2befb..369816e89e1d 100644
--- a/clang/lib/Frontend/DependencyFile.cpp
+++ b/clang/lib/Frontend/DependencyFile.cpp
@@ -66,7 +66,8 @@ struct DepCollectorPPCallbacks : public PPCallbacks {
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
if (!File)
DepCollector.maybeAddDependency(FileName, /*FromModule*/ false,
diff --git a/clang/lib/Frontend/DependencyGraph.cpp b/clang/lib/Frontend/DependencyGraph.cpp
index b471471f3528..20e5f233e224 100644
--- a/clang/lib/Frontend/DependencyGraph.cpp
+++ b/clang/lib/Frontend/DependencyGraph.cpp
@@ -49,7 +49,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void EndOfMainFile() override {
@@ -68,8 +69,8 @@ void clang::AttachDependencyGraphGen(Preprocessor &PP, StringRef OutputFile,
void DependencyGraphCallback::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
if (!File)
return;
diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp
index 939e611e5489..b88cb60ebdd2 100644
--- a/clang/lib/Frontend/ModuleDependencyCollector.cpp
+++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp
@@ -55,7 +55,8 @@ struct ModuleDependencyPPCallbacks : public PPCallbacks {
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
if (!File)
return;
diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp
index 62373b23b82e..9b0ef30a1412 100644
--- a/clang/lib/Frontend/PrecompiledPreamble.cpp
+++ b/clang/lib/Frontend/PrecompiledPreamble.cpp
@@ -98,7 +98,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
// File is std::nullopt if it wasn't found.
// (We have some false negatives if PP recovered e.g. <foo> -> "foo")
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index 7f5f66906823..a26d2c3ab858 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -153,7 +153,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void Ident(SourceLocation Loc, StringRef str) override;
void PragmaMessage(SourceLocation Loc, StringRef Namespace,
@@ -401,8 +402,8 @@ void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc,
void PrintPPOutputPPCallbacks::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
// In -dI mode, dump #include directives prior to dumping their content or
// interpretation. Similar for -fkeep-system-includes.
if (DumpIncludeDirectives || (KeepSystemIncludes && isSystem(FileType))) {
@@ -418,14 +419,14 @@ void PrintPPOutputPPCallbacks::InclusionDirective(
}
// When preprocessing, turn implicit imports into module import pragmas.
- if (Imported) {
+ if (ModuleImported) {
switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) {
case tok::pp_include:
case tok::pp_import:
case tok::pp_include_next:
MoveToLine(HashLoc, /*RequireStartOfLine=*/true);
*OS << "#pragma clang module import "
- << Imported->getFullModuleName(true)
+ << SuggestedModule->getFullModuleName(true)
<< " /* clang -E: implicit import for "
<< "#" << PP.getSpelling(IncludeTok) << " "
<< (IsAngled ? '<' : '"') << FileName << (IsAngled ? '>' : '"')
diff --git a/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp b/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp
index b6b37461089e..1462058003b3 100644
--- a/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp
+++ b/clang/lib/Frontend/Rewrite/InclusionRewriter.cpp
@@ -75,7 +75,8 @@ private:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override;
void If(SourceLocation Loc, SourceRange ConditionRange,
ConditionValueKind ConditionValue) override;
@@ -189,9 +190,10 @@ void InclusionRewriter::InclusionDirective(
StringRef /*FileName*/, bool /*IsAngled*/,
CharSourceRange /*FilenameRange*/, OptionalFileEntryRef /*File*/,
StringRef /*SearchPath*/, StringRef /*RelativePath*/,
- const Module *Imported, SrcMgr::CharacteristicKind FileType) {
- if (Imported) {
- auto P = ModuleIncludes.insert(std::make_pair(HashLoc, Imported));
+ const Module *SuggestedModule, bool ModuleImported,
+ SrcMgr::CharacteristicKind FileType) {
+ if (ModuleImported) {
+ auto P = ModuleIncludes.insert(std::make_pair(HashLoc, SuggestedModule));
(void)P;
assert(P.second && "Unexpected revisitation of the same include directive");
} else
diff --git a/clang/lib/Headers/ia32intrin.h b/clang/lib/Headers/ia32intrin.h
index 1b979770e196..8e65f232a0de 100644
--- a/clang/lib/Headers/ia32intrin.h
+++ b/clang/lib/Headers/ia32intrin.h
@@ -26,8 +26,8 @@
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif
-/// Find the first set bit starting from the lsb. Result is undefined if
-/// input is 0.
+/// Finds the first set bit starting from the least significant bit. The result
+/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
@@ -43,8 +43,8 @@ __bsfd(int __A) {
return __builtin_ctz((unsigned int)__A);
}
-/// Find the first set bit starting from the msb. Result is undefined if
-/// input is 0.
+/// Finds the first set bit starting from the most significant bit. The result
+/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
@@ -90,8 +90,8 @@ _bswap(int __A) {
return (int)__builtin_bswap32((unsigned int)__A);
}
-/// Find the first set bit starting from the lsb. Result is undefined if
-/// input is 0.
+/// Finds the first set bit starting from the least significant bit. The result
+/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
@@ -108,8 +108,8 @@ _bswap(int __A) {
/// \see __bsfd
#define _bit_scan_forward(A) __bsfd((A))
-/// Find the first set bit starting from the msb. Result is undefined if
-/// input is 0.
+/// Finds the first set bit starting from the most significant bit. The result
+/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
@@ -127,8 +127,8 @@ _bswap(int __A) {
#define _bit_scan_reverse(A) __bsrd((A))
#ifdef __x86_64__
-/// Find the first set bit starting from the lsb. Result is undefined if
-/// input is 0.
+/// Finds the first set bit starting from the least significant bit. The result
+/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
@@ -143,8 +143,8 @@ __bsfq(long long __A) {
return (long long)__builtin_ctzll((unsigned long long)__A);
}
-/// Find the first set bit starting from the msb. Result is undefined if
-/// input is 0.
+/// Finds the first set bit starting from the most significant bit. The result
+/// is undefined if input is 0.
///
/// \headerfile <x86intrin.h>
///
@@ -159,7 +159,7 @@ __bsrq(long long __A) {
return 63 - __builtin_clzll((unsigned long long)__A);
}
-/// Swaps the bytes in the input. Converting little endian to big endian or
+/// Swaps the bytes in the input, converting little endian to big endian or
/// vice versa.
///
/// \headerfile <x86intrin.h>
@@ -175,7 +175,7 @@ __bswapq(long long __A) {
return (long long)__builtin_bswap64((unsigned long long)__A);
}
-/// Swaps the bytes in the input. Converting little endian to big endian or
+/// Swaps the bytes in the input, converting little endian to big endian or
/// vice versa.
///
/// \headerfile <x86intrin.h>
@@ -198,7 +198,7 @@ __bswapq(long long __A) {
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
-/// a sequence of arithmetic and logic ops to calculate it.
+/// sequence of arithmetic and logic operations to calculate it.
///
/// \param __A
/// An unsigned 32-bit integer operand.
@@ -220,7 +220,7 @@ __popcntd(unsigned int __A)
/// \endcode
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
-/// a sequence of arithmetic and logic ops to calculate it.
+/// sequence of arithmetic and logic operations to calculate it.
///
/// \param A
/// An unsigned 32-bit integer operand.
@@ -235,7 +235,7 @@ __popcntd(unsigned int __A)
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
-/// a sequence of arithmetic and logic ops to calculate it.
+/// sequence of arithmetic and logic operations to calculate it.
///
/// \param __A
/// An unsigned 64-bit integer operand.
@@ -257,7 +257,7 @@ __popcntq(unsigned long long __A)
/// \endcode
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
-/// a sequence of arithmetic and logic ops to calculate it.
+/// sequence of arithmetic and logic operations to calculate it.
///
/// \param A
/// An unsigned 64-bit integer operand.
@@ -268,7 +268,7 @@ __popcntq(unsigned long long __A)
#endif /* __x86_64__ */
#ifdef __x86_64__
-/// Returns the program status and control \c RFLAGS register with the \c VM
+/// Returns the program status-and-control \c RFLAGS register with the \c VM
/// and \c RF flags cleared.
///
/// \headerfile <x86intrin.h>
@@ -282,7 +282,7 @@ __readeflags(void)
return __builtin_ia32_readeflags_u64();
}
-/// Writes the specified value to the program status and control \c RFLAGS
+/// Writes the specified value to the program status-and-control \c RFLAGS
/// register. Reserved bits are not affected.
///
/// \headerfile <x86intrin.h>
@@ -298,7 +298,7 @@ __writeeflags(unsigned long long __f)
}
#else /* !__x86_64__ */
-/// Returns the program status and control \c EFLAGS register with the \c VM
+/// Returns the program status-and-control \c EFLAGS register with the \c VM
/// and \c RF flags cleared.
///
/// \headerfile <x86intrin.h>
@@ -312,7 +312,7 @@ __readeflags(void)
return __builtin_ia32_readeflags_u32();
}
-/// Writes the specified value to the program status and control \c EFLAGS
+/// Writes the specified value to the program status-and-control \c EFLAGS
/// register. Reserved bits are not affected.
///
/// \headerfile <x86intrin.h>
@@ -328,7 +328,7 @@ __writeeflags(unsigned int __f)
}
#endif /* !__x86_64__ */
-/// Cast a 32-bit float value to a 32-bit unsigned integer value.
+/// Casts a 32-bit float value to a 32-bit unsigned integer value.
///
/// \headerfile <x86intrin.h>
///
@@ -337,13 +337,13 @@ __writeeflags(unsigned int __f)
///
/// \param __A
/// A 32-bit float value.
-/// \returns a 32-bit unsigned integer containing the converted value.
+/// \returns A 32-bit unsigned integer containing the converted value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
_castf32_u32(float __A) {
return __builtin_bit_cast(unsigned int, __A);
}
-/// Cast a 64-bit float value to a 64-bit unsigned integer value.
+/// Casts a 64-bit float value to a 64-bit unsigned integer value.
///
/// \headerfile <x86intrin.h>
///
@@ -352,13 +352,13 @@ _castf32_u32(float __A) {
///
/// \param __A
/// A 64-bit float value.
-/// \returns a 64-bit unsigned integer containing the converted value.
+/// \returns A 64-bit unsigned integer containing the converted value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
_castf64_u64(double __A) {
return __builtin_bit_cast(unsigned long long, __A);
}
-/// Cast a 32-bit unsigned integer value to a 32-bit float value.
+/// Casts a 32-bit unsigned integer value to a 32-bit float value.
///
/// \headerfile <x86intrin.h>
///
@@ -367,13 +367,13 @@ _castf64_u64(double __A) {
///
/// \param __A
/// A 32-bit unsigned integer value.
-/// \returns a 32-bit float value containing the converted value.
+/// \returns A 32-bit float value containing the converted value.
static __inline__ float __DEFAULT_FN_ATTRS_CAST
_castu32_f32(unsigned int __A) {
return __builtin_bit_cast(float, __A);
}
-/// Cast a 64-bit unsigned integer value to a 64-bit float value.
+/// Casts a 64-bit unsigned integer value to a 64-bit float value.
///
/// \headerfile <x86intrin.h>
///
@@ -382,7 +382,7 @@ _castu32_f32(unsigned int __A) {
///
/// \param __A
/// A 64-bit unsigned integer value.
-/// \returns a 64-bit float value containing the converted value.
+/// \returns A 64-bit float value containing the converted value.
static __inline__ double __DEFAULT_FN_ATTRS_CAST
_castu64_f64(unsigned long long __A) {
return __builtin_bit_cast(double, __A);
@@ -470,7 +470,7 @@ __crc32q(unsigned long long __C, unsigned long long __D)
}
#endif /* __x86_64__ */
-/// Reads the specified performance monitoring counter. Refer to your
+/// Reads the specified performance-monitoring counter. Refer to your
/// processor's documentation to determine which performance counters are
/// supported.
///
@@ -487,7 +487,7 @@ __rdpmc(int __A) {
return __builtin_ia32_rdpmc(__A);
}
-/// Reads the processor's time stamp counter and the \c IA32_TSC_AUX MSR
+/// Reads the processor's time-stamp counter and the \c IA32_TSC_AUX MSR
/// \c (0xc0000103).
///
/// \headerfile <x86intrin.h>
@@ -495,14 +495,14 @@ __rdpmc(int __A) {
/// This intrinsic corresponds to the \c RDTSCP instruction.
///
/// \param __A
-/// Address of where to store the 32-bit \c IA32_TSC_AUX value.
-/// \returns The 64-bit value of the time stamp counter.
+/// The address of where to store the 32-bit \c IA32_TSC_AUX value.
+/// \returns The 64-bit value of the time-stamp counter.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdtscp(unsigned int *__A) {
return __builtin_ia32_rdtscp(__A);
}
-/// Reads the processor's time stamp counter.
+/// Reads the processor's time-stamp counter.
///
/// \headerfile <x86intrin.h>
///
@@ -512,7 +512,7 @@ __rdtscp(unsigned int *__A) {
///
/// This intrinsic corresponds to the \c RDTSC instruction.
///
-/// \returns The 64-bit value of the time stamp counter.
+/// \returns The 64-bit value of the time-stamp counter.
#define _rdtsc() __rdtsc()
/// Reads the specified performance monitoring counter. Refer to your
diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index 521c473dd169..9c103d98af8c 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -16,7 +16,7 @@
* Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
* explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
* to the clang resource header until that is fully supported. The
- * `stdatomic.h` header requires C++ 23 or newer.
+ * `stdatomic.h` header requires C++23 or newer.
*/
#if __STDC_HOSTED__ && \
__has_include_next(<stdatomic.h>) && \
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 980f865cf24c..0971daa1f366 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -369,7 +369,7 @@ static void skipBlockComment(const char *&First, const char *const End) {
}
}
-/// \returns True if the current single quotation mark character is a C++ 14
+/// \returns True if the current single quotation mark character is a C++14
/// digit separator.
static bool isQuoteCppDigitSeparator(const char *const Start,
const char *const Cur,
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index a980f4bcbae1..0b22139ebe81 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -2253,26 +2253,27 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// FIXME: We do not have a good way to disambiguate C++ clang modules from
// C++ standard modules (other than use/non-use of Header Units).
- Module *SM = SuggestedModule.getModule();
- bool MaybeTranslateInclude =
- Action == Enter && File && SM && !SM->isForBuilding(getLangOpts());
+ Module *ModuleToImport = SuggestedModule.getModule();
+
+ bool MaybeTranslateInclude = Action == Enter && File && ModuleToImport &&
+ !ModuleToImport->isForBuilding(getLangOpts());
// Maybe a usable Header Unit
bool UsableHeaderUnit = false;
- if (getLangOpts().CPlusPlusModules && SM && SM->isHeaderUnit()) {
+ if (getLangOpts().CPlusPlusModules && ModuleToImport &&
+ ModuleToImport->isHeaderUnit()) {
if (TrackGMFState.inGMF() || IsImportDecl)
UsableHeaderUnit = true;
else if (!IsImportDecl) {
// This is a Header Unit that we do not include-translate
- SuggestedModule = ModuleMap::KnownHeader();
- SM = nullptr;
+ ModuleToImport = nullptr;
}
}
// Maybe a usable clang header module.
bool UsableClangHeaderModule =
- (getLangOpts().CPlusPlusModules || getLangOpts().Modules) && SM &&
- !SM->isHeaderUnit();
+ (getLangOpts().CPlusPlusModules || getLangOpts().Modules) &&
+ ModuleToImport && !ModuleToImport->isHeaderUnit();
// Determine whether we should try to import the module for this #include, if
// there is one. Don't do so if precompiled module support is disabled or we
@@ -2282,12 +2283,11 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// unavailable, diagnose the situation and bail out.
// FIXME: Remove this; loadModule does the same check (but produces
// slightly worse diagnostics).
- if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(),
- *SuggestedModule.getModule(),
+ if (checkModuleIsAvailable(getLangOpts(), getTargetInfo(), *ModuleToImport,
getDiagnostics())) {
Diag(FilenameTok.getLocation(),
diag::note_implicit_top_level_module_import_here)
- << SuggestedModule.getModule()->getTopLevelModuleName();
+ << ModuleToImport->getTopLevelModuleName();
return {ImportAction::None};
}
@@ -2295,7 +2295,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// FIXME: Should we have a second loadModule() overload to avoid this
// extra lookup step?
SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 2> Path;
- for (Module *Mod = SM; Mod; Mod = Mod->Parent)
+ for (Module *Mod = ModuleToImport; Mod; Mod = Mod->Parent)
Path.push_back(std::make_pair(getIdentifierInfo(Mod->Name),
FilenameTok.getLocation()));
std::reverse(Path.begin(), Path.end());
@@ -2306,12 +2306,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// Load the module to import its macros. We'll make the declarations
// visible when the parser gets here.
- // FIXME: Pass SuggestedModule in here rather than converting it to a path
+ // FIXME: Pass ModuleToImport in here rather than converting it to a path
// and making the module loader convert it back again.
ModuleLoadResult Imported = TheModuleLoader.loadModule(
IncludeTok.getLocation(), Path, Module::Hidden,
/*IsInclusionDirective=*/true);
- assert((Imported == nullptr || Imported == SuggestedModule.getModule()) &&
+ assert((Imported == nullptr || Imported == ModuleToImport) &&
"the imported module is different than the suggested one");
if (Imported) {
@@ -2323,8 +2323,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// was in the directory of an umbrella header, for instance), but no
// actual module containing it exists (because the umbrella header is
// incomplete). Treat this as a textual inclusion.
- SuggestedModule = ModuleMap::KnownHeader();
- SM = nullptr;
+ ModuleToImport = nullptr;
} else if (Imported.isConfigMismatch()) {
// On a configuration mismatch, enter the header textually. We still know
// that it's part of the corresponding module.
@@ -2365,7 +2364,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// this file will have no effect.
if (Action == Enter && File &&
!HeaderInfo.ShouldEnterIncludeFile(*this, *File, EnterOnce,
- getLangOpts().Modules, SM,
+ getLangOpts().Modules, ModuleToImport,
IsFirstIncludeOfFile)) {
// C++ standard modules:
// If we are not in the GMF, then we textually include only
@@ -2380,7 +2379,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
if (UsableHeaderUnit && !getLangOpts().CompilingPCH)
Action = TrackGMFState.inGMF() ? Import : Skip;
else
- Action = (SuggestedModule && !getLangOpts().CompilingPCH) ? Import : Skip;
+ Action = (ModuleToImport && !getLangOpts().CompilingPCH) ? Import : Skip;
}
// Check for circular inclusion of the main file.
@@ -2400,8 +2399,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// FIXME: Use a different callback for a pp-import?
Callbacks->InclusionDirective(HashLoc, IncludeTok, LookupFilename, isAngled,
FilenameRange, File, SearchPath, RelativePath,
- Action == Import ? SuggestedModule.getModule()
- : nullptr,
+ SuggestedModule.getModule(), Action == Import,
FileCharacter);
if (Action == Skip && File)
Callbacks->FileSkipped(*File, FilenameTok, FileCharacter);
@@ -2412,7 +2410,7 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// If this is a C++20 pp-import declaration, diagnose if we didn't find any
// module corresponding to the named header.
- if (IsImportDecl && !SuggestedModule) {
+ if (IsImportDecl && !ModuleToImport) {
Diag(FilenameTok, diag::err_header_import_not_header_unit)
<< OriginalFilename << File->getName();
return {ImportAction::None};
@@ -2517,8 +2515,8 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
switch (Action) {
case Skip:
// If we don't need to enter the file, stop now.
- if (SM)
- return {ImportAction::SkippedModuleImport, SM};
+ if (ModuleToImport)
+ return {ImportAction::SkippedModuleImport, ModuleToImport};
return {ImportAction::None};
case IncludeLimitReached:
@@ -2528,15 +2526,15 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
case Import: {
// If this is a module import, make it visible if needed.
- assert(SM && "no module to import");
+ assert(ModuleToImport && "no module to import");
- makeModuleVisible(SM, EndLoc);
+ makeModuleVisible(ModuleToImport, EndLoc);
if (IncludeTok.getIdentifierInfo()->getPPKeywordID() ==
tok::pp___include_macros)
return {ImportAction::None};
- return {ImportAction::ModuleImport, SM};
+ return {ImportAction::ModuleImport, ModuleToImport};
}
case Enter:
@@ -2573,13 +2571,14 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// Determine if we're switching to building a new submodule, and which one.
// This does not apply for C++20 modules header units.
- if (SM && !SM->isHeaderUnit()) {
- if (SM->getTopLevelModule()->ShadowingModule) {
+ if (ModuleToImport && !ModuleToImport->isHeaderUnit()) {
+ if (ModuleToImport->getTopLevelModule()->ShadowingModule) {
// We are building a submodule that belongs to a shadowed module. This
// means we find header files in the shadowed module.
- Diag(SM->DefinitionLoc, diag::err_module_build_shadowed_submodule)
- << SM->getFullModuleName();
- Diag(SM->getTopLevelModule()->ShadowingModule->DefinitionLoc,
+ Diag(ModuleToImport->DefinitionLoc,
+ diag::err_module_build_shadowed_submodule)
+ << ModuleToImport->getFullModuleName();
+ Diag(ModuleToImport->getTopLevelModule()->ShadowingModule->DefinitionLoc,
diag::note_previous_definition);
return {ImportAction::None};
}
@@ -2591,21 +2590,22 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport(
// that behaves the same as the header would behave in a compilation using
// that PCH, which means we should enter the submodule. We need to teach
// the AST serialization layer to deal with the resulting AST.
- if (getLangOpts().CompilingPCH && SM->isForBuilding(getLangOpts()))
+ if (getLangOpts().CompilingPCH &&
+ ModuleToImport->isForBuilding(getLangOpts()))
return {ImportAction::None};
assert(!CurLexerSubmodule && "should not have marked this as a module yet");
- CurLexerSubmodule = SM;
+ CurLexerSubmodule = ModuleToImport;
// Let the macro handling code know that any future macros are within
// the new submodule.
- EnterSubmodule(SM, EndLoc, /*ForPragma*/ false);
+ EnterSubmodule(ModuleToImport, EndLoc, /*ForPragma*/ false);
// Let the parser know that any future declarations are within the new
// submodule.
// FIXME: There's no point doing this if we're handling a #__include_macros
// directive.
- return {ImportAction::ModuleBegin, SM};
+ return {ImportAction::ModuleBegin, ModuleToImport};
}
assert(!IsImportDecl && "failed to diagnose missing module for import decl");
diff --git a/clang/lib/Lex/PreprocessingRecord.cpp b/clang/lib/Lex/PreprocessingRecord.cpp
index aab6a2bed89d..be5aac7ef31b 100644
--- a/clang/lib/Lex/PreprocessingRecord.cpp
+++ b/clang/lib/Lex/PreprocessingRecord.cpp
@@ -472,8 +472,8 @@ void PreprocessingRecord::MacroUndefined(const Token &Id,
void PreprocessingRecord::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
InclusionDirective::InclusionKind Kind = InclusionDirective::Include;
switch (IncludeTok.getIdentifierInfo()->getPPKeywordID()) {
@@ -506,10 +506,9 @@ void PreprocessingRecord::InclusionDirective(
EndLoc = EndLoc.getLocWithOffset(-1); // the InclusionDirective expects
// a token range.
}
- clang::InclusionDirective *ID =
- new (*this) clang::InclusionDirective(*this, Kind, FileName, !IsAngled,
- (bool)Imported, File,
- SourceRange(HashLoc, EndLoc));
+ clang::InclusionDirective *ID = new (*this) clang::InclusionDirective(
+ *this, Kind, FileName, !IsAngled, ModuleImported, File,
+ SourceRange(HashLoc, EndLoc));
addPreprocessedEntity(ID);
}
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index 1fee9f82b3e6..e099d077198d 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -54,7 +54,7 @@ OpenACCDirectiveKindEx getOpenACCDirectiveKind(Token Tok) {
.Case("declare", OpenACCDirectiveKind::Declare)
.Case("init", OpenACCDirectiveKind::Init)
.Case("shutdown", OpenACCDirectiveKind::Shutdown)
- .Case("set", OpenACCDirectiveKind::Shutdown)
+ .Case("set", OpenACCDirectiveKind::Set)
.Case("update", OpenACCDirectiveKind::Update)
.Case("wait", OpenACCDirectiveKind::Wait)
.Default(OpenACCDirectiveKind::Invalid);
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 78b9f324e139..8239ba49429d 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -2297,7 +2297,8 @@ public:
void handleUnsafeVariableGroup(const VarDecl *Variable,
const VariableGroupsManager &VarGrpMgr,
- FixItList &&Fixes, const Decl *D) override {
+ FixItList &&Fixes, const Decl *D,
+ const FixitStrategy &VarTargetTypes) override {
assert(!SuggestSuggestions &&
"Unsafe buffer usage fixits displayed without suggestions!");
S.Diag(Variable->getLocation(), diag::warn_unsafe_buffer_variable)
@@ -2312,7 +2313,18 @@ public:
// NOT explain how the variables are grouped as the reason is non-trivial
// and irrelavant to users' experience:
const auto VarGroupForVD = VarGrpMgr.getGroupOfVar(Variable, &BriefMsg);
- unsigned FixItStrategy = 0; // For now we only have 'std::span' strategy
+ unsigned FixItStrategy = 0;
+ switch (VarTargetTypes.lookup(Variable)) {
+ case clang::FixitStrategy::Kind::Span:
+ FixItStrategy = 0;
+ break;
+ case clang::FixitStrategy::Kind::Array:
+ FixItStrategy = 1;
+ break;
+ default:
+ assert(false && "We support only std::span and std::array");
+ };
+
const auto &FD =
S.Diag(Variable->getLocation(),
BriefMsg ? diag::note_unsafe_buffer_variable_fixit_together
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 2d4e6d1d058c..cfb653e665ea 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1393,7 +1393,8 @@ void Sema::ActOnEndOfTranslationUnit() {
Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl)
<< /*function=*/0 << DiagD << DiagRange;
}
- } else {
+ } else if (!FD->isTargetMultiVersion() ||
+ FD->isTargetMultiVersionDefault()) {
if (FD->getDescribedFunctionTemplate())
Diag(DiagD->getLocation(), diag::warn_unused_template)
<< /*function=*/0 << DiagD << DiagRange;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index b071a02ca371..afe2673479e4 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -7162,13 +7162,11 @@ static bool CheckNonNullExpr(Sema &S, const Expr *Expr) {
// As a special case, transparent unions initialized with zero are
// considered null for the purposes of the nonnull attribute.
- if (const RecordType *UT = Expr->getType()->getAsUnionType()) {
- if (UT->getDecl()->hasAttr<TransparentUnionAttr>())
- if (const CompoundLiteralExpr *CLE =
- dyn_cast<CompoundLiteralExpr>(Expr))
- if (const InitListExpr *ILE =
- dyn_cast<InitListExpr>(CLE->getInitializer()))
- Expr = ILE->getInit(0);
+ if (const RecordType *UT = Expr->getType()->getAsUnionType();
+ UT && UT->getDecl()->hasAttr<TransparentUnionAttr>()) {
+ if (const auto *CLE = dyn_cast<CompoundLiteralExpr>(Expr))
+ if (const auto *ILE = dyn_cast<InitListExpr>(CLE->getInitializer()))
+ Expr = ILE->getInit(0);
}
bool Result;
@@ -16129,10 +16127,10 @@ static void CheckConditionalOperator(Sema &S, AbstractConditionalOperator *E,
/// Check conversion of given expression to boolean.
/// Input argument E is a logical expression.
static void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) {
- // While C23 does have bool as a keyword, we still need to run the bool-like
- // conversion checks as bools are still not used as the return type from
- // "boolean" operators or as the input type for conditional operators.
- if (S.getLangOpts().Bool && !S.getLangOpts().C23)
+ // Run the bool-like conversion checks only for C since there bools are
+ // still not used as the return type from "boolean" operators or as the input
+ // type for conditional operators.
+ if (S.getLangOpts().CPlusPlus)
return;
if (E->IgnoreParenImpCasts()->getType()->isAtomicType())
return;
@@ -16652,6 +16650,7 @@ class SequenceChecker : public ConstEvaluatedExprVisitor<SequenceChecker> {
struct Value {
explicit Value(unsigned Parent) : Parent(Parent), Merged(false) {}
unsigned Parent : 31;
+ LLVM_PREFERRED_TYPE(bool)
unsigned Merged : 1;
};
SmallVector<Value, 8> Values;
@@ -17183,7 +17182,7 @@ public:
// evaluates to true.
bool EvalResult = false;
bool EvalOK = Eval.evaluate(BO->getLHS(), EvalResult);
- bool ShouldVisitRHS = !EvalOK || (EvalOK && !EvalResult);
+ bool ShouldVisitRHS = !EvalOK || !EvalResult;
if (ShouldVisitRHS) {
Region = RHSRegion;
Visit(BO->getRHS());
@@ -17215,7 +17214,7 @@ public:
// [...] the second operand is not evaluated if the first operand is false.
bool EvalResult = false;
bool EvalOK = Eval.evaluate(BO->getLHS(), EvalResult);
- bool ShouldVisitRHS = !EvalOK || (EvalOK && EvalResult);
+ bool ShouldVisitRHS = !EvalOK || EvalResult;
if (ShouldVisitRHS) {
Region = RHSRegion;
Visit(BO->getRHS());
@@ -17266,8 +17265,8 @@ public:
// evaluated. [...]
bool EvalResult = false;
bool EvalOK = Eval.evaluate(CO->getCond(), EvalResult);
- bool ShouldVisitTrueExpr = !EvalOK || (EvalOK && EvalResult);
- bool ShouldVisitFalseExpr = !EvalOK || (EvalOK && !EvalResult);
+ bool ShouldVisitTrueExpr = !EvalOK || EvalResult;
+ bool ShouldVisitFalseExpr = !EvalOK || !EvalResult;
if (ShouldVisitTrueExpr) {
Region = TrueRegion;
Visit(CO->getTrueExpr());
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 18a5d93ab8e8..be23c0fffe05 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8357,28 +8357,40 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
unsigned WarningDiag = diag::warn_decl_shadow;
SourceLocation CaptureLoc;
- if (isa<VarDecl>(D) && isa<VarDecl>(ShadowedDecl) && NewDC &&
- isa<CXXMethodDecl>(NewDC)) {
+ if (isa<VarDecl>(D) && NewDC && isa<CXXMethodDecl>(NewDC)) {
if (const auto *RD = dyn_cast<CXXRecordDecl>(NewDC->getParent())) {
if (RD->isLambda() && OldDC->Encloses(NewDC->getLexicalParent())) {
- if (RD->getLambdaCaptureDefault() == LCD_None) {
- // Try to avoid warnings for lambdas with an explicit capture list.
+ if (const auto *VD = dyn_cast<VarDecl>(ShadowedDecl)) {
const auto *LSI = cast<LambdaScopeInfo>(getCurFunction());
- // Warn only when the lambda captures the shadowed decl explicitly.
- CaptureLoc = getCaptureLocation(LSI, cast<VarDecl>(ShadowedDecl));
- if (CaptureLoc.isInvalid())
- WarningDiag = diag::warn_decl_shadow_uncaptured_local;
- } else {
- // Remember that this was shadowed so we can avoid the warning if the
- // shadowed decl isn't captured and the warning settings allow it.
+ if (RD->getLambdaCaptureDefault() == LCD_None) {
+ // Try to avoid warnings for lambdas with an explicit capture
+ // list. Warn only when the lambda captures the shadowed decl
+ // explicitly.
+ CaptureLoc = getCaptureLocation(LSI, VD);
+ if (CaptureLoc.isInvalid())
+ WarningDiag = diag::warn_decl_shadow_uncaptured_local;
+ } else {
+ // Remember that this was shadowed so we can avoid the warning if
+ // the shadowed decl isn't captured and the warning settings allow
+ // it.
+ cast<LambdaScopeInfo>(getCurFunction())
+ ->ShadowingDecls.push_back({D, VD});
+ return;
+ }
+ }
+ if (isa<FieldDecl>(ShadowedDecl)) {
+ // If lambda can capture this, then emit default shadowing warning,
+ // Otherwise it is not really a shadowing case since field is not
+ // available in lambda's body.
+ // At this point we don't know that lambda can capture this, so
+ // remember that this was shadowed and delay until we know.
cast<LambdaScopeInfo>(getCurFunction())
- ->ShadowingDecls.push_back(
- {cast<VarDecl>(D), cast<VarDecl>(ShadowedDecl)});
+ ->ShadowingDecls.push_back({D, ShadowedDecl});
return;
}
}
-
- if (cast<VarDecl>(ShadowedDecl)->hasLocalStorage()) {
+ if (const auto *VD = dyn_cast<VarDecl>(ShadowedDecl);
+ VD && VD->hasLocalStorage()) {
// A variable can't shadow a local variable in an enclosing scope, if
// they are separated by a non-capturing declaration context.
for (DeclContext *ParentDC = NewDC;
@@ -8429,19 +8441,28 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
/// when these variables are captured by the lambda.
void Sema::DiagnoseShadowingLambdaDecls(const LambdaScopeInfo *LSI) {
for (const auto &Shadow : LSI->ShadowingDecls) {
- const VarDecl *ShadowedDecl = Shadow.ShadowedDecl;
+ const NamedDecl *ShadowedDecl = Shadow.ShadowedDecl;
// Try to avoid the warning when the shadowed decl isn't captured.
- SourceLocation CaptureLoc = getCaptureLocation(LSI, ShadowedDecl);
const DeclContext *OldDC = ShadowedDecl->getDeclContext();
- Diag(Shadow.VD->getLocation(), CaptureLoc.isInvalid()
- ? diag::warn_decl_shadow_uncaptured_local
- : diag::warn_decl_shadow)
- << Shadow.VD->getDeclName()
- << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC;
- if (!CaptureLoc.isInvalid())
- Diag(CaptureLoc, diag::note_var_explicitly_captured_here)
- << Shadow.VD->getDeclName() << /*explicitly*/ 0;
- Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration);
+ if (const auto *VD = dyn_cast<VarDecl>(ShadowedDecl)) {
+ SourceLocation CaptureLoc = getCaptureLocation(LSI, VD);
+ Diag(Shadow.VD->getLocation(),
+ CaptureLoc.isInvalid() ? diag::warn_decl_shadow_uncaptured_local
+ : diag::warn_decl_shadow)
+ << Shadow.VD->getDeclName()
+ << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC;
+ if (CaptureLoc.isValid())
+ Diag(CaptureLoc, diag::note_var_explicitly_captured_here)
+ << Shadow.VD->getDeclName() << /*explicitly*/ 0;
+ Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration);
+ } else if (isa<FieldDecl>(ShadowedDecl)) {
+ Diag(Shadow.VD->getLocation(),
+ LSI->isCXXThisCaptured() ? diag::warn_decl_shadow
+ : diag::warn_decl_shadow_uncaptured_local)
+ << Shadow.VD->getDeclName()
+ << computeShadowedDeclKind(ShadowedDecl, OldDC) << OldDC;
+ Diag(ShadowedDecl->getLocation(), diag::note_previous_declaration);
+ }
}
}
@@ -9759,7 +9780,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
SmallVector<TemplateParameterList *, 4> TemplateParamLists;
llvm::append_range(TemplateParamLists, TemplateParamListsRef);
if (TemplateParameterList *Invented = D.getInventedTemplateParameterList()) {
- if (!TemplateParamLists.empty() &&
+ if (!TemplateParamLists.empty() && !TemplateParamLists.back()->empty() &&
Invented->getDepth() == TemplateParamLists.back()->getDepth())
TemplateParamLists.back() = Invented;
else
@@ -13049,7 +13070,8 @@ QualType Sema::deduceVarTypeFromInitializer(VarDecl *VDecl,
TemplateDeductionInfo Info(DeduceInit->getExprLoc());
TemplateDeductionResult Result =
DeduceAutoType(TSI->getTypeLoc(), DeduceInit, DeducedType, Info);
- if (Result != TDK_Success && Result != TDK_AlreadyDiagnosed) {
+ if (Result != TemplateDeductionResult::Success &&
+ Result != TemplateDeductionResult::AlreadyDiagnosed) {
if (!IsInitCapture)
DiagnoseAutoDeductionFailure(VDecl, DeduceInit);
else if (isa<InitListExpr>(Init))
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index d785714c4d81..d5526957937b 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3501,9 +3501,16 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
return false;
}
+static bool hasArmStreamingInterface(const FunctionDecl *FD) {
+ if (const auto *T = FD->getType()->getAs<FunctionProtoType>())
+ if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)
+ return true;
+ return false;
+}
+
// Check Target Version attrs
-bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, StringRef &AttrStr,
- bool &isDefault) {
+bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
+ StringRef &AttrStr, bool &isDefault) {
enum FirstParam { Unsupported };
enum SecondParam { None };
enum ThirdParam { Target, TargetClones, TargetVersion };
@@ -3519,6 +3526,8 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, StringRef &AttrStr,
return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
<< Unsupported << None << CurFeature << TargetVersion;
}
+ if (hasArmStreamingInterface(cast<FunctionDecl>(D)))
+ return Diag(LiteralLoc, diag::err_sme_streaming_cannot_be_multiversioned);
return false;
}
@@ -3527,7 +3536,7 @@ static void handleTargetVersionAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
SourceLocation LiteralLoc;
bool isDefault = false;
if (!S.checkStringLiteralArgumentAttr(AL, 0, Str, &LiteralLoc) ||
- S.checkTargetVersionAttr(LiteralLoc, Str, isDefault))
+ S.checkTargetVersionAttr(LiteralLoc, D, Str, isDefault))
return;
// Do not create default only target_version attribute
if (!isDefault) {
@@ -3550,7 +3559,7 @@ static void handleTargetAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
bool Sema::checkTargetClonesAttrString(
SourceLocation LiteralLoc, StringRef Str, const StringLiteral *Literal,
- bool &HasDefault, bool &HasCommas, bool &HasNotDefault,
+ Decl *D, bool &HasDefault, bool &HasCommas, bool &HasNotDefault,
SmallVectorImpl<SmallString<64>> &StringsBuffer) {
enum FirstParam { Unsupported, Duplicate, Unknown };
enum SecondParam { None, CPU, Tune };
@@ -3619,6 +3628,9 @@ bool Sema::checkTargetClonesAttrString(
HasNotDefault = true;
}
}
+ if (hasArmStreamingInterface(cast<FunctionDecl>(D)))
+ return Diag(LiteralLoc,
+ diag::err_sme_streaming_cannot_be_multiversioned);
} else {
// Other targets ( currently X86 )
if (Cur.starts_with("arch=")) {
@@ -3670,7 +3682,7 @@ static void handleTargetClonesAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
if (!S.checkStringLiteralArgumentAttr(AL, I, CurStr, &LiteralLoc) ||
S.checkTargetClonesAttrString(
LiteralLoc, CurStr,
- cast<StringLiteral>(AL.getArgAsExpr(I)->IgnoreParenCasts()),
+ cast<StringLiteral>(AL.getArgAsExpr(I)->IgnoreParenCasts()), D,
HasDefault, HasCommas, HasNotDefault, StringsBuffer))
return;
}
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index ab8a967b06a4..ba233c9e2f35 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -5998,6 +5998,10 @@ void Sema::ActOnDefaultCtorInitializers(Decl *CDtorDecl) {
if (CXXConstructorDecl *Constructor
= dyn_cast<CXXConstructorDecl>(CDtorDecl)) {
+ if (CXXRecordDecl *ClassDecl = Constructor->getParent();
+ !ClassDecl || ClassDecl->isInvalidDecl()) {
+ return;
+ }
SetCtorInitializers(Constructor, /*AnyErrors=*/false);
DiagnoseUninitializedFields(*this, Constructor);
}
@@ -14038,6 +14042,9 @@ void Sema::DefineImplicitDefaultConstructor(SourceLocation CurrentLocation,
CXXRecordDecl *ClassDecl = Constructor->getParent();
assert(ClassDecl && "DefineImplicitDefaultConstructor - invalid constructor");
+ if (ClassDecl->isInvalidDecl()) {
+ return;
+ }
SynthesizedFunctionScope Scope(*this, Constructor);
@@ -19294,7 +19301,16 @@ void Sema::ActOnStartFunctionDeclarationDeclarator(
ExplicitLists, /*IsFriend=*/false, IsMemberSpecialization, IsInvalid,
/*SuppressDiagnostic=*/true);
}
- if (ExplicitParams) {
+ // C++23 [dcl.fct]p23:
+ // An abbreviated function template can have a template-head. The invented
+ // template-parameters are appended to the template-parameter-list after
+ // the explicitly declared template-parameters.
+ //
+ // A template-head must have one or more template-parameters (read:
+ // 'template<>' is *not* a template-head). Only append the invented
+ // template parameters if we matched the nested-name-specifier to a non-empty
+ // TemplateParameterList.
+ if (ExplicitParams && !ExplicitParams->empty()) {
Info.AutoTemplateParameterDepth = ExplicitParams->getDepth();
llvm::append_range(Info.TemplateParams, *ExplicitParams);
Info.NumExplicitTemplateParams = ExplicitParams->size();
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 8d58ef5ee16d..3563b4f683f0 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1423,6 +1423,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
llvm_unreachable("Invalid class for expression");
// Most statements can throw if any substatement can throw.
+ case Stmt::OpenACCComputeConstructClass:
case Stmt::AttributedStmtClass:
case Stmt::BreakStmtClass:
case Stmt::CapturedStmtClass:
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 246d2313e089..f2b89135af21 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1554,12 +1554,13 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
TemplateDeductionInfo Info(Deduce->getExprLoc());
TemplateDeductionResult Result =
DeduceAutoType(TInfo->getTypeLoc(), Deduce, DeducedType, Info);
- if (Result != TDK_Success && Result != TDK_AlreadyDiagnosed)
+ if (Result != TemplateDeductionResult::Success &&
+ Result != TemplateDeductionResult::AlreadyDiagnosed)
return ExprError(Diag(TyBeginLoc, diag::err_auto_expr_deduction_failure)
<< Ty << Deduce->getType() << FullRange
<< Deduce->getSourceRange());
if (DeducedType.isNull()) {
- assert(Result == TDK_AlreadyDiagnosed);
+ assert(Result == TemplateDeductionResult::AlreadyDiagnosed);
return ExprError();
}
@@ -2098,12 +2099,13 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal,
TemplateDeductionInfo Info(Deduce->getExprLoc());
TemplateDeductionResult Result =
DeduceAutoType(AllocTypeInfo->getTypeLoc(), Deduce, DeducedType, Info);
- if (Result != TDK_Success && Result != TDK_AlreadyDiagnosed)
+ if (Result != TemplateDeductionResult::Success &&
+ Result != TemplateDeductionResult::AlreadyDiagnosed)
return ExprError(Diag(StartLoc, diag::err_auto_new_deduction_failure)
<< AllocType << Deduce->getType() << TypeRange
<< Deduce->getSourceRange());
if (DeducedType.isNull()) {
- assert(Result == TDK_AlreadyDiagnosed);
+ assert(Result == TemplateDeductionResult::AlreadyDiagnosed);
return ExprError();
}
AllocType = DeducedType;
@@ -2883,7 +2885,7 @@ bool Sema::FindAllocationFunctions(SourceLocation StartLoc, SourceRange Range,
// expected function type.
TemplateDeductionInfo Info(StartLoc);
if (DeduceTemplateArguments(FnTmpl, nullptr, ExpectedFunctionType, Fn,
- Info))
+ Info) != TemplateDeductionResult::Success)
continue;
} else
Fn = cast<FunctionDecl>((*D)->getUnderlyingDecl());
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 02b1a045df44..d3a9c7abd0e9 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1200,8 +1200,8 @@ static bool LookupDirect(Sema &S, LookupResult &R, const DeclContext *DC) {
// Perform template argument deduction against the type that we would
// expect the function to have.
if (R.getSema().DeduceTemplateArguments(ConvTemplate, nullptr, ExpectedType,
- Specialization, Info)
- == Sema::TDK_Success) {
+ Specialization, Info) ==
+ TemplateDeductionResult::Success) {
R.addDecl(Specialization);
Found = true;
}
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 42960c229077..9381b8c6626b 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -628,28 +628,28 @@ namespace {
/// to the form used in overload-candidate information.
DeductionFailureInfo
clang::MakeDeductionFailureInfo(ASTContext &Context,
- Sema::TemplateDeductionResult TDK,
+ TemplateDeductionResult TDK,
TemplateDeductionInfo &Info) {
DeductionFailureInfo Result;
Result.Result = static_cast<unsigned>(TDK);
Result.HasDiagnostic = false;
switch (TDK) {
- case Sema::TDK_Invalid:
- case Sema::TDK_InstantiationDepth:
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_CUDATargetMismatch:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::InstantiationDepth:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::CUDATargetMismatch:
Result.Data = nullptr;
break;
- case Sema::TDK_Incomplete:
- case Sema::TDK_InvalidExplicitArguments:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::InvalidExplicitArguments:
Result.Data = Info.Param.getOpaqueValue();
break;
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested: {
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested: {
// FIXME: Should allocate from normal heap so that we can free this later.
auto *Saved = new (Context) DFIDeducedMismatchArgs;
Saved->FirstArg = Info.FirstArg;
@@ -660,7 +660,7 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
break;
}
- case Sema::TDK_NonDeducedMismatch: {
+ case TemplateDeductionResult::NonDeducedMismatch: {
// FIXME: Should allocate from normal heap so that we can free this later.
DFIArguments *Saved = new (Context) DFIArguments;
Saved->FirstArg = Info.FirstArg;
@@ -669,10 +669,10 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
break;
}
- case Sema::TDK_IncompletePack:
+ case TemplateDeductionResult::IncompletePack:
// FIXME: It's slightly wasteful to allocate two TemplateArguments for this.
- case Sema::TDK_Inconsistent:
- case Sema::TDK_Underqualified: {
+ case TemplateDeductionResult::Inconsistent:
+ case TemplateDeductionResult::Underqualified: {
// FIXME: Should allocate from normal heap so that we can free this later.
DFIParamWithArguments *Saved = new (Context) DFIParamWithArguments;
Saved->Param = Info.Param;
@@ -682,7 +682,7 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
break;
}
- case Sema::TDK_SubstitutionFailure:
+ case TemplateDeductionResult::SubstitutionFailure:
Result.Data = Info.takeSugared();
if (Info.hasSFINAEDiagnostic()) {
PartialDiagnosticAt *Diag = new (Result.Diagnostic) PartialDiagnosticAt(
@@ -692,7 +692,7 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
}
break;
- case Sema::TDK_ConstraintsNotSatisfied: {
+ case TemplateDeductionResult::ConstraintsNotSatisfied: {
CNSInfo *Saved = new (Context) CNSInfo;
Saved->TemplateArgs = Info.takeSugared();
Saved->Satisfaction = Info.AssociatedConstraintsSatisfaction;
@@ -700,9 +700,9 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
break;
}
- case Sema::TDK_Success:
- case Sema::TDK_NonDependentConversionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::NonDependentConversionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
llvm_unreachable("not a deduction failure");
}
@@ -710,29 +710,29 @@ clang::MakeDeductionFailureInfo(ASTContext &Context,
}
void DeductionFailureInfo::Destroy() {
- switch (static_cast<Sema::TemplateDeductionResult>(Result)) {
- case Sema::TDK_Success:
- case Sema::TDK_Invalid:
- case Sema::TDK_InstantiationDepth:
- case Sema::TDK_Incomplete:
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
- case Sema::TDK_InvalidExplicitArguments:
- case Sema::TDK_CUDATargetMismatch:
- case Sema::TDK_NonDependentConversionFailure:
+ switch (static_cast<TemplateDeductionResult>(Result)) {
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::InstantiationDepth:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
+ case TemplateDeductionResult::InvalidExplicitArguments:
+ case TemplateDeductionResult::CUDATargetMismatch:
+ case TemplateDeductionResult::NonDependentConversionFailure:
break;
- case Sema::TDK_IncompletePack:
- case Sema::TDK_Inconsistent:
- case Sema::TDK_Underqualified:
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested:
- case Sema::TDK_NonDeducedMismatch:
+ case TemplateDeductionResult::IncompletePack:
+ case TemplateDeductionResult::Inconsistent:
+ case TemplateDeductionResult::Underqualified:
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested:
+ case TemplateDeductionResult::NonDeducedMismatch:
// FIXME: Destroy the data?
Data = nullptr;
break;
- case Sema::TDK_SubstitutionFailure:
+ case TemplateDeductionResult::SubstitutionFailure:
// FIXME: Destroy the template argument list?
Data = nullptr;
if (PartialDiagnosticAt *Diag = getSFINAEDiagnostic()) {
@@ -741,7 +741,7 @@ void DeductionFailureInfo::Destroy() {
}
break;
- case Sema::TDK_ConstraintsNotSatisfied:
+ case TemplateDeductionResult::ConstraintsNotSatisfied:
// FIXME: Destroy the template argument list?
Data = nullptr;
if (PartialDiagnosticAt *Diag = getSFINAEDiagnostic()) {
@@ -751,8 +751,8 @@ void DeductionFailureInfo::Destroy() {
break;
// Unhandled
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
break;
}
}
@@ -764,33 +764,33 @@ PartialDiagnosticAt *DeductionFailureInfo::getSFINAEDiagnostic() {
}
TemplateParameter DeductionFailureInfo::getTemplateParameter() {
- switch (static_cast<Sema::TemplateDeductionResult>(Result)) {
- case Sema::TDK_Success:
- case Sema::TDK_Invalid:
- case Sema::TDK_InstantiationDepth:
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
- case Sema::TDK_SubstitutionFailure:
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested:
- case Sema::TDK_NonDeducedMismatch:
- case Sema::TDK_CUDATargetMismatch:
- case Sema::TDK_NonDependentConversionFailure:
- case Sema::TDK_ConstraintsNotSatisfied:
+ switch (static_cast<TemplateDeductionResult>(Result)) {
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::InstantiationDepth:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
+ case TemplateDeductionResult::SubstitutionFailure:
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested:
+ case TemplateDeductionResult::NonDeducedMismatch:
+ case TemplateDeductionResult::CUDATargetMismatch:
+ case TemplateDeductionResult::NonDependentConversionFailure:
+ case TemplateDeductionResult::ConstraintsNotSatisfied:
return TemplateParameter();
- case Sema::TDK_Incomplete:
- case Sema::TDK_InvalidExplicitArguments:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::InvalidExplicitArguments:
return TemplateParameter::getFromOpaqueValue(Data);
- case Sema::TDK_IncompletePack:
- case Sema::TDK_Inconsistent:
- case Sema::TDK_Underqualified:
+ case TemplateDeductionResult::IncompletePack:
+ case TemplateDeductionResult::Inconsistent:
+ case TemplateDeductionResult::Underqualified:
return static_cast<DFIParamWithArguments*>(Data)->Param;
// Unhandled
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
break;
}
@@ -798,35 +798,35 @@ TemplateParameter DeductionFailureInfo::getTemplateParameter() {
}
TemplateArgumentList *DeductionFailureInfo::getTemplateArgumentList() {
- switch (static_cast<Sema::TemplateDeductionResult>(Result)) {
- case Sema::TDK_Success:
- case Sema::TDK_Invalid:
- case Sema::TDK_InstantiationDepth:
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
- case Sema::TDK_Incomplete:
- case Sema::TDK_IncompletePack:
- case Sema::TDK_InvalidExplicitArguments:
- case Sema::TDK_Inconsistent:
- case Sema::TDK_Underqualified:
- case Sema::TDK_NonDeducedMismatch:
- case Sema::TDK_CUDATargetMismatch:
- case Sema::TDK_NonDependentConversionFailure:
+ switch (static_cast<TemplateDeductionResult>(Result)) {
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::InstantiationDepth:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::IncompletePack:
+ case TemplateDeductionResult::InvalidExplicitArguments:
+ case TemplateDeductionResult::Inconsistent:
+ case TemplateDeductionResult::Underqualified:
+ case TemplateDeductionResult::NonDeducedMismatch:
+ case TemplateDeductionResult::CUDATargetMismatch:
+ case TemplateDeductionResult::NonDependentConversionFailure:
return nullptr;
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested:
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested:
return static_cast<DFIDeducedMismatchArgs*>(Data)->TemplateArgs;
- case Sema::TDK_SubstitutionFailure:
+ case TemplateDeductionResult::SubstitutionFailure:
return static_cast<TemplateArgumentList*>(Data);
- case Sema::TDK_ConstraintsNotSatisfied:
+ case TemplateDeductionResult::ConstraintsNotSatisfied:
return static_cast<CNSInfo*>(Data)->TemplateArgs;
// Unhandled
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
break;
}
@@ -834,31 +834,31 @@ TemplateArgumentList *DeductionFailureInfo::getTemplateArgumentList() {
}
const TemplateArgument *DeductionFailureInfo::getFirstArg() {
- switch (static_cast<Sema::TemplateDeductionResult>(Result)) {
- case Sema::TDK_Success:
- case Sema::TDK_Invalid:
- case Sema::TDK_InstantiationDepth:
- case Sema::TDK_Incomplete:
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
- case Sema::TDK_InvalidExplicitArguments:
- case Sema::TDK_SubstitutionFailure:
- case Sema::TDK_CUDATargetMismatch:
- case Sema::TDK_NonDependentConversionFailure:
- case Sema::TDK_ConstraintsNotSatisfied:
+ switch (static_cast<TemplateDeductionResult>(Result)) {
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::InstantiationDepth:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
+ case TemplateDeductionResult::InvalidExplicitArguments:
+ case TemplateDeductionResult::SubstitutionFailure:
+ case TemplateDeductionResult::CUDATargetMismatch:
+ case TemplateDeductionResult::NonDependentConversionFailure:
+ case TemplateDeductionResult::ConstraintsNotSatisfied:
return nullptr;
- case Sema::TDK_IncompletePack:
- case Sema::TDK_Inconsistent:
- case Sema::TDK_Underqualified:
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested:
- case Sema::TDK_NonDeducedMismatch:
+ case TemplateDeductionResult::IncompletePack:
+ case TemplateDeductionResult::Inconsistent:
+ case TemplateDeductionResult::Underqualified:
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested:
+ case TemplateDeductionResult::NonDeducedMismatch:
return &static_cast<DFIArguments*>(Data)->FirstArg;
// Unhandled
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
break;
}
@@ -866,31 +866,31 @@ const TemplateArgument *DeductionFailureInfo::getFirstArg() {
}
const TemplateArgument *DeductionFailureInfo::getSecondArg() {
- switch (static_cast<Sema::TemplateDeductionResult>(Result)) {
- case Sema::TDK_Success:
- case Sema::TDK_Invalid:
- case Sema::TDK_InstantiationDepth:
- case Sema::TDK_Incomplete:
- case Sema::TDK_IncompletePack:
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
- case Sema::TDK_InvalidExplicitArguments:
- case Sema::TDK_SubstitutionFailure:
- case Sema::TDK_CUDATargetMismatch:
- case Sema::TDK_NonDependentConversionFailure:
- case Sema::TDK_ConstraintsNotSatisfied:
+ switch (static_cast<TemplateDeductionResult>(Result)) {
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::InstantiationDepth:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::IncompletePack:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
+ case TemplateDeductionResult::InvalidExplicitArguments:
+ case TemplateDeductionResult::SubstitutionFailure:
+ case TemplateDeductionResult::CUDATargetMismatch:
+ case TemplateDeductionResult::NonDependentConversionFailure:
+ case TemplateDeductionResult::ConstraintsNotSatisfied:
return nullptr;
- case Sema::TDK_Inconsistent:
- case Sema::TDK_Underqualified:
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested:
- case Sema::TDK_NonDeducedMismatch:
+ case TemplateDeductionResult::Inconsistent:
+ case TemplateDeductionResult::Underqualified:
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested:
+ case TemplateDeductionResult::NonDeducedMismatch:
return &static_cast<DFIArguments*>(Data)->SecondArg;
// Unhandled
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
break;
}
@@ -898,9 +898,9 @@ const TemplateArgument *DeductionFailureInfo::getSecondArg() {
}
std::optional<unsigned> DeductionFailureInfo::getCallArgIndex() {
- switch (static_cast<Sema::TemplateDeductionResult>(Result)) {
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested:
+ switch (static_cast<TemplateDeductionResult>(Result)) {
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested:
return static_cast<DFIDeducedMismatchArgs*>(Data)->CallArgIndex;
default:
@@ -7548,12 +7548,14 @@ void Sema::AddMethodTemplateCandidate(
if (TemplateDeductionResult Result = DeduceTemplateArguments(
MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info,
PartialOverloading, /*AggregateDeductionCandidate=*/false, ObjectType,
- ObjectClassification, [&](ArrayRef<QualType> ParamTypes) {
+ ObjectClassification,
+ [&](ArrayRef<QualType> ParamTypes) {
return CheckNonDependentConversions(
MethodTmpl, ParamTypes, Args, CandidateSet, Conversions,
SuppressUserConversions, ActingContext, ObjectType,
ObjectClassification, PO);
- })) {
+ });
+ Result != TemplateDeductionResult::Success) {
OverloadCandidate &Candidate =
CandidateSet.addCandidate(Conversions.size(), Conversions);
Candidate.FoundDecl = FoundDecl;
@@ -7566,7 +7568,7 @@ void Sema::AddMethodTemplateCandidate(
cast<CXXMethodDecl>(Candidate.Function)->isStatic() ||
ObjectType.isNull();
Candidate.ExplicitCallArguments = Args.size();
- if (Result == TDK_NonDependentConversionFailure)
+ if (Result == TemplateDeductionResult::NonDependentConversionFailure)
Candidate.FailureKind = ovl_fail_bad_conversion;
else {
Candidate.FailureKind = ovl_fail_bad_deduction;
@@ -7639,7 +7641,8 @@ void Sema::AddTemplateOverloadCandidate(
return CheckNonDependentConversions(
FunctionTemplate, ParamTypes, Args, CandidateSet, Conversions,
SuppressUserConversions, nullptr, QualType(), {}, PO);
- })) {
+ });
+ Result != TemplateDeductionResult::Success) {
OverloadCandidate &Candidate =
CandidateSet.addCandidate(Conversions.size(), Conversions);
Candidate.FoundDecl = FoundDecl;
@@ -7655,7 +7658,7 @@ void Sema::AddTemplateOverloadCandidate(
isa<CXXMethodDecl>(Candidate.Function) &&
!isa<CXXConstructorDecl>(Candidate.Function);
Candidate.ExplicitCallArguments = Args.size();
- if (Result == TDK_NonDependentConversionFailure)
+ if (Result == TemplateDeductionResult::NonDependentConversionFailure)
Candidate.FailureKind = ovl_fail_bad_conversion;
else {
Candidate.FailureKind = ovl_fail_bad_deduction;
@@ -8042,7 +8045,8 @@ void Sema::AddTemplateConversionCandidate(
CXXConversionDecl *Specialization = nullptr;
if (TemplateDeductionResult Result = DeduceTemplateArguments(
FunctionTemplate, ObjectType, ObjectClassification, ToType,
- Specialization, Info)) {
+ Specialization, Info);
+ Result != TemplateDeductionResult::Success) {
OverloadCandidate &Candidate = CandidateSet.addCandidate();
Candidate.FoundDecl = FoundDecl;
Candidate.Function = FunctionTemplate->getTemplatedDecl();
@@ -10655,7 +10659,8 @@ void Sema::diagnoseEquivalentInternalLinkageDeclarations(
bool OverloadCandidate::NotValidBecauseConstraintExprHasError() const {
return FailureKind == ovl_fail_bad_deduction &&
- DeductionFailure.Result == Sema::TDK_ConstraintsNotSatisfied &&
+ static_cast<TemplateDeductionResult>(DeductionFailure.Result) ==
+ TemplateDeductionResult::ConstraintsNotSatisfied &&
static_cast<CNSInfo *>(DeductionFailure.Data)
->Satisfaction.ContainsErrors;
}
@@ -11358,11 +11363,13 @@ static bool CheckArityMismatch(Sema &S, OverloadCandidate *Cand,
if (NumArgs < MinParams) {
assert((Cand->FailureKind == ovl_fail_too_few_arguments) ||
(Cand->FailureKind == ovl_fail_bad_deduction &&
- Cand->DeductionFailure.Result == Sema::TDK_TooFewArguments));
+ Cand->DeductionFailure.getResult() ==
+ TemplateDeductionResult::TooFewArguments));
} else {
assert((Cand->FailureKind == ovl_fail_too_many_arguments) ||
(Cand->FailureKind == ovl_fail_bad_deduction &&
- Cand->DeductionFailure.Result == Sema::TDK_TooManyArguments));
+ Cand->DeductionFailure.getResult() ==
+ TemplateDeductionResult::TooManyArguments));
}
return false;
@@ -11445,11 +11452,18 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
(ParamD = Param.dyn_cast<TemplateTypeParmDecl*>()) ||
(ParamD = Param.dyn_cast<NonTypeTemplateParmDecl*>()) ||
(ParamD = Param.dyn_cast<TemplateTemplateParmDecl*>());
- switch (DeductionFailure.Result) {
- case Sema::TDK_Success:
- llvm_unreachable("TDK_success while diagnosing bad deduction");
+ switch (DeductionFailure.getResult()) {
+ case TemplateDeductionResult::Success:
+ llvm_unreachable(
+ "TemplateDeductionResult::Success while diagnosing bad deduction");
+ case TemplateDeductionResult::NonDependentConversionFailure:
+ llvm_unreachable("TemplateDeductionResult::NonDependentConversionFailure "
+ "while diagnosing bad deduction");
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::AlreadyDiagnosed:
+ return;
- case Sema::TDK_Incomplete: {
+ case TemplateDeductionResult::Incomplete: {
assert(ParamD && "no parameter found for incomplete deduction result");
S.Diag(Templated->getLocation(),
diag::note_ovl_candidate_incomplete_deduction)
@@ -11458,7 +11472,7 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
return;
}
- case Sema::TDK_IncompletePack: {
+ case TemplateDeductionResult::IncompletePack: {
assert(ParamD && "no parameter found for incomplete deduction result");
S.Diag(Templated->getLocation(),
diag::note_ovl_candidate_incomplete_deduction_pack)
@@ -11469,7 +11483,7 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
return;
}
- case Sema::TDK_Underqualified: {
+ case TemplateDeductionResult::Underqualified: {
assert(ParamD && "no parameter found for bad qualifiers deduction result");
TemplateTypeParmDecl *TParam = cast<TemplateTypeParmDecl>(ParamD);
@@ -11494,7 +11508,7 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
return;
}
- case Sema::TDK_Inconsistent: {
+ case TemplateDeductionResult::Inconsistent: {
assert(ParamD && "no parameter found for inconsistent deduction result");
int which = 0;
if (isa<TemplateTypeParmDecl>(ParamD))
@@ -11539,7 +11553,7 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
return;
}
- case Sema::TDK_InvalidExplicitArguments:
+ case TemplateDeductionResult::InvalidExplicitArguments:
assert(ParamD && "no parameter found for invalid explicit arguments");
if (ParamD->getDeclName())
S.Diag(Templated->getLocation(),
@@ -11561,7 +11575,7 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
MaybeEmitInheritedConstructorNote(S, Found);
return;
- case Sema::TDK_ConstraintsNotSatisfied: {
+ case TemplateDeductionResult::ConstraintsNotSatisfied: {
// Format the template argument list into the argument string.
SmallString<128> TemplateArgString;
TemplateArgumentList *Args = DeductionFailure.getTemplateArgumentList();
@@ -11578,18 +11592,18 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
static_cast<CNSInfo*>(DeductionFailure.Data)->Satisfaction);
return;
}
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
DiagnoseArityMismatch(S, Found, Templated, NumArgs);
return;
- case Sema::TDK_InstantiationDepth:
+ case TemplateDeductionResult::InstantiationDepth:
S.Diag(Templated->getLocation(),
diag::note_ovl_candidate_instantiation_depth);
MaybeEmitInheritedConstructorNote(S, Found);
return;
- case Sema::TDK_SubstitutionFailure: {
+ case TemplateDeductionResult::SubstitutionFailure: {
// Format the template argument list into the argument string.
SmallString<128> TemplateArgString;
if (TemplateArgumentList *Args =
@@ -11639,8 +11653,8 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
return;
}
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_DeducedMismatchNested: {
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::DeducedMismatchNested: {
// Format the template argument list into the argument string.
SmallString<128> TemplateArgString;
if (TemplateArgumentList *Args =
@@ -11656,11 +11670,12 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
<< (*DeductionFailure.getCallArgIndex() + 1)
<< *DeductionFailure.getFirstArg() << *DeductionFailure.getSecondArg()
<< TemplateArgString
- << (DeductionFailure.Result == Sema::TDK_DeducedMismatchNested);
+ << (DeductionFailure.getResult() ==
+ TemplateDeductionResult::DeducedMismatchNested);
break;
}
- case Sema::TDK_NonDeducedMismatch: {
+ case TemplateDeductionResult::NonDeducedMismatch: {
// FIXME: Provide a source location to indicate what we couldn't match.
TemplateArgument FirstTA = *DeductionFailure.getFirstArg();
TemplateArgument SecondTA = *DeductionFailure.getSecondArg();
@@ -11701,11 +11716,11 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
}
// TODO: diagnose these individually, then kill off
// note_ovl_candidate_bad_deduction, which is uselessly vague.
- case Sema::TDK_MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
S.Diag(Templated->getLocation(), diag::note_ovl_candidate_bad_deduction);
MaybeEmitInheritedConstructorNote(S, Found);
return;
- case Sema::TDK_CUDATargetMismatch:
+ case TemplateDeductionResult::CUDATargetMismatch:
S.Diag(Templated->getLocation(),
diag::note_cuda_ovl_candidate_target_mismatch);
return;
@@ -11716,8 +11731,9 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
static void DiagnoseBadDeduction(Sema &S, OverloadCandidate *Cand,
unsigned NumArgs,
bool TakingCandidateAddress) {
- unsigned TDK = Cand->DeductionFailure.Result;
- if (TDK == Sema::TDK_TooFewArguments || TDK == Sema::TDK_TooManyArguments) {
+ TemplateDeductionResult TDK = Cand->DeductionFailure.getResult();
+ if (TDK == TemplateDeductionResult::TooFewArguments ||
+ TDK == TemplateDeductionResult::TooManyArguments) {
if (CheckArityMismatch(S, Cand, NumArgs))
return;
}
@@ -12051,38 +12067,38 @@ static SourceLocation GetLocationForCandidate(const OverloadCandidate *Cand) {
}
static unsigned RankDeductionFailure(const DeductionFailureInfo &DFI) {
- switch ((Sema::TemplateDeductionResult)DFI.Result) {
- case Sema::TDK_Success:
- case Sema::TDK_NonDependentConversionFailure:
- case Sema::TDK_AlreadyDiagnosed:
+ switch (static_cast<TemplateDeductionResult>(DFI.Result)) {
+ case TemplateDeductionResult::Success:
+ case TemplateDeductionResult::NonDependentConversionFailure:
+ case TemplateDeductionResult::AlreadyDiagnosed:
llvm_unreachable("non-deduction failure while diagnosing bad deduction");
- case Sema::TDK_Invalid:
- case Sema::TDK_Incomplete:
- case Sema::TDK_IncompletePack:
+ case TemplateDeductionResult::Invalid:
+ case TemplateDeductionResult::Incomplete:
+ case TemplateDeductionResult::IncompletePack:
return 1;
- case Sema::TDK_Underqualified:
- case Sema::TDK_Inconsistent:
+ case TemplateDeductionResult::Underqualified:
+ case TemplateDeductionResult::Inconsistent:
return 2;
- case Sema::TDK_SubstitutionFailure:
- case Sema::TDK_DeducedMismatch:
- case Sema::TDK_ConstraintsNotSatisfied:
- case Sema::TDK_DeducedMismatchNested:
- case Sema::TDK_NonDeducedMismatch:
- case Sema::TDK_MiscellaneousDeductionFailure:
- case Sema::TDK_CUDATargetMismatch:
+ case TemplateDeductionResult::SubstitutionFailure:
+ case TemplateDeductionResult::DeducedMismatch:
+ case TemplateDeductionResult::ConstraintsNotSatisfied:
+ case TemplateDeductionResult::DeducedMismatchNested:
+ case TemplateDeductionResult::NonDeducedMismatch:
+ case TemplateDeductionResult::MiscellaneousDeductionFailure:
+ case TemplateDeductionResult::CUDATargetMismatch:
return 3;
- case Sema::TDK_InstantiationDepth:
+ case TemplateDeductionResult::InstantiationDepth:
return 4;
- case Sema::TDK_InvalidExplicitArguments:
+ case TemplateDeductionResult::InvalidExplicitArguments:
return 5;
- case Sema::TDK_TooManyArguments:
- case Sema::TDK_TooFewArguments:
+ case TemplateDeductionResult::TooManyArguments:
+ case TemplateDeductionResult::TooFewArguments:
return 6;
}
llvm_unreachable("Unhandled deduction result");
@@ -12810,11 +12826,10 @@ private:
// overloaded functions considered.
FunctionDecl *Specialization = nullptr;
TemplateDeductionInfo Info(FailedCandidates.getLocation());
- if (Sema::TemplateDeductionResult Result
- = S.DeduceTemplateArguments(FunctionTemplate,
- &OvlExplicitTemplateArgs,
- TargetFunctionType, Specialization,
- Info, /*IsAddressOfFunction*/true)) {
+ if (TemplateDeductionResult Result = S.DeduceTemplateArguments(
+ FunctionTemplate, &OvlExplicitTemplateArgs, TargetFunctionType,
+ Specialization, Info, /*IsAddressOfFunction*/ true);
+ Result != TemplateDeductionResult::Success) {
// Make a note of the failed deduction for diagnostics.
FailedCandidates.addCandidate()
.set(CurAccessFunPair, FunctionTemplate->getTemplatedDecl(),
@@ -13305,10 +13320,10 @@ FunctionDecl *Sema::ResolveSingleFunctionTemplateSpecialization(
// overloaded functions considered.
FunctionDecl *Specialization = nullptr;
TemplateDeductionInfo Info(ovl->getNameLoc());
- if (TemplateDeductionResult Result
- = DeduceTemplateArguments(FunctionTemplate, &ExplicitTemplateArgs,
- Specialization, Info,
- /*IsAddressOfFunction*/true)) {
+ if (TemplateDeductionResult Result = DeduceTemplateArguments(
+ FunctionTemplate, &ExplicitTemplateArgs, Specialization, Info,
+ /*IsAddressOfFunction*/ true);
+ Result != TemplateDeductionResult::Success) {
// Make a note of the failed deduction for diagnostics.
if (FailedTSC)
FailedTSC->addCandidate().set(
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 5ab253449211..dde3bd84e89f 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -2326,7 +2326,8 @@ Sema::ActOnObjCForCollectionStmt(SourceLocation ForLoc,
FirstType = QualType();
TemplateDeductionResult Result = DeduceAutoType(
D->getTypeSourceInfo()->getTypeLoc(), DeducedInit, FirstType, Info);
- if (Result != TDK_Success && Result != TDK_AlreadyDiagnosed)
+ if (Result != TemplateDeductionResult::Success &&
+ Result != TemplateDeductionResult::AlreadyDiagnosed)
DiagnoseAutoDeductionFailure(D, DeducedInit);
if (FirstType.isNull()) {
D->setInvalidDecl();
@@ -2394,9 +2395,10 @@ static bool FinishForRangeVarDecl(Sema &SemaRef, VarDecl *Decl, Expr *Init,
SemaRef.Diag(Loc, DiagID) << Init->getType();
} else {
TemplateDeductionInfo Info(Init->getExprLoc());
- Sema::TemplateDeductionResult Result = SemaRef.DeduceAutoType(
+ TemplateDeductionResult Result = SemaRef.DeduceAutoType(
Decl->getTypeSourceInfo()->getTypeLoc(), Init, InitType, Info);
- if (Result != Sema::TDK_Success && Result != Sema::TDK_AlreadyDiagnosed)
+ if (Result != TemplateDeductionResult::Success &&
+ Result != TemplateDeductionResult::AlreadyDiagnosed)
SemaRef.Diag(Loc, DiagID) << Init->getType();
}
@@ -3865,14 +3867,14 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD,
TemplateDeductionResult Res = DeduceAutoType(
OrigResultType, RetExpr, Deduced, Info, /*DependentDeduction=*/false,
/*IgnoreConstraints=*/false, &FailedTSC);
- if (Res != TDK_Success && FD->isInvalidDecl())
+ if (Res != TemplateDeductionResult::Success && FD->isInvalidDecl())
return true;
switch (Res) {
- case TDK_Success:
+ case TemplateDeductionResult::Success:
break;
- case TDK_AlreadyDiagnosed:
+ case TemplateDeductionResult::AlreadyDiagnosed:
return true;
- case TDK_Inconsistent: {
+ case TemplateDeductionResult::Inconsistent: {
// If a function with a declared return type that contains a placeholder
// type has multiple return statements, the return type is deduced for
// each return statement. [...] if the type deduced is not the same in
@@ -4381,6 +4383,7 @@ Sema::ActOnObjCAutoreleasePoolStmt(SourceLocation AtLoc, Stmt *Body) {
namespace {
class CatchHandlerType {
QualType QT;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IsPointer : 1;
// This is a special constructor to be used only with DenseMapInfo's
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index cf781e0e1bf3..9e516da2aa27 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -4863,7 +4863,8 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc,
TemplateDeductionInfo Info(FailedCandidates.getLocation());
if (TemplateDeductionResult Result =
- DeduceTemplateArguments(Partial, CanonicalConverted, Info)) {
+ DeduceTemplateArguments(Partial, CanonicalConverted, Info);
+ Result != TemplateDeductionResult::Success) {
// Store the failed-deduction information for use in diagnostics, later.
// TODO: Actually use the failed-deduction info?
FailedCandidates.addCandidate().set(
@@ -7243,10 +7244,10 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
// along with the other associated constraints after
// checking the template argument list.
/*IgnoreConstraints=*/true);
- if (Result == TDK_AlreadyDiagnosed) {
+ if (Result == TemplateDeductionResult::AlreadyDiagnosed) {
if (ParamType.isNull())
return ExprError();
- } else if (Result != TDK_Success) {
+ } else if (Result != TemplateDeductionResult::Success) {
Diag(Arg->getExprLoc(),
diag::err_non_type_template_parm_type_deduction_failure)
<< Param->getDeclName() << Param->getType() << Arg->getType()
@@ -9644,8 +9645,8 @@ bool Sema::CheckFunctionTemplateSpecialization(
FunctionDecl *Specialization = nullptr;
if (TemplateDeductionResult TDK = DeduceTemplateArguments(
cast<FunctionTemplateDecl>(FunTmpl->getFirstDecl()),
- ExplicitTemplateArgs ? &Args : nullptr, FT, Specialization,
- Info)) {
+ ExplicitTemplateArgs ? &Args : nullptr, FT, Specialization, Info);
+ TDK != TemplateDeductionResult::Success) {
// Template argument deduction failed; record why it failed, so
// that we can provide nifty diagnostics.
FailedCandidates.addCandidate().set(
@@ -9666,7 +9667,8 @@ bool Sema::CheckFunctionTemplateSpecialization(
IdentifyCUDATarget(FD, /* IgnoreImplicitHDAttr = */ true)) {
FailedCandidates.addCandidate().set(
I.getPair(), FunTmpl->getTemplatedDecl(),
- MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
+ MakeDeductionFailureInfo(
+ Context, TemplateDeductionResult::CUDATargetMismatch, Info));
continue;
}
@@ -10816,11 +10818,10 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
TemplateDeductionInfo Info(FailedCandidates.getLocation());
FunctionDecl *Specialization = nullptr;
- if (TemplateDeductionResult TDK
- = DeduceTemplateArguments(FunTmpl,
- (HasExplicitTemplateArgs ? &TemplateArgs
- : nullptr),
- R, Specialization, Info)) {
+ if (TemplateDeductionResult TDK = DeduceTemplateArguments(
+ FunTmpl, (HasExplicitTemplateArgs ? &TemplateArgs : nullptr), R,
+ Specialization, Info);
+ TDK != TemplateDeductionResult::Success) {
// Keep track of almost-matches.
FailedCandidates.addCandidate()
.set(P.getPair(), FunTmpl->getTemplatedDecl(),
@@ -10840,7 +10841,8 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
IdentifyCUDATarget(D.getDeclSpec().getAttributes())) {
FailedCandidates.addCandidate().set(
P.getPair(), FunTmpl->getTemplatedDecl(),
- MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
+ MakeDeductionFailureInfo(
+ Context, TemplateDeductionResult::CUDATargetMismatch, Info));
continue;
}
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index a54ad2797589..47cc22310c4e 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -133,13 +133,13 @@ static bool hasSameExtendedValue(llvm::APSInt X, llvm::APSInt Y) {
return X == Y;
}
-static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
+static TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
Sema &S, TemplateParameterList *TemplateParams, QualType Param,
QualType Arg, TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced, unsigned TDF,
bool PartialOrdering = false, bool DeducedFromArrayBound = false);
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
ArrayRef<TemplateArgument> Ps,
ArrayRef<TemplateArgument> As,
@@ -393,10 +393,11 @@ checkDeducedTemplateArguments(ASTContext &Context,
/// Deduce the value of the given non-type template parameter
/// as the given deduced template argument. All non-type template parameter
/// deduction is funneled through here.
-static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
+static TemplateDeductionResult DeduceNonTypeTemplateArgument(
Sema &S, TemplateParameterList *TemplateParams,
- const NonTypeTemplateParmDecl *NTTP, const DeducedTemplateArgument &NewDeduced,
- QualType ValueType, TemplateDeductionInfo &Info,
+ const NonTypeTemplateParmDecl *NTTP,
+ const DeducedTemplateArgument &NewDeduced, QualType ValueType,
+ TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
assert(NTTP->getDepth() == Info.getDeducedDepth() &&
"deducing non-type template argument with wrong depth");
@@ -407,19 +408,19 @@ static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
Info.Param = const_cast<NonTypeTemplateParmDecl*>(NTTP);
Info.FirstArg = Deduced[NTTP->getIndex()];
Info.SecondArg = NewDeduced;
- return Sema::TDK_Inconsistent;
+ return TemplateDeductionResult::Inconsistent;
}
Deduced[NTTP->getIndex()] = Result;
if (!S.getLangOpts().CPlusPlus17)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (NTTP->isExpandedParameterPack())
// FIXME: We may still need to deduce parts of the type here! But we
// don't have any way to find which slice of the type to use, and the
// type stored on the NTTP itself is nonsense. Perhaps the type of an
// expanded NTTP should be a pack expansion type?
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Get the type of the parameter for deduction. If it's a (dependent) array
// or function type, we will not have decayed it yet, so do that now.
@@ -446,7 +447,7 @@ static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
/// Deduce the value of the given non-type template parameter
/// from the given integral constant.
-static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
+static TemplateDeductionResult DeduceNonTypeTemplateArgument(
Sema &S, TemplateParameterList *TemplateParams,
const NonTypeTemplateParmDecl *NTTP, const llvm::APSInt &Value,
QualType ValueType, bool DeducedFromArrayBound, TemplateDeductionInfo &Info,
@@ -460,7 +461,7 @@ static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
/// Deduce the value of the given non-type template parameter
/// from the given null pointer template argument type.
-static Sema::TemplateDeductionResult DeduceNullPtrTemplateArgument(
+static TemplateDeductionResult DeduceNullPtrTemplateArgument(
Sema &S, TemplateParameterList *TemplateParams,
const NonTypeTemplateParmDecl *NTTP, QualType NullPtrType,
TemplateDeductionInfo &Info,
@@ -481,9 +482,10 @@ static Sema::TemplateDeductionResult DeduceNullPtrTemplateArgument(
/// from the given type- or value-dependent expression.
///
/// \returns true if deduction succeeded, false otherwise.
-static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
+static TemplateDeductionResult DeduceNonTypeTemplateArgument(
Sema &S, TemplateParameterList *TemplateParams,
- const NonTypeTemplateParmDecl *NTTP, Expr *Value, TemplateDeductionInfo &Info,
+ const NonTypeTemplateParmDecl *NTTP, Expr *Value,
+ TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
return DeduceNonTypeTemplateArgument(S, TemplateParams, NTTP,
DeducedTemplateArgument(Value),
@@ -494,7 +496,7 @@ static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
/// from the given declaration.
///
/// \returns true if deduction succeeded, false otherwise.
-static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
+static TemplateDeductionResult DeduceNonTypeTemplateArgument(
Sema &S, TemplateParameterList *TemplateParams,
const NonTypeTemplateParmDecl *NTTP, ValueDecl *D, QualType T,
TemplateDeductionInfo &Info,
@@ -505,25 +507,23 @@ static Sema::TemplateDeductionResult DeduceNonTypeTemplateArgument(
S, TemplateParams, NTTP, DeducedTemplateArgument(New), T, Info, Deduced);
}
-static Sema::TemplateDeductionResult
-DeduceTemplateArguments(Sema &S,
- TemplateParameterList *TemplateParams,
- TemplateName Param,
- TemplateName Arg,
+static TemplateDeductionResult
+DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
+ TemplateName Param, TemplateName Arg,
TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
TemplateDecl *ParamDecl = Param.getAsTemplateDecl();
if (!ParamDecl) {
// The parameter type is dependent and is not a template template parameter,
// so there is nothing that we can deduce.
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
if (TemplateTemplateParmDecl *TempParam
= dyn_cast<TemplateTemplateParmDecl>(ParamDecl)) {
// If we're not deducing at this depth, there's nothing to deduce.
if (TempParam->getDepth() != Info.getDeducedDepth())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
DeducedTemplateArgument NewDeduced(S.Context.getCanonicalTemplateName(Arg));
DeducedTemplateArgument Result = checkDeducedTemplateArguments(S.Context,
@@ -533,21 +533,21 @@ DeduceTemplateArguments(Sema &S,
Info.Param = TempParam;
Info.FirstArg = Deduced[TempParam->getIndex()];
Info.SecondArg = NewDeduced;
- return Sema::TDK_Inconsistent;
+ return TemplateDeductionResult::Inconsistent;
}
Deduced[TempParam->getIndex()] = Result;
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
// Verify that the two template names are equivalent.
if (S.Context.hasSameTemplateName(Param, Arg))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Mismatch of non-dependent template parameter to argument.
Info.FirstArg = TemplateArgument(Param);
Info.SecondArg = TemplateArgument(Arg);
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
/// Deduce the template arguments by comparing the template parameter
@@ -568,7 +568,7 @@ DeduceTemplateArguments(Sema &S,
/// \returns the result of template argument deduction so far. Note that a
/// "success" result means that template argument deduction has not yet failed,
/// but it may still fail, later, for other reasons.
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
const QualType P, QualType A,
TemplateDeductionInfo &Info,
@@ -583,7 +583,7 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
// If the parameter is an alias template, there is nothing to deduce.
if (const auto *TD = TNP.getAsTemplateDecl(); TD && TD->isTypeAlias())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
ArrayRef<TemplateArgument> PResolved = TP->template_arguments();
@@ -600,11 +600,12 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
// If the argument is an alias template, there is nothing to deduce.
if (const auto *TD = TNA.getAsTemplateDecl(); TD && TD->isTypeAlias())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Perform template argument deduction for the template name.
if (auto Result =
- DeduceTemplateArguments(S, TemplateParams, TNP, TNA, Info, Deduced))
+ DeduceTemplateArguments(S, TemplateParams, TNP, TNA, Info, Deduced);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform template argument deduction on each template
// argument. Ignore any missing/extra arguments, since they could be
@@ -623,13 +624,14 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
if (!SA) {
Info.FirstArg = TemplateArgument(P);
Info.SecondArg = TemplateArgument(A);
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
// Perform template argument deduction for the template name.
if (auto Result = DeduceTemplateArguments(
S, TemplateParams, TP->getTemplateName(),
- TemplateName(SA->getSpecializedTemplate()), Info, Deduced))
+ TemplateName(SA->getSpecializedTemplate()), Info, Deduced);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform template argument deduction for the template arguments.
@@ -919,7 +921,7 @@ public:
/// Finish template argument deduction for a set of argument packs,
/// producing the argument packs and checking for consistency with prior
/// deductions.
- Sema::TemplateDeductionResult finish() {
+ TemplateDeductionResult finish() {
// Build argument packs for each of the parameter packs expanded by this
// pack expansion.
for (auto &Pack : Packs) {
@@ -996,7 +998,7 @@ public:
Info.Param = makeTemplateParameter(Param);
Info.FirstArg = OldPack;
Info.SecondArg = NewPack;
- return Sema::TDK_Inconsistent;
+ return TemplateDeductionResult::Inconsistent;
}
// If we have a pre-expanded pack and we didn't deduce enough elements
@@ -1005,14 +1007,14 @@ public:
if (*Expansions != PackElements) {
Info.Param = makeTemplateParameter(Param);
Info.FirstArg = Result;
- return Sema::TDK_IncompletePack;
+ return TemplateDeductionResult::IncompletePack;
}
}
*Loc = Result;
}
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
private:
@@ -1062,15 +1064,13 @@ private:
/// \returns the result of template argument deduction so far. Note that a
/// "success" result means that template argument deduction has not yet failed,
/// but it may still fail, later, for other reasons.
-static Sema::TemplateDeductionResult
-DeduceTemplateArguments(Sema &S,
- TemplateParameterList *TemplateParams,
+static TemplateDeductionResult
+DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
const QualType *Params, unsigned NumParams,
const QualType *Args, unsigned NumArgs,
TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
- unsigned TDF,
- bool PartialOrdering = false) {
+ unsigned TDF, bool PartialOrdering = false) {
// C++0x [temp.deduct.type]p10:
// Similarly, if P has a form that contains (T), then each parameter type
// Pi of the respective parameter-type- list of P is compared with the
@@ -1086,22 +1086,22 @@ DeduceTemplateArguments(Sema &S,
// Make sure we have an argument.
if (ArgIdx >= NumArgs)
- return Sema::TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
if (isa<PackExpansionType>(Args[ArgIdx])) {
// C++0x [temp.deduct.type]p22:
// If the original function parameter associated with A is a function
// parameter pack and the function parameter associated with P is not
// a function parameter pack, then template argument deduction fails.
- return Sema::TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
}
- if (Sema::TemplateDeductionResult Result =
- DeduceTemplateArgumentsByTypeMatch(
- S, TemplateParams, Params[ParamIdx].getUnqualifiedType(),
- Args[ArgIdx].getUnqualifiedType(), Info, Deduced, TDF,
- PartialOrdering,
- /*DeducedFromArrayBound=*/false))
+ if (TemplateDeductionResult Result = DeduceTemplateArgumentsByTypeMatch(
+ S, TemplateParams, Params[ParamIdx].getUnqualifiedType(),
+ Args[ArgIdx].getUnqualifiedType(), Info, Deduced, TDF,
+ PartialOrdering,
+ /*DeducedFromArrayBound=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
++ArgIdx;
@@ -1123,11 +1123,11 @@ DeduceTemplateArguments(Sema &S,
if (ParamIdx + 1 == NumParams || PackScope.hasFixedArity()) {
for (; ArgIdx < NumArgs && PackScope.hasNextElement(); ++ArgIdx) {
// Deduce template arguments from the pattern.
- if (Sema::TemplateDeductionResult Result =
- DeduceTemplateArgumentsByTypeMatch(
- S, TemplateParams, Pattern.getUnqualifiedType(),
- Args[ArgIdx].getUnqualifiedType(), Info, Deduced, TDF,
- PartialOrdering, /*DeducedFromArrayBound=*/false))
+ if (TemplateDeductionResult Result = DeduceTemplateArgumentsByTypeMatch(
+ S, TemplateParams, Pattern.getUnqualifiedType(),
+ Args[ArgIdx].getUnqualifiedType(), Info, Deduced, TDF,
+ PartialOrdering, /*DeducedFromArrayBound=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
PackScope.nextPackElement();
@@ -1160,7 +1160,8 @@ DeduceTemplateArguments(Sema &S,
// Build argument packs for each of the parameter packs expanded by this
// pack expansion.
- if (auto Result = PackScope.finish())
+ if (auto Result = PackScope.finish();
+ Result != TemplateDeductionResult::Success)
return Result;
}
@@ -1172,13 +1173,13 @@ DeduceTemplateArguments(Sema &S,
// Ai is ignored;
if (PartialOrdering && ArgIdx + 1 == NumArgs &&
isa<PackExpansionType>(Args[ArgIdx]))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Make sure we don't have any extra arguments.
if (ArgIdx < NumArgs)
- return Sema::TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Determine whether the parameter has qualifiers that the argument
@@ -1286,7 +1287,7 @@ static CXXRecordDecl *getCanonicalRD(QualType T) {
/// \returns the result of template argument deduction with the bases. "invalid"
/// means no matches, "success" found a single item, and the
/// "MiscellaneousDeductionFailure" result happens when the match is ambiguous.
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
DeduceTemplateBases(Sema &S, const CXXRecordDecl *RD,
TemplateParameterList *TemplateParams, QualType P,
TemplateDeductionInfo &Info,
@@ -1338,13 +1339,13 @@ DeduceTemplateBases(Sema &S, const CXXRecordDecl *RD,
SmallVector<DeducedTemplateArgument, 8> DeducedCopy(Deduced.begin(),
Deduced.end());
TemplateDeductionInfo BaseInfo(TemplateDeductionInfo::ForBase, Info);
- Sema::TemplateDeductionResult BaseResult = DeduceTemplateSpecArguments(
+ TemplateDeductionResult BaseResult = DeduceTemplateSpecArguments(
S, TemplateParams, P, NextT, BaseInfo, DeducedCopy);
// If this was a successful deduction, add it to the list of matches,
// otherwise we need to continue searching its bases.
const CXXRecordDecl *RD = ::getCanonicalRD(NextT);
- if (BaseResult == Sema::TDK_Success)
+ if (BaseResult == TemplateDeductionResult::Success)
Matches.insert({RD, DeducedCopy});
else
AddBases(RD);
@@ -1374,12 +1375,12 @@ DeduceTemplateBases(Sema &S, const CXXRecordDecl *RD,
}
if (Matches.empty())
- return Sema::TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
if (Matches.size() > 1)
- return Sema::TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
std::swap(Matches.front().second, Deduced);
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Deduce the template arguments by comparing the parameter type and
@@ -1406,7 +1407,7 @@ DeduceTemplateBases(Sema &S, const CXXRecordDecl *RD,
/// \returns the result of template argument deduction so far. Note that a
/// "success" result means that template argument deduction has not yet failed,
/// but it may still fail, later, for other reasons.
-static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
+static TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
Sema &S, TemplateParameterList *TemplateParams, QualType P, QualType A,
TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced, unsigned TDF,
@@ -1460,7 +1461,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
PQuals.withoutObjCLifetime() == AQuals.withoutObjCLifetime())) {
Info.FirstArg = TemplateArgument(P);
Info.SecondArg = TemplateArgument(A);
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
}
Qualifiers DiscardedQuals;
@@ -1514,7 +1515,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Just skip any attempts to deduce from a placeholder type or a parameter
// at a different depth.
if (A->isPlaceholderType() || Info.getDeducedDepth() != TTP->getDepth())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
unsigned Index = TTP->getIndex();
@@ -1534,13 +1535,13 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
Info.Param = cast<TemplateTypeParmDecl>(TemplateParams->getParam(Index));
Info.FirstArg = TemplateArgument(P);
Info.SecondArg = TemplateArgument(A);
- return Sema::TDK_Underqualified;
+ return TemplateDeductionResult::Underqualified;
}
// Do not match a function type with a cv-qualified type.
// http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1584
if (A->isFunctionType() && P.hasQualifiers())
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
assert(TTP->getDepth() == Info.getDeducedDepth() &&
"saw template type parameter with wrong depth");
@@ -1568,7 +1569,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
Info.Param = cast<TemplateTypeParmDecl>(TemplateParams->getParam(Index));
Info.FirstArg = TemplateArgument(P);
Info.SecondArg = TemplateArgument(A);
- return Sema::TDK_Underqualified;
+ return TemplateDeductionResult::Underqualified;
}
// Objective-C ARC:
@@ -1588,11 +1589,11 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
Info.Param = cast<TemplateTypeParmDecl>(TemplateParams->getParam(Index));
Info.FirstArg = Deduced[Index];
Info.SecondArg = NewDeduced;
- return Sema::TDK_Inconsistent;
+ return TemplateDeductionResult::Inconsistent;
}
Deduced[Index] = Result;
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
// Set up the template argument deduction information for a failure.
@@ -1604,19 +1605,19 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// at, so we have to wait until all of the parameter packs in this
// expansion have arguments.
if (P->getAs<SubstTemplateTypeParmPackType>())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Check the cv-qualifiers on the parameter and argument types.
if (!(TDF & TDF_IgnoreQualifiers)) {
if (TDF & TDF_ParamWithReferenceType) {
if (hasInconsistentOrSupersetQualifiersOf(P, A))
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
} else if (TDF & TDF_ArgWithReferenceType) {
// C++ [temp.deduct.conv]p4:
// If the original A is a reference type, A can be more cv-qualified
// than the deduced A
if (!A.getQualifiers().compatiblyIncludes(P.getQualifiers()))
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
// Strip out all extra qualifiers from the argument to figure out the
// type we're converting to, prior to the qualification conversion.
@@ -1625,22 +1626,22 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
A = S.Context.getQualifiedType(A, P.getQualifiers());
} else if (!IsPossiblyOpaquelyQualifiedType(P)) {
if (P.getCVRQualifiers() != A.getCVRQualifiers())
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
}
// If the parameter type is not dependent, there is nothing to deduce.
if (!P->isDependentType()) {
if (TDF & TDF_SkipNonDependent)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if ((TDF & TDF_IgnoreQualifiers) ? S.Context.hasSameUnqualifiedType(P, A)
: S.Context.hasSameType(P, A))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (TDF & TDF_AllowCompatibleFunctionType &&
S.isSameOrCompatibleFunctionType(P, A))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (!(TDF & TDF_IgnoreQualifiers))
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
// Otherwise, when ignoring qualifiers, the types not having the same
// unqualified type does not mean they do not match, so in this case we
// must keep going and analyze with a non-dependent parameter type.
@@ -1664,7 +1665,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// There's no corresponding wording for [temp.deduct.decl], but we treat
// it the same to match other compilers.
if (P->isDependentType())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
[[fallthrough]];
case Type::Builtin:
case Type::VariableArray:
@@ -1680,14 +1681,14 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
((TDF & TDF_IgnoreQualifiers)
? S.Context.hasSameUnqualifiedType(P, A)
: S.Context.hasSameType(P, A))
- ? Sema::TDK_Success
- : Sema::TDK_NonDeducedMismatch;
+ ? TemplateDeductionResult::Success
+ : TemplateDeductionResult::NonDeducedMismatch;
// _Complex T [placeholder extension]
case Type::Complex: {
const auto *CP = P->castAs<ComplexType>(), *CA = A->getAs<ComplexType>();
if (!CA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, CP->getElementType(), CA->getElementType(), Info,
Deduced, TDF);
@@ -1697,7 +1698,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
case Type::Atomic: {
const auto *PA = P->castAs<AtomicType>(), *AA = A->getAs<AtomicType>();
if (!AA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, PA->getValueType(), AA->getValueType(), Info,
Deduced, TDF);
@@ -1711,7 +1712,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
} else if (const auto *PA = A->getAs<ObjCObjectPointerType>()) {
PointeeType = PA->getPointeeType();
} else {
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, P->castAs<PointerType>()->getPointeeType(),
@@ -1724,7 +1725,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *RP = P->castAs<LValueReferenceType>(),
*RA = A->getAs<LValueReferenceType>();
if (!RA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, RP->getPointeeType(), RA->getPointeeType(), Info,
@@ -1736,7 +1737,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *RP = P->castAs<RValueReferenceType>(),
*RA = A->getAs<RValueReferenceType>();
if (!RA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, RP->getPointeeType(), RA->getPointeeType(), Info,
@@ -1747,7 +1748,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
case Type::IncompleteArray: {
const auto *IAA = S.Context.getAsIncompleteArrayType(A);
if (!IAA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
const auto *IAP = S.Context.getAsIncompleteArrayType(P);
assert(IAP && "Template parameter not of incomplete array type");
@@ -1763,7 +1764,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
*CAP = S.Context.getAsConstantArrayType(P);
assert(CAP);
if (!CAA || CAA->getSize() != CAP->getSize())
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, CAP->getElementType(), CAA->getElementType(), Info,
@@ -1774,21 +1775,22 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
case Type::DependentSizedArray: {
const auto *AA = S.Context.getAsArrayType(A);
if (!AA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
// Check the element type of the arrays
const auto *DAP = S.Context.getAsDependentSizedArrayType(P);
assert(DAP);
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, DAP->getElementType(), AA->getElementType(),
- Info, Deduced, TDF & TDF_IgnoreQualifiers))
+ Info, Deduced, TDF & TDF_IgnoreQualifiers);
+ Result != TemplateDeductionResult::Success)
return Result;
// Determine the array bound is something we can deduce.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, DAP->getSizeExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// We can perform template argument deduction for the given non-type
// template parameter.
@@ -1806,7 +1808,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, NTTP, DAA->getSizeExpr(), Info, Deduced);
// Incomplete type does not match a dependently-sized array type
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
// type(*)(T)
@@ -1816,30 +1818,32 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *FPP = P->castAs<FunctionProtoType>(),
*FPA = A->getAs<FunctionProtoType>();
if (!FPA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
if (FPP->getMethodQuals() != FPA->getMethodQuals() ||
FPP->getRefQualifier() != FPA->getRefQualifier() ||
FPP->isVariadic() != FPA->isVariadic())
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
// Check return types.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, FPP->getReturnType(), FPA->getReturnType(),
Info, Deduced, 0,
/*PartialOrdering=*/false,
- /*DeducedFromArrayBound=*/false))
+ /*DeducedFromArrayBound=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
// Check parameter types.
if (auto Result = DeduceTemplateArguments(
S, TemplateParams, FPP->param_type_begin(), FPP->getNumParams(),
FPA->param_type_begin(), FPA->getNumParams(), Info, Deduced,
- TDF & TDF_TopLevelParameterTypeList, PartialOrdering))
+ TDF & TDF_TopLevelParameterTypeList, PartialOrdering);
+ Result != TemplateDeductionResult::Success)
return Result;
if (TDF & TDF_AllowCompatibleFunctionType)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// FIXME: Per core-2016/10/1019 (no corresponding core issue yet), permit
// deducing through the noexcept-specifier if it's part of the canonical
@@ -1877,7 +1881,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Careful about [temp.deduct.call] and [temp.deduct.conv], which allow
// top-level differences in noexcept-specifications.
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
case Type::InjectedClassName:
@@ -1901,7 +1905,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
auto Result =
DeduceTemplateSpecArguments(S, TemplateParams, P, A, Info, Deduced);
- if (Result == Sema::TDK_Success)
+ if (Result == TemplateDeductionResult::Success)
return Result;
// We cannot inspect base classes as part of deduction when the type
@@ -1916,7 +1920,8 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Check bases according to C++14 [temp.deduct.call] p4b3:
auto BaseResult = DeduceTemplateBases(S, getCanonicalRD(A),
TemplateParams, P, Info, Deduced);
- return BaseResult != Sema::TDK_Invalid ? BaseResult : Result;
+ return BaseResult != TemplateDeductionResult::Invalid ? BaseResult
+ : Result;
}
// T type::*
@@ -1932,7 +1937,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *MPP = P->castAs<MemberPointerType>(),
*MPA = A->getAs<MemberPointerType>();
if (!MPA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
QualType PPT = MPP->getPointeeType();
if (PPT->isFunctionType())
@@ -1945,7 +1950,8 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
unsigned SubTDF = TDF & TDF_IgnoreQualifiers;
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
- S, TemplateParams, PPT, APT, Info, Deduced, SubTDF))
+ S, TemplateParams, PPT, APT, Info, Deduced, SubTDF);
+ Result != TemplateDeductionResult::Success)
return Result;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, QualType(MPP->getClass(), 0),
@@ -1961,7 +1967,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *BPP = P->castAs<BlockPointerType>(),
*BPA = A->getAs<BlockPointerType>();
if (!BPA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, BPP->getPointeeType(), BPA->getPointeeType(), Info,
Deduced, 0);
@@ -1976,7 +1982,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
if (const auto *VA = A->getAs<ExtVectorType>()) {
// Make sure that the vectors have the same number of elements.
if (VP->getNumElements() != VA->getNumElements())
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
ElementType = VA->getElementType();
} else if (const auto *VA = A->getAs<DependentSizedExtVectorType>()) {
// We can't check the number of elements, since the argument has a
@@ -1984,7 +1990,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// ordering.
ElementType = VA->getElementType();
} else {
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
// Perform deduction on the element types.
return DeduceTemplateArgumentsByTypeMatch(
@@ -1999,14 +2005,15 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Perform deduction on the element types.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, VP->getElementType(), VA->getElementType(),
- Info, Deduced, TDF))
+ Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform deduction on the vector size, if we can.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, VP->getSizeExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
llvm::APSInt ArgSize(S.Context.getTypeSize(S.Context.IntTy), false);
ArgSize = VA->getNumElements();
@@ -2022,20 +2029,21 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Perform deduction on the element types.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, VP->getElementType(), VA->getElementType(),
- Info, Deduced, TDF))
+ Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform deduction on the vector size, if we can.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, VP->getSizeExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
return DeduceNonTypeTemplateArgument(S, TemplateParams, NTTP,
VA->getSizeExpr(), Info, Deduced);
}
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
// (clang extension)
@@ -2048,14 +2056,15 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Perform deduction on the element types.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, VP->getElementType(), VA->getElementType(),
- Info, Deduced, TDF))
+ Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform deduction on the vector size, if we can.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, VP->getSizeExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
llvm::APSInt ArgSize(S.Context.getTypeSize(S.Context.IntTy), false);
ArgSize = VA->getNumElements();
@@ -2071,20 +2080,21 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Perform deduction on the element types.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, VP->getElementType(), VA->getElementType(),
- Info, Deduced, TDF))
+ Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform deduction on the vector size, if we can.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, VP->getSizeExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
return DeduceNonTypeTemplateArgument(S, TemplateParams, NTTP,
VA->getSizeExpr(), Info, Deduced);
}
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
// (clang extension)
@@ -2095,12 +2105,12 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *MP = P->castAs<ConstantMatrixType>(),
*MA = A->getAs<ConstantMatrixType>();
if (!MA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
// Check that the dimensions are the same
if (MP->getNumRows() != MA->getNumRows() ||
MP->getNumColumns() != MA->getNumColumns()) {
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
// Perform deduction on element types.
return DeduceTemplateArgumentsByTypeMatch(
@@ -2112,12 +2122,13 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
const auto *MP = P->castAs<DependentSizedMatrixType>();
const auto *MA = A->getAs<MatrixType>();
if (!MA)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
// Check the element type of the matrixes.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, MP->getElementType(), MA->getElementType(),
- Info, Deduced, TDF))
+ Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Try to deduce a matrix dimension.
@@ -2132,26 +2143,26 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
std::optional<llvm::APSInt> ParamConst =
ParamExpr->getIntegerConstantExpr(S.Context);
if (!ParamConst)
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
if (ACM) {
if ((ACM->*GetArgDimension)() == *ParamConst)
- return Sema::TDK_Success;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::Success;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
Expr *ArgExpr = (ADM->*GetArgDimensionExpr)();
if (std::optional<llvm::APSInt> ArgConst =
ArgExpr->getIntegerConstantExpr(S.Context))
if (*ArgConst == *ParamConst)
- return Sema::TDK_Success;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::Success;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, ParamExpr);
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (ACM) {
llvm::APSInt ArgConst(
@@ -2169,7 +2180,8 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
if (auto Result = DeduceMatrixArg(MP->getRowExpr(), MA,
&ConstantMatrixType::getNumRows,
- &DependentSizedMatrixType::getRowExpr))
+ &DependentSizedMatrixType::getRowExpr);
+ Result != TemplateDeductionResult::Success)
return Result;
return DeduceMatrixArg(MP->getColumnExpr(), MA,
@@ -2187,14 +2199,15 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Perform deduction on the pointer type.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, ASP->getPointeeType(), ASA->getPointeeType(),
- Info, Deduced, TDF))
+ Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform deduction on the address space, if we can.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, ASP->getAddrSpaceExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
return DeduceNonTypeTemplateArgument(
S, TemplateParams, NTTP, ASA->getAddrSpaceExpr(), Info, Deduced);
@@ -2208,33 +2221,34 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
// Perform deduction on the pointer types.
if (auto Result = DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, ASP->getPointeeType(),
- S.Context.removeAddrSpaceQualType(A), Info, Deduced, TDF))
+ S.Context.removeAddrSpaceQualType(A), Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Perform deduction on the address space, if we can.
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, ASP->getAddrSpaceExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
return DeduceNonTypeTemplateArgument(S, TemplateParams, NTTP,
ArgAddressSpace, S.Context.IntTy,
true, Info, Deduced);
}
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
case Type::DependentBitInt: {
const auto *IP = P->castAs<DependentBitIntType>();
if (const auto *IA = A->getAs<BitIntType>()) {
if (IP->isUnsigned() != IA->isUnsigned())
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
const NonTypeTemplateParmDecl *NTTP =
getDeducedParameterFromExpr(Info, IP->getNumBitsExpr());
if (!NTTP)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
llvm::APSInt ArgSize(S.Context.getTypeSize(S.Context.IntTy), false);
ArgSize = IA->getNumBits();
@@ -2246,11 +2260,11 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
if (const auto *IA = A->getAs<DependentBitIntType>()) {
if (IP->isUnsigned() != IA->isUnsigned())
- return Sema::TDK_NonDeducedMismatch;
- return Sema::TDK_Success;
+ return TemplateDeductionResult::NonDeducedMismatch;
+ return TemplateDeductionResult::Success;
}
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
case Type::TypeOfExpr:
@@ -2264,7 +2278,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
case Type::PackExpansion:
case Type::Pipe:
// No template argument deduction for these types
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
case Type::PackIndexing: {
const PackIndexingType *PIT = P->getAs<PackIndexingType>();
@@ -2272,14 +2286,14 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
return DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, PIT->getSelectedType(), A, Info, Deduced, TDF);
}
- return Sema::TDK_IncompletePack;
+ return TemplateDeductionResult::IncompletePack;
}
}
llvm_unreachable("Invalid Type Class!");
}
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
const TemplateArgument &P, TemplateArgument A,
TemplateDeductionInfo &Info,
@@ -2300,7 +2314,7 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
S, TemplateParams, P.getAsType(), A.getAsType(), Info, Deduced, 0);
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
case TemplateArgument::Template:
if (A.getKind() == TemplateArgument::Template)
@@ -2308,7 +2322,7 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
A.getAsTemplate(), Info, Deduced);
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
case TemplateArgument::TemplateExpansion:
llvm_unreachable("caller should handle pack expansions");
@@ -2316,38 +2330,38 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
case TemplateArgument::Declaration:
if (A.getKind() == TemplateArgument::Declaration &&
isSameDeclaration(P.getAsDecl(), A.getAsDecl()))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
case TemplateArgument::NullPtr:
if (A.getKind() == TemplateArgument::NullPtr &&
S.Context.hasSameType(P.getNullPtrType(), A.getNullPtrType()))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
case TemplateArgument::Integral:
if (A.getKind() == TemplateArgument::Integral) {
if (hasSameExtendedValue(P.getAsIntegral(), A.getAsIntegral()))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
case TemplateArgument::StructuralValue:
if (A.getKind() == TemplateArgument::StructuralValue &&
A.structurallyEquals(P))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
case TemplateArgument::Expression:
if (const NonTypeTemplateParmDecl *NTTP =
@@ -2376,13 +2390,13 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
case TemplateArgument::Pack:
Info.FirstArg = P;
Info.SecondArg = A;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
llvm_unreachable("Unknown template argument kind");
}
// Can't deduce anything, but that's okay.
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
case TemplateArgument::Pack:
llvm_unreachable("Argument packs should be expanded by the caller!");
}
@@ -2433,7 +2447,7 @@ static bool hasPackExpansionBeforeEnd(ArrayRef<TemplateArgument> Args) {
return false;
}
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
ArrayRef<TemplateArgument> Ps,
ArrayRef<TemplateArgument> As,
@@ -2445,7 +2459,7 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
// the last template argument, the entire template argument list is a
// non-deduced context.
if (hasPackExpansionBeforeEnd(Ps))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// C++0x [temp.deduct.type]p9:
// If P has a form that contains <T> or <i>, then each argument Pi of the
@@ -2460,18 +2474,19 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
// Check whether we have enough arguments.
if (!hasTemplateArgumentForDeduction(As, ArgIdx))
return NumberOfArgumentsMustMatch
- ? Sema::TDK_MiscellaneousDeductionFailure
- : Sema::TDK_Success;
+ ? TemplateDeductionResult::MiscellaneousDeductionFailure
+ : TemplateDeductionResult::Success;
// C++1z [temp.deduct.type]p9:
// During partial ordering, if Ai was originally a pack expansion [and]
// Pi is not a pack expansion, template argument deduction fails.
if (As[ArgIdx].isPackExpansion())
- return Sema::TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
// Perform deduction for this Pi/Ai pair.
if (auto Result = DeduceTemplateArguments(S, TemplateParams, P,
- As[ArgIdx], Info, Deduced))
+ As[ArgIdx], Info, Deduced);
+ Result != TemplateDeductionResult::Success)
return Result;
// Move to the next argument.
@@ -2499,7 +2514,8 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
++ArgIdx) {
// Deduce template arguments from the pattern.
if (auto Result = DeduceTemplateArguments(S, TemplateParams, Pattern,
- As[ArgIdx], Info, Deduced))
+ As[ArgIdx], Info, Deduced);
+ Result != TemplateDeductionResult::Success)
return Result;
PackScope.nextPackElement();
@@ -2507,11 +2523,12 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
// Build argument packs for each of the parameter packs expanded by this
// pack expansion.
- if (auto Result = PackScope.finish())
+ if (auto Result = PackScope.finish();
+ Result != TemplateDeductionResult::Success)
return Result;
}
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Determine whether two template arguments are the same.
@@ -2773,7 +2790,7 @@ static bool ConvertDeducedTemplateArgument(
// ClassTemplatePartialSpecializationDecl sadly does not derive from
// TemplateDecl.
template <typename TemplateDeclT>
-static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
+static TemplateDeductionResult ConvertDeducedTemplateArguments(
Sema &S, TemplateDeclT *Template, bool IsDeduced,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
TemplateDeductionInfo &Info,
@@ -2792,7 +2809,8 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
// FIXME: Where did the word "trailing" come from?
if (Deduced[I].isNull() && Param->isTemplateParameterPack()) {
if (auto Result =
- PackDeductionScope(S, TemplateParams, Deduced, Info, I).finish())
+ PackDeductionScope(S, TemplateParams, Deduced, Info, I).finish();
+ Result != TemplateDeductionResult::Success)
return Result;
}
@@ -2829,7 +2847,7 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
Info.reset(
TemplateArgumentList::CreateCopy(S.Context, SugaredBuilder),
TemplateArgumentList::CreateCopy(S.Context, CanonicalBuilder));
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
continue;
@@ -2841,7 +2859,7 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
if (!TD) {
assert(isa<ClassTemplatePartialSpecializationDecl>(Template) ||
isa<VarTemplatePartialSpecializationDecl>(Template));
- return Sema::TDK_Incomplete;
+ return TemplateDeductionResult::Incomplete;
}
TemplateArgumentLoc DefArg;
@@ -2871,8 +2889,8 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
TemplateArgumentList::CreateCopy(S.Context, CanonicalBuilder));
if (PartialOverloading) break;
- return HasDefaultArg ? Sema::TDK_SubstitutionFailure
- : Sema::TDK_Incomplete;
+ return HasDefaultArg ? TemplateDeductionResult::SubstitutionFailure
+ : TemplateDeductionResult::Incomplete;
}
// Check whether we can actually use the default argument.
@@ -2884,13 +2902,13 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
// FIXME: These template arguments are temporary. Free them!
Info.reset(TemplateArgumentList::CreateCopy(S.Context, SugaredBuilder),
TemplateArgumentList::CreateCopy(S.Context, CanonicalBuilder));
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
// If we get here, we successfully used the default template argument.
}
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
static DeclContext *getAsDeclContextOrEnclosing(Decl *D) {
@@ -2926,7 +2944,7 @@ bool DeducedArgsNeedReplacement<ClassTemplatePartialSpecializationDecl>(
}
template <typename TemplateDeclT>
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template,
ArrayRef<TemplateArgument> SugaredDeducedArgs,
ArrayRef<TemplateArgument> CanonicalDeducedArgs,
@@ -2959,15 +2977,15 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template,
Info.reset(
TemplateArgumentList::CreateCopy(S.Context, SugaredDeducedArgs),
TemplateArgumentList::CreateCopy(S.Context, CanonicalDeducedArgs));
- return Sema::TDK_ConstraintsNotSatisfied;
+ return TemplateDeductionResult::ConstraintsNotSatisfied;
}
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Complete template argument deduction for a partial specialization.
template <typename T>
static std::enable_if_t<IsPartialSpecialization<T>::value,
- Sema::TemplateDeductionResult>
+ TemplateDeductionResult>
FinishTemplateArgumentDeduction(
Sema &S, T *Partial, bool IsPartialOrdering,
ArrayRef<TemplateArgument> TemplateArgs,
@@ -2986,7 +3004,8 @@ FinishTemplateArgumentDeduction(
SmallVector<TemplateArgument, 4> SugaredBuilder, CanonicalBuilder;
if (auto Result = ConvertDeducedTemplateArguments(
S, Partial, IsPartialOrdering, Deduced, Info, SugaredBuilder,
- CanonicalBuilder))
+ CanonicalBuilder);
+ Result != TemplateDeductionResult::Success)
return Result;
// Form the template argument list from the deduced template arguments.
@@ -3023,7 +3042,7 @@ FinishTemplateArgumentDeduction(
Partial->getTemplateParameters()->getParam(ParamIdx));
Info.Param = makeTemplateParameter(Param);
Info.FirstArg = (*PartialTemplArgInfo)[ArgIdx].getArgument();
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
bool ConstraintsNotSatisfied;
@@ -3033,8 +3052,9 @@ FinishTemplateArgumentDeduction(
Template, Partial->getLocation(), InstArgs, false,
SugaredConvertedInstArgs, CanonicalConvertedInstArgs,
/*UpdateArgsWithConversions=*/true, &ConstraintsNotSatisfied))
- return ConstraintsNotSatisfied ? Sema::TDK_ConstraintsNotSatisfied
- : Sema::TDK_SubstitutionFailure;
+ return ConstraintsNotSatisfied
+ ? TemplateDeductionResult::ConstraintsNotSatisfied
+ : TemplateDeductionResult::SubstitutionFailure;
TemplateParameterList *TemplateParams = Template->getTemplateParameters();
for (unsigned I = 0, E = TemplateParams->size(); I != E; ++I) {
@@ -3044,24 +3064,25 @@ FinishTemplateArgumentDeduction(
Info.Param = makeTemplateParameter(TemplateParams->getParam(I));
Info.FirstArg = TemplateArgs[I];
Info.SecondArg = InstArg;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
}
if (Trap.hasErrorOccurred())
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
if (auto Result = CheckDeducedArgumentConstraints(S, Partial, SugaredBuilder,
- CanonicalBuilder, Info))
+ CanonicalBuilder, Info);
+ Result != TemplateDeductionResult::Success)
return Result;
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Complete template argument deduction for a class or variable template,
/// when partial ordering against a partial specialization.
// FIXME: Factor out duplication with partial specialization version above.
-static Sema::TemplateDeductionResult FinishTemplateArgumentDeduction(
+static TemplateDeductionResult FinishTemplateArgumentDeduction(
Sema &S, TemplateDecl *Template, bool PartialOrdering,
ArrayRef<TemplateArgument> TemplateArgs,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
@@ -3081,7 +3102,8 @@ static Sema::TemplateDeductionResult FinishTemplateArgumentDeduction(
S, Template, /*IsDeduced*/ PartialOrdering, Deduced, Info,
SugaredBuilder, CanonicalBuilder,
/*CurrentInstantiationScope=*/nullptr,
- /*NumAlreadyConverted=*/0U, /*PartialOverloading=*/false))
+ /*NumAlreadyConverted=*/0U, /*PartialOverloading=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
// Check that we produced the correct argument list.
@@ -3093,29 +3115,30 @@ static Sema::TemplateDeductionResult FinishTemplateArgumentDeduction(
Info.Param = makeTemplateParameter(TemplateParams->getParam(I));
Info.FirstArg = TemplateArgs[I];
Info.SecondArg = InstArg;
- return Sema::TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
}
if (Trap.hasErrorOccurred())
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
if (auto Result = CheckDeducedArgumentConstraints(S, Template, SugaredBuilder,
- CanonicalBuilder, Info))
+ CanonicalBuilder, Info);
+ Result != TemplateDeductionResult::Success)
return Result;
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Perform template argument deduction to determine whether
/// the given template arguments match the given class template
/// partial specialization per C++ [temp.class.spec.match].
-Sema::TemplateDeductionResult
+TemplateDeductionResult
Sema::DeduceTemplateArguments(ClassTemplatePartialSpecializationDecl *Partial,
ArrayRef<TemplateArgument> TemplateArgs,
TemplateDeductionInfo &Info) {
if (Partial->isInvalidDecl())
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
// C++ [temp.class.spec.match]p2:
// A partial specialization matches a given actual template
@@ -3137,17 +3160,18 @@ Sema::DeduceTemplateArguments(ClassTemplatePartialSpecializationDecl *Partial,
if (TemplateDeductionResult Result = ::DeduceTemplateArguments(
*this, Partial->getTemplateParameters(),
Partial->getTemplateArgs().asArray(), TemplateArgs, Info, Deduced,
- /*NumberOfArgumentsMustMatch=*/false))
+ /*NumberOfArgumentsMustMatch=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
InstantiatingTemplate Inst(*this, Info.getLocation(), Partial, DeducedArgs,
Info);
if (Inst.isInvalid())
- return TDK_InstantiationDepth;
+ return TemplateDeductionResult::InstantiationDepth;
if (Trap.hasErrorOccurred())
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
TemplateDeductionResult Result;
runWithSufficientStackSpace(Info.getLocation(), [&] {
@@ -3161,12 +3185,12 @@ Sema::DeduceTemplateArguments(ClassTemplatePartialSpecializationDecl *Partial,
/// Perform template argument deduction to determine whether
/// the given template arguments match the given variable template
/// partial specialization per C++ [temp.class.spec.match].
-Sema::TemplateDeductionResult
+TemplateDeductionResult
Sema::DeduceTemplateArguments(VarTemplatePartialSpecializationDecl *Partial,
ArrayRef<TemplateArgument> TemplateArgs,
TemplateDeductionInfo &Info) {
if (Partial->isInvalidDecl())
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
// C++ [temp.class.spec.match]p2:
// A partial specialization matches a given actual template
@@ -3188,17 +3212,18 @@ Sema::DeduceTemplateArguments(VarTemplatePartialSpecializationDecl *Partial,
if (TemplateDeductionResult Result = ::DeduceTemplateArguments(
*this, Partial->getTemplateParameters(),
Partial->getTemplateArgs().asArray(), TemplateArgs, Info, Deduced,
- /*NumberOfArgumentsMustMatch=*/false))
+ /*NumberOfArgumentsMustMatch=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
InstantiatingTemplate Inst(*this, Info.getLocation(), Partial, DeducedArgs,
Info);
if (Inst.isInvalid())
- return TDK_InstantiationDepth;
+ return TemplateDeductionResult::InstantiationDepth;
if (Trap.hasErrorOccurred())
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
TemplateDeductionResult Result;
runWithSufficientStackSpace(Info.getLocation(), [&] {
@@ -3251,9 +3276,9 @@ static bool isSimpleTemplateIdType(QualType T) {
/// \param Info if substitution fails for any reason, this object will be
/// populated with more information about the failure.
///
-/// \returns TDK_Success if substitution was successful, or some failure
-/// condition.
-Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
+/// \returns TemplateDeductionResult::Success if substitution was successful, or
+/// some failure condition.
+TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
FunctionTemplateDecl *FunctionTemplate,
TemplateArgumentListInfo &ExplicitTemplateArgs,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
@@ -3271,7 +3296,7 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
if (FunctionType)
*FunctionType = Function->getType();
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
// Unevaluated SFINAE context.
@@ -3294,7 +3319,7 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
*this, Info.getLocation(), FunctionTemplate, DeducedArgs,
CodeSynthesisContext::ExplicitTemplateArgumentSubstitution, Info);
if (Inst.isInvalid())
- return TDK_InstantiationDepth;
+ return TemplateDeductionResult::InstantiationDepth;
if (CheckTemplateArgumentList(FunctionTemplate, SourceLocation(),
ExplicitTemplateArgs, true, SugaredBuilder,
@@ -3303,9 +3328,9 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
Trap.hasErrorOccurred()) {
unsigned Index = SugaredBuilder.size();
if (Index >= TemplateParams->size())
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
Info.Param = makeTemplateParameter(TemplateParams->getParam(Index));
- return TDK_InvalidExplicitArguments;
+ return TemplateDeductionResult::InvalidExplicitArguments;
}
// Form the template argument list from the explicitly-specified
@@ -3364,7 +3389,7 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
if (SubstParmTypes(Function->getLocation(), Function->parameters(),
Proto->getExtParameterInfosOrNull(), MLTAL, ParamTypes,
/*params=*/nullptr, ExtParamInfos))
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
// Instantiate the return type.
@@ -3390,13 +3415,13 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
SubstType(Proto->getReturnType(), MLTAL,
Function->getTypeSpecStartLoc(), Function->getDeclName());
if (ResultType.isNull() || Trap.hasErrorOccurred())
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
// CUDA: Kernel function must have 'void' return type.
if (getLangOpts().CUDA)
if (Function->hasAttr<CUDAGlobalAttr>() && !ResultType->isVoidType()) {
Diag(Function->getLocation(), diag::err_kern_type_not_void_return)
<< Function->getType() << Function->getSourceRange();
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
}
@@ -3406,7 +3431,7 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
SubstParmTypes(Function->getLocation(), Function->parameters(),
Proto->getExtParameterInfosOrNull(), MLTAL, ParamTypes,
/*params*/ nullptr, ExtParamInfos))
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
if (FunctionType) {
auto EPI = Proto->getExtProtoInfo();
@@ -3426,14 +3451,14 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
/*Pattern=*/nullptr,
/*ForConstraintInstantiation=*/false,
/*SkipForSpecialization=*/true)))
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
*FunctionType = BuildFunctionType(ResultType, ParamTypes,
Function->getLocation(),
Function->getDeclName(),
EPI);
if (FunctionType->isNull() || Trap.hasErrorOccurred())
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
// C++ [temp.arg.explicit]p2:
@@ -3455,23 +3480,24 @@ Sema::TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
Deduced.push_back(Arg);
}
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Check whether the deduced argument type for a call to a function
/// template matches the actual argument type per C++ [temp.deduct.call]p4.
-static Sema::TemplateDeductionResult
+static TemplateDeductionResult
CheckOriginalCallArgDeduction(Sema &S, TemplateDeductionInfo &Info,
Sema::OriginalCallArg OriginalArg,
QualType DeducedA) {
ASTContext &Context = S.Context;
- auto Failed = [&]() -> Sema::TemplateDeductionResult {
+ auto Failed = [&]() -> TemplateDeductionResult {
Info.FirstArg = TemplateArgument(DeducedA);
Info.SecondArg = TemplateArgument(OriginalArg.OriginalArgType);
Info.CallArgIndex = OriginalArg.ArgIdx;
- return OriginalArg.DecomposedParam ? Sema::TDK_DeducedMismatchNested
- : Sema::TDK_DeducedMismatch;
+ return OriginalArg.DecomposedParam
+ ? TemplateDeductionResult::DeducedMismatchNested
+ : TemplateDeductionResult::DeducedMismatch;
};
QualType A = OriginalArg.OriginalArgType;
@@ -3479,7 +3505,7 @@ CheckOriginalCallArgDeduction(Sema &S, TemplateDeductionInfo &Info,
// Check for type equality (top-level cv-qualifiers are ignored).
if (Context.hasSameUnqualifiedType(A, DeducedA))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Strip off references on the argument types; they aren't needed for
// the following checks.
@@ -3503,7 +3529,7 @@ CheckOriginalCallArgDeduction(Sema &S, TemplateDeductionInfo &Info,
// the deduced A can be F.
QualType Tmp;
if (A->isFunctionType() && S.IsFunctionConversion(A, DeducedA, Tmp))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
Qualifiers AQuals = A.getQualifiers();
Qualifiers DeducedAQuals = DeducedA.getQualifiers();
@@ -3544,7 +3570,7 @@ CheckOriginalCallArgDeduction(Sema &S, TemplateDeductionInfo &Info,
(S.IsQualificationConversion(A, DeducedA, false,
ObjCLifetimeConversion) ||
S.IsFunctionConversion(A, DeducedA, ResultTy)))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// - If P is a class and P has the form simple-template-id, then the
// transformed A can be a derived class of the deduced A. [...]
@@ -3565,11 +3591,11 @@ CheckOriginalCallArgDeduction(Sema &S, TemplateDeductionInfo &Info,
}
if (Context.hasSameUnqualifiedType(A, DeducedA))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (A->isRecordType() && isSimpleTemplateIdType(OriginalParamType) &&
S.IsDerivedFrom(Info.getLocation(), A, DeducedA))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
return Failed();
}
@@ -3607,7 +3633,7 @@ static unsigned getPackIndexForParam(Sema &S,
// if `Specialization` is a `CXXConstructorDecl` or `CXXConversionDecl`,
// we'll try to instantiate and update its explicit specifier after constraint
// checking.
-static Sema::TemplateDeductionResult instantiateExplicitSpecifierDeferred(
+static TemplateDeductionResult instantiateExplicitSpecifierDeferred(
Sema &S, FunctionDecl *Specialization,
const MultiLevelTemplateArgumentList &SubstArgs,
TemplateDeductionInfo &Info, FunctionTemplateDecl *FunctionTemplate,
@@ -3626,24 +3652,24 @@ static Sema::TemplateDeductionResult instantiateExplicitSpecifierDeferred(
ExplicitSpecifier ES = GetExplicitSpecifier(Specialization);
Expr *ExplicitExpr = ES.getExpr();
if (!ExplicitExpr)
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (!ExplicitExpr->isValueDependent())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
Sema::InstantiatingTemplate Inst(
S, Info.getLocation(), FunctionTemplate, DeducedArgs,
Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
if (Inst.isInvalid())
- return Sema::TDK_InstantiationDepth;
+ return TemplateDeductionResult::InstantiationDepth;
Sema::SFINAETrap Trap(S);
const ExplicitSpecifier InstantiatedES =
S.instantiateExplicitSpecifier(SubstArgs, ES);
if (InstantiatedES.isInvalid() || Trap.hasErrorOccurred()) {
Specialization->setInvalidDecl(true);
- return Sema::TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
SetExplicitSpecifier(Specialization, InstantiatedES);
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Finish template argument deduction for a function template,
@@ -3652,7 +3678,7 @@ static Sema::TemplateDeductionResult instantiateExplicitSpecifierDeferred(
///
/// \param OriginalCallArgs If non-NULL, the original call arguments against
/// which the deduced argument types should be compared.
-Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
+TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
FunctionTemplateDecl *FunctionTemplate,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
unsigned NumExplicitlySpecified, FunctionDecl *&Specialization,
@@ -3671,7 +3697,7 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
*this, Info.getLocation(), FunctionTemplate, DeducedArgs,
CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
if (Inst.isInvalid())
- return TDK_InstantiationDepth;
+ return TemplateDeductionResult::InstantiationDepth;
ContextRAII SavedContext(*this, FunctionTemplate->getTemplatedDecl());
@@ -3682,7 +3708,8 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
if (auto Result = ConvertDeducedTemplateArguments(
*this, FunctionTemplate, /*IsDeduced*/ true, Deduced, Info,
SugaredBuilder, CanonicalBuilder, CurrentInstantiationScope,
- NumExplicitlySpecified, PartialOverloading))
+ NumExplicitlySpecified, PartialOverloading);
+ Result != TemplateDeductionResult::Success)
return Result;
// C++ [temp.deduct.call]p10: [DR1391]
@@ -3695,7 +3722,7 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
// explicitly-specified template arguments, if the corresponding argument
// A cannot be implicitly converted to P, deduction fails.
if (CheckNonDependent())
- return TDK_NonDependentConversionFailure;
+ return TemplateDeductionResult::NonDependentConversionFailure;
// Form the template argument list from the deduced template arguments.
TemplateArgumentList *SugaredDeducedArgumentList =
@@ -3732,7 +3759,7 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
Specialization = cast_or_null<FunctionDecl>(
SubstDecl(FD, Owner, SubstArgs));
if (!Specialization || Specialization->isInvalidDecl())
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
assert(Specialization->getPrimaryTemplate()->getCanonicalDecl() ==
FunctionTemplate->getCanonicalDecl());
@@ -3749,7 +3776,7 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
// failure.
if (Trap.hasErrorOccurred()) {
Specialization->setInvalidDecl(true);
- return TDK_SubstitutionFailure;
+ return TemplateDeductionResult::SubstitutionFailure;
}
// C++2a [temp.deduct]p5
@@ -3766,12 +3793,12 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
if (CheckInstantiatedFunctionTemplateConstraints(
Info.getLocation(), Specialization, CanonicalBuilder,
Info.AssociatedConstraintsSatisfaction))
- return TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) {
Info.reset(Info.takeSugared(),
TemplateArgumentList::CreateCopy(Context, CanonicalBuilder));
- return TDK_ConstraintsNotSatisfied;
+ return TemplateDeductionResult::ConstraintsNotSatisfied;
}
}
@@ -3779,10 +3806,11 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
// substitution of `FD` before. So, we try to instantiate it back if
// `Specialization` is either a constructor or a conversion function.
if (isa<CXXConstructorDecl, CXXConversionDecl>(Specialization)) {
- if (TDK_Success != instantiateExplicitSpecifierDeferred(
- *this, Specialization, SubstArgs, Info,
- FunctionTemplate, DeducedArgs)) {
- return TDK_SubstitutionFailure;
+ if (TemplateDeductionResult::Success !=
+ instantiateExplicitSpecifierDeferred(*this, Specialization, SubstArgs,
+ Info, FunctionTemplate,
+ DeducedArgs)) {
+ return TemplateDeductionResult::SubstitutionFailure;
}
}
@@ -3829,7 +3857,8 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
}
if (auto TDK =
- CheckOriginalCallArgDeduction(*this, Info, OriginalArg, DeducedA))
+ CheckOriginalCallArgDeduction(*this, Info, OriginalArg, DeducedA);
+ TDK != TemplateDeductionResult::Success)
return TDK;
}
}
@@ -3846,7 +3875,7 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
.append(Info.diag_begin(), Info.diag_end());
}
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Gets the type of a function for template-argument-deducton
@@ -3938,7 +3967,8 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams,
FunctionDecl *Specialization = nullptr;
TemplateDeductionInfo Info(Ovl->getNameLoc());
if (S.DeduceTemplateArguments(FunTmpl, &ExplicitTemplateArgs,
- Specialization, Info))
+ Specialization,
+ Info) != TemplateDeductionResult::Success)
continue;
D = Specialization;
@@ -3968,10 +3998,10 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams,
SmallVector<DeducedTemplateArgument, 8>
Deduced(TemplateParams->size());
TemplateDeductionInfo Info(Ovl->getNameLoc());
- Sema::TemplateDeductionResult Result
- = DeduceTemplateArgumentsByTypeMatch(S, TemplateParams, ParamType,
- ArgType, Info, Deduced, TDF);
- if (Result) continue;
+ TemplateDeductionResult Result = DeduceTemplateArgumentsByTypeMatch(
+ S, TemplateParams, ParamType, ArgType, Info, Deduced, TDF);
+ if (Result != TemplateDeductionResult::Success)
+ continue;
if (!Match.isNull())
return {};
Match = ArgType;
@@ -4084,7 +4114,7 @@ static bool
hasDeducibleTemplateParameters(Sema &S, FunctionTemplateDecl *FunctionTemplate,
QualType T);
-static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
+static TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
Sema &S, TemplateParameterList *TemplateParams, unsigned FirstInnerIndex,
QualType ParamType, QualType ArgType,
Expr::Classification ArgClassification, Expr *Arg,
@@ -4096,7 +4126,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
/// Attempt template argument deduction from an initializer list
/// deemed to be an argument in a function call.
-static Sema::TemplateDeductionResult DeduceFromInitializerList(
+static TemplateDeductionResult DeduceFromInitializerList(
Sema &S, TemplateParameterList *TemplateParams, QualType AdjustedParamType,
InitListExpr *ILE, TemplateDeductionInfo &Info,
SmallVectorImpl<DeducedTemplateArgument> &Deduced,
@@ -4111,7 +4141,7 @@ static Sema::TemplateDeductionResult DeduceFromInitializerList(
//
// We've already removed references and cv-qualifiers here.
if (!ILE->getNumInits())
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
QualType ElTy;
auto *ArrTy = S.Context.getAsArrayType(AdjustedParamType);
@@ -4120,14 +4150,14 @@ static Sema::TemplateDeductionResult DeduceFromInitializerList(
else if (!S.isStdInitializerList(AdjustedParamType, &ElTy)) {
// Otherwise, an initializer list argument causes the parameter to be
// considered a non-deduced context
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
// Resolving a core issue: a braced-init-list containing any designators is
// a non-deduced context.
for (Expr *E : ILE->inits())
if (isa<DesignatedInitExpr>(E))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// Deduction only needs to be done for dependent types.
if (ElTy->isDependentType()) {
@@ -4135,7 +4165,8 @@ static Sema::TemplateDeductionResult DeduceFromInitializerList(
if (auto Result = DeduceTemplateArgumentsFromCallArgument(
S, TemplateParams, 0, ElTy, E->getType(),
E->Classify(S.getASTContext()), E, Info, Deduced,
- OriginalCallArgs, true, ArgIdx, TDF))
+ OriginalCallArgs, true, ArgIdx, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
}
}
@@ -4154,17 +4185,18 @@ static Sema::TemplateDeductionResult DeduceFromInitializerList(
llvm::APInt Size(S.Context.getIntWidth(T), ILE->getNumInits());
if (auto Result = DeduceNonTypeTemplateArgument(
S, TemplateParams, NTTP, llvm::APSInt(Size), T,
- /*ArrayBound=*/true, Info, Deduced))
+ /*ArrayBound=*/true, Info, Deduced);
+ Result != TemplateDeductionResult::Success)
return Result;
}
}
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Perform template argument deduction per [temp.deduct.call] for a
/// single parameter / argument pair.
-static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
+static TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
Sema &S, TemplateParameterList *TemplateParams, unsigned FirstInnerIndex,
QualType ParamType, QualType ArgType,
Expr::Classification ArgClassification, Expr *Arg,
@@ -4181,7 +4213,7 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
if (AdjustFunctionParmAndArgTypesForDeduction(
S, TemplateParams, FirstInnerIndex, ParamType, ArgType,
ArgClassification, Arg, TDF, FailedTSC))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
// If [...] the argument is a non-empty initializer list [...]
if (InitListExpr *ILE = dyn_cast_if_present<InitListExpr>(Arg))
@@ -4221,11 +4253,11 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
/// \param CheckNonDependent A callback to invoke to check conversions for
/// non-dependent parameters, between deduction and substitution, per DR1391.
/// If this returns true, substitution will be skipped and we return
-/// TDK_NonDependentConversionFailure. The callback is passed the parameter
-/// types (after substituting explicit template arguments).
+/// TemplateDeductionResult::NonDependentConversionFailure. The callback is
+/// passed the parameter types (after substituting explicit template arguments).
///
/// \returns the result of template argument deduction.
-Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
+TemplateDeductionResult Sema::DeduceTemplateArguments(
FunctionTemplateDecl *FunctionTemplate,
TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef<Expr *> Args,
FunctionDecl *&Specialization, TemplateDeductionInfo &Info,
@@ -4233,7 +4265,7 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
QualType ObjectType, Expr::Classification ObjectClassification,
llvm::function_ref<bool(ArrayRef<QualType>)> CheckNonDependent) {
if (FunctionTemplate->isInvalidDecl())
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
FunctionDecl *Function = FunctionTemplate->getTemplatedDecl();
unsigned NumParams = Function->getNumParams();
@@ -4252,14 +4284,14 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
// of the call (call it A) as described below.
if (Args.size() < Function->getMinRequiredExplicitArguments() &&
!PartialOverloading)
- return TDK_TooFewArguments;
+ return TemplateDeductionResult::TooFewArguments;
else if (TooManyArguments(NumParams, Args.size() + ExplicitObjectOffset,
PartialOverloading)) {
const auto *Proto = Function->getType()->castAs<FunctionProtoType>();
if (Proto->isTemplateVariadic())
/* Do nothing */;
else if (!Proto->isVariadic())
- return TDK_TooManyArguments;
+ return TemplateDeductionResult::TooManyArguments;
}
// The types of the parameters from which we will perform template argument
@@ -4277,7 +4309,7 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
FunctionTemplate, *ExplicitTemplateArgs, Deduced, ParamTypes, nullptr,
Info);
});
- if (Result)
+ if (Result != TemplateDeductionResult::Success)
return Result;
NumExplicitlySpecified = Deduced.size();
@@ -4297,7 +4329,7 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
// parameter that contains template-parameters that participate in
// template argument deduction ...
if (!hasDeducibleTemplateParameters(*this, FunctionTemplate, ParamType))
- return Sema::TDK_Success;
+ return TemplateDeductionResult::Success;
if (ExplicitObjetArgument) {
// ... with the type of the corresponding argument
@@ -4334,13 +4366,15 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
if (ParamIdx == 0 && HasExplicitObject) {
if (auto Result = DeduceCallArgument(ParamType, 0,
- /*ExplicitObjetArgument=*/true))
+ /*ExplicitObjetArgument=*/true);
+ Result != TemplateDeductionResult::Success)
return Result;
continue;
}
if (auto Result = DeduceCallArgument(ParamType, ArgIdx++,
- /*ExplicitObjetArgument=*/false))
+ /*ExplicitObjetArgument=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
continue;
@@ -4374,7 +4408,8 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
PackScope.nextPackElement(), ++ArgIdx) {
ParamTypesForArgChecking.push_back(ParamPattern);
if (auto Result = DeduceCallArgument(ParamPattern, ArgIdx,
- /*ExplicitObjetArgument=*/false))
+ /*ExplicitObjetArgument=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
}
} else {
@@ -4414,7 +4449,8 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
for (; ArgIdx < PackArgEnd && ArgIdx < Args.size(); ArgIdx++) {
ParamTypesForArgChecking.push_back(ParamPattern);
if (auto Result = DeduceCallArgument(ParamPattern, ArgIdx,
- /*ExplicitObjetArgument=*/false))
+ /*ExplicitObjetArgument=*/false);
+ Result != TemplateDeductionResult::Success)
return Result;
PackScope.nextPackElement();
@@ -4424,7 +4460,8 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
// Build argument packs for each of the parameter packs expanded by this
// pack expansion.
- if (auto Result = PackScope.finish())
+ if (auto Result = PackScope.finish();
+ Result != TemplateDeductionResult::Success)
return Result;
}
@@ -4508,13 +4545,13 @@ QualType Sema::adjustCCAndNoReturn(QualType ArgFunctionType,
/// specialization based on its signature, per [temp.deduct.decl].
///
/// \returns the result of template argument deduction.
-Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
+TemplateDeductionResult Sema::DeduceTemplateArguments(
FunctionTemplateDecl *FunctionTemplate,
TemplateArgumentListInfo *ExplicitTemplateArgs, QualType ArgFunctionType,
FunctionDecl *&Specialization, TemplateDeductionInfo &Info,
bool IsAddressOfFunction) {
if (FunctionTemplate->isInvalidDecl())
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
FunctionDecl *Function = FunctionTemplate->getTemplatedDecl();
TemplateParameterList *TemplateParams
@@ -4533,7 +4570,7 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
FunctionTemplate, *ExplicitTemplateArgs, Deduced, ParamTypes,
&FunctionType, Info);
});
- if (Result)
+ if (Result != TemplateDeductionResult::Success)
return Result;
NumExplicitlySpecified = Deduced.size();
@@ -4566,10 +4603,10 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
unsigned TDF =
TDF_TopLevelParameterTypeList | TDF_AllowCompatibleFunctionType;
// Deduce template arguments from the function type.
- if (TemplateDeductionResult Result
- = DeduceTemplateArgumentsByTypeMatch(*this, TemplateParams,
- FunctionType, ArgFunctionType,
- Info, Deduced, TDF))
+ if (TemplateDeductionResult Result = DeduceTemplateArgumentsByTypeMatch(
+ *this, TemplateParams, FunctionType, ArgFunctionType, Info, Deduced,
+ TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
}
@@ -4579,7 +4616,7 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
NumExplicitlySpecified,
Specialization, Info);
});
- if (Result)
+ if (Result != TemplateDeductionResult::Success)
return Result;
// If the function has a deduced return type, deduce it now, so we can check
@@ -4587,13 +4624,13 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
if (HasDeducedReturnType && IsAddressOfFunction &&
Specialization->getReturnType()->isUndeducedType() &&
DeduceReturnType(Specialization, Info.getLocation(), false))
- return TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
if (IsAddressOfFunction && getLangOpts().CPlusPlus20 &&
Specialization->isImmediateEscalating() &&
CheckIfFunctionSpecializationIsImmediate(Specialization,
Info.getLocation()))
- return TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
// If the function has a dependent exception specification, resolve it now,
// so we can check that the exception specification matches.
@@ -4602,7 +4639,7 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
if (getLangOpts().CPlusPlus17 &&
isUnresolvedExceptionSpec(SpecializationFPT->getExceptionSpecType()) &&
!ResolveExceptionSpec(Info.getLocation(), SpecializationFPT))
- return TDK_MiscellaneousDeductionFailure;
+ return TemplateDeductionResult::MiscellaneousDeductionFailure;
// Adjust the exception specification of the argument to match the
// substituted and resolved type we just formed. (Calling convention and
@@ -4632,22 +4669,22 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
: !Context.hasSameType(SpecializationType, ArgFunctionType)) {
Info.FirstArg = TemplateArgument(SpecializationType);
Info.SecondArg = TemplateArgument(ArgFunctionType);
- return TDK_NonDeducedMismatch;
+ return TemplateDeductionResult::NonDeducedMismatch;
}
}
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
/// Deduce template arguments for a templated conversion
/// function (C++ [temp.deduct.conv]) and, if successful, produce a
/// conversion function template specialization.
-Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
+TemplateDeductionResult Sema::DeduceTemplateArguments(
FunctionTemplateDecl *ConversionTemplate, QualType ObjectType,
Expr::Classification ObjectClassification, QualType ToType,
CXXConversionDecl *&Specialization, TemplateDeductionInfo &Info) {
if (ConversionTemplate->isInvalidDecl())
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
CXXConversionDecl *ConversionGeneric
= cast<CXXConversionDecl>(ConversionTemplate->getTemplatedDecl());
@@ -4749,13 +4786,14 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
*this, TemplateParams, getFirstInnerIndex(ConversionTemplate),
ParamType, ObjectType, ObjectClassification,
/*Arg=*/nullptr, Info, Deduced, OriginalCallArgs,
- /*Decomposed*/ false, 0, /*TDF*/ 0))
+ /*Decomposed*/ false, 0, /*TDF*/ 0);
+ Result != TemplateDeductionResult::Success)
return Result;
}
- if (TemplateDeductionResult Result
- = DeduceTemplateArgumentsByTypeMatch(*this, TemplateParams,
- P, A, Info, Deduced, TDF))
+ if (TemplateDeductionResult Result = DeduceTemplateArgumentsByTypeMatch(
+ *this, TemplateParams, P, A, Info, Deduced, TDF);
+ Result != TemplateDeductionResult::Success)
return Result;
// Create an Instantiation Scope for finalizing the operator.
@@ -4796,11 +4834,12 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
/// naming a function template specialization.
///
/// \returns the result of template argument deduction.
-Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
- FunctionTemplateDecl *FunctionTemplate,
- TemplateArgumentListInfo *ExplicitTemplateArgs,
- FunctionDecl *&Specialization, TemplateDeductionInfo &Info,
- bool IsAddressOfFunction) {
+TemplateDeductionResult
+Sema::DeduceTemplateArguments(FunctionTemplateDecl *FunctionTemplate,
+ TemplateArgumentListInfo *ExplicitTemplateArgs,
+ FunctionDecl *&Specialization,
+ TemplateDeductionInfo &Info,
+ bool IsAddressOfFunction) {
return DeduceTemplateArguments(FunctionTemplate, ExplicitTemplateArgs,
QualType(), Specialization, Info,
IsAddressOfFunction);
@@ -4962,14 +5001,14 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type,
/// should be specified in the 'Info' parameter.
/// \param IgnoreConstraints Set if we should not fail if the deduced type does
/// not satisfy the type-constraint in the auto type.
-Sema::TemplateDeductionResult
+TemplateDeductionResult
Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
TemplateDeductionInfo &Info, bool DependentDeduction,
bool IgnoreConstraints,
TemplateSpecCandidateSet *FailedTSC) {
assert(DependentDeduction || Info.getDeducedDepth() == 0);
if (Init->containsErrors())
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
const AutoType *AT = Type.getType()->getContainedAutoType();
assert(AT);
@@ -4977,7 +5016,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
if (Init->getType()->isNonOverloadPlaceholderType() || AT->isDecltypeAuto()) {
ExprResult NonPlaceholder = CheckPlaceholderExpr(Init);
if (NonPlaceholder.isInvalid())
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
Init = NonPlaceholder.get();
}
@@ -4989,7 +5028,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
Init->containsUnexpandedParameterPack())) {
Result = SubstituteDeducedTypeTransform(*this, DependentResult).Apply(Type);
assert(!Result.isNull() && "substituting DependentTy can't fail");
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
// Make sure that we treat 'char[]' equaly as 'char*' in C23 mode.
@@ -4999,7 +5038,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
TypeLoc TL = TypeLoc(Init->getType(), Type.getOpaqueData());
Result = SubstituteDeducedTypeTransform(*this, DependentResult).Apply(TL);
assert(!Result.isNull() && "substituting DependentTy can't fail");
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
// Emit a warning if 'auto*' is used in pedantic and in C23 mode.
@@ -5011,7 +5050,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
if (!getLangOpts().CPlusPlus && InitList) {
Diag(Init->getBeginLoc(), diag::err_auto_init_list_from_c)
<< (int)AT->getKeyword() << getLangOpts().C23;
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
}
// Deduce type of TemplParam in Func(Init)
@@ -5025,7 +5064,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
Result =
SubstituteDeducedTypeTransform(*this, DependentResult).Apply(Type);
assert(!Result.isNull() && "substituting DependentTy can't fail");
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
return TDK;
};
@@ -5037,7 +5076,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
if (AT->isDecltypeAuto()) {
if (InitList) {
Diag(Init->getBeginLoc(), diag::err_decltype_auto_initializer_list);
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
}
DeducedType = getDecltypeForExpr(Init);
@@ -5060,24 +5099,25 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
// deduce against that. Such deduction only succeeds if removing
// cv-qualifiers and references results in std::initializer_list<T>.
if (!Type.getType().getNonReferenceType()->getAs<AutoType>())
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
SourceRange DeducedFromInitRange;
for (Expr *Init : InitList->inits()) {
// Resolving a core issue: a braced-init-list containing any designators
// is a non-deduced context.
if (isa<DesignatedInitExpr>(Init))
- return TDK_Invalid;
+ return TemplateDeductionResult::Invalid;
if (auto TDK = DeduceTemplateArgumentsFromCallArgument(
*this, TemplateParamsSt.get(), 0, TemplArg, Init->getType(),
Init->Classify(getASTContext()), Init, Info, Deduced,
OriginalCallArgs, /*Decomposed=*/true,
- /*ArgIdx=*/0, /*TDF=*/0)) {
- if (TDK == TDK_Inconsistent) {
+ /*ArgIdx=*/0, /*TDF=*/0);
+ TDK != TemplateDeductionResult::Success) {
+ if (TDK == TemplateDeductionResult::Inconsistent) {
Diag(Info.getLocation(), diag::err_auto_inconsistent_deduction)
<< Info.FirstArg << Info.SecondArg << DeducedFromInitRange
<< Init->getSourceRange();
- return DeductionFailed(TDK_AlreadyDiagnosed);
+ return DeductionFailed(TemplateDeductionResult::AlreadyDiagnosed);
}
return DeductionFailed(TDK);
}
@@ -5089,7 +5129,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
} else {
if (!getLangOpts().CPlusPlus && Init->refersToBitField()) {
Diag(Loc, diag::err_auto_bitfield);
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
}
QualType FuncParam =
SubstituteDeducedTypeTransform(*this, TemplArg).Apply(Type);
@@ -5099,19 +5139,20 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
*this, TemplateParamsSt.get(), 0, FuncParam, Init->getType(),
Init->Classify(getASTContext()), Init, Info, Deduced,
OriginalCallArgs, /*Decomposed=*/false, /*ArgIdx=*/0, /*TDF=*/0,
- FailedTSC))
+ FailedTSC);
+ TDK != TemplateDeductionResult::Success)
return DeductionFailed(TDK);
}
// Could be null if somehow 'auto' appears in a non-deduced context.
if (Deduced[0].getKind() != TemplateArgument::Type)
- return DeductionFailed(TDK_Incomplete);
+ return DeductionFailed(TemplateDeductionResult::Incomplete);
DeducedType = Deduced[0].getAsType();
if (InitList) {
DeducedType = BuildStdInitializerList(DeducedType, Loc);
if (DeducedType.isNull())
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
}
}
@@ -5119,7 +5160,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
if (!Context.hasSameType(DeducedType, Result)) {
Info.FirstArg = Result;
Info.SecondArg = DeducedType;
- return DeductionFailed(TDK_Inconsistent);
+ return DeductionFailed(TemplateDeductionResult::Inconsistent);
}
DeducedType = Context.getCommonSugaredType(Result, DeducedType);
}
@@ -5127,11 +5168,11 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
if (AT->isConstrained() && !IgnoreConstraints &&
CheckDeducedPlaceholderConstraints(
*this, *AT, Type.getContainedAutoTypeLoc(), DeducedType))
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
Result = SubstituteDeducedTypeTransform(*this, DeducedType).Apply(Type);
if (Result.isNull())
- return TDK_AlreadyDiagnosed;
+ return TemplateDeductionResult::AlreadyDiagnosed;
// Check that the deduced argument type is compatible with the original
// argument type per C++ [temp.deduct.call]p4.
@@ -5140,13 +5181,14 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
assert((bool)InitList == OriginalArg.DecomposedParam &&
"decomposed non-init-list in auto deduction?");
if (auto TDK =
- CheckOriginalCallArgDeduction(*this, Info, OriginalArg, DeducedA)) {
+ CheckOriginalCallArgDeduction(*this, Info, OriginalArg, DeducedA);
+ TDK != TemplateDeductionResult::Success) {
Result = QualType();
return DeductionFailed(TDK);
}
}
- return TDK_Success;
+ return TemplateDeductionResult::Success;
}
QualType Sema::SubstAutoType(QualType TypeWithAuto,
@@ -5403,7 +5445,8 @@ static bool isAtLeastAsSpecializedAs(Sema &S,
if (DeduceTemplateArguments(S, TemplateParams, Args2.data(), Args2.size(),
Args1.data(), Args1.size(), Info, Deduced,
- TDF_None, /*PartialOrdering=*/true))
+ TDF_None, /*PartialOrdering=*/true) !=
+ TemplateDeductionResult::Success)
return false;
break;
@@ -5415,17 +5458,17 @@ static bool isAtLeastAsSpecializedAs(Sema &S,
if (DeduceTemplateArgumentsByTypeMatch(
S, TemplateParams, Proto2->getReturnType(), Proto1->getReturnType(),
Info, Deduced, TDF_None,
- /*PartialOrdering=*/true))
+ /*PartialOrdering=*/true) != TemplateDeductionResult::Success)
return false;
break;
case TPOC_Other:
// - In other contexts (14.6.6.2) the function template's function type
// is used.
- if (DeduceTemplateArgumentsByTypeMatch(S, TemplateParams,
- FD2->getType(), FD1->getType(),
- Info, Deduced, TDF_None,
- /*PartialOrdering=*/true))
+ if (DeduceTemplateArgumentsByTypeMatch(
+ S, TemplateParams, FD2->getType(), FD1->getType(), Info, Deduced,
+ TDF_None,
+ /*PartialOrdering=*/true) != TemplateDeductionResult::Success)
return false;
break;
}
@@ -5599,9 +5642,12 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate(
Sema::TPL_TemplateParamsEquivalent))
return nullptr;
+ // [dcl.fct]p5:
+ // Any top-level cv-qualifiers modifying a parameter type are deleted when
+ // forming the function type.
for (unsigned i = 0; i < NumParams1; ++i)
- if (!Context.hasSameType(FD1->getParamDecl(i)->getType(),
- FD2->getParamDecl(i)->getType()))
+ if (!Context.hasSameUnqualifiedType(FD1->getParamDecl(i)->getType(),
+ FD2->getParamDecl(i)->getType()))
return nullptr;
// C++20 [temp.func.order]p6.3:
@@ -5776,9 +5822,9 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
// Determine whether P1 is at least as specialized as P2.
Deduced.resize(P2->getTemplateParameters()->size());
- if (DeduceTemplateArgumentsByTypeMatch(S, P2->getTemplateParameters(),
- T2, T1, Info, Deduced, TDF_None,
- /*PartialOrdering=*/true))
+ if (DeduceTemplateArgumentsByTypeMatch(
+ S, P2->getTemplateParameters(), T2, T1, Info, Deduced, TDF_None,
+ /*PartialOrdering=*/true) != TemplateDeductionResult::Success)
return false;
SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(),
@@ -5791,9 +5837,10 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
const auto *TST1 = cast<TemplateSpecializationType>(T1);
bool AtLeastAsSpecialized;
S.runWithSufficientStackSpace(Info.getLocation(), [&] {
- AtLeastAsSpecialized = !FinishTemplateArgumentDeduction(
- S, P2, /*IsPartialOrdering=*/true, TST1->template_arguments(), Deduced,
- Info);
+ AtLeastAsSpecialized =
+ FinishTemplateArgumentDeduction(
+ S, P2, /*IsPartialOrdering=*/true, TST1->template_arguments(),
+ Deduced, Info) == TemplateDeductionResult::Success;
});
return AtLeastAsSpecialized;
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 6d59180bc446..371378485626 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -3693,9 +3693,9 @@ bool Sema::usesPartialOrExplicitSpecialization(
->getPartialSpecializations(PartialSpecs);
for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) {
TemplateDeductionInfo Info(Loc);
- if (!DeduceTemplateArguments(PartialSpecs[I],
- ClassTemplateSpec->getTemplateArgs().asArray(),
- Info))
+ if (DeduceTemplateArguments(PartialSpecs[I],
+ ClassTemplateSpec->getTemplateArgs().asArray(),
+ Info) == TemplateDeductionResult::Success)
return true;
}
@@ -3739,8 +3739,9 @@ getPatternForClassTemplateSpecialization(
for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) {
ClassTemplatePartialSpecializationDecl *Partial = PartialSpecs[I];
TemplateDeductionInfo Info(FailedCandidates.getLocation());
- if (Sema::TemplateDeductionResult Result = S.DeduceTemplateArguments(
- Partial, ClassTemplateSpec->getTemplateArgs().asArray(), Info)) {
+ if (TemplateDeductionResult Result = S.DeduceTemplateArguments(
+ Partial, ClassTemplateSpec->getTemplateArgs().asArray(), Info);
+ Result != TemplateDeductionResult::Success) {
// Store the failed-deduction information for use in diagnostics, later.
// TODO: Actually use the failed-deduction info?
FailedCandidates.addCandidate().set(
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 3ed17c3360a8..6e5ae123a6ba 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -27,6 +27,7 @@
#include "clang/AST/Stmt.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtObjC.h"
+#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/Basic/DiagnosticParse.h"
#include "clang/Basic/OpenMPKinds.h"
@@ -3995,6 +3996,13 @@ public:
return getSema().CreateRecoveryExpr(BeginLoc, EndLoc, SubExprs, Type);
}
+ StmtResult RebuildOpenACCComputeConstruct(OpenACCDirectiveKind K,
+ SourceLocation BeginLoc,
+ SourceLocation EndLoc,
+ StmtResult StrBlock) {
+ llvm_unreachable("Not yet implemented!");
+ }
+
private:
TypeLoc TransformTypeInObjectScope(TypeLoc TL,
QualType ObjectType,
@@ -10994,6 +11002,21 @@ OMPClause *TreeTransform<Derived>::TransformOMPXBareClause(OMPXBareClause *C) {
}
//===----------------------------------------------------------------------===//
+// OpenACC transformation
+//===----------------------------------------------------------------------===//
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOpenACCComputeConstruct(
+ OpenACCComputeConstruct *C) {
+ // TODO OpenACC: Transform clauses.
+
+ // Transform Structured Block.
+ StmtResult StrBlock = getDerived().TransformStmt(C->getStructuredBlock());
+
+ return getDerived().RebuildOpenACCComputeConstruct(
+ C->getDirectiveKind(), C->getBeginLoc(), C->getEndLoc(), StrBlock);
+}
+
+//===----------------------------------------------------------------------===//
// Expression transformation
//===----------------------------------------------------------------------===//
template<typename Derived>
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index c9217f7aac08..eea14a66fa18 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -987,9 +987,13 @@ ASTIdentifierLookupTraitBase::ReadKey(const unsigned char* d, unsigned n) {
/// Whether the given identifier is "interesting".
static bool isInterestingIdentifier(ASTReader &Reader, IdentifierInfo &II,
bool IsModule) {
+ bool IsInteresting =
+ II.getInterestingIdentifierID() !=
+ tok::InterestingIdentifierKind::not_interesting ||
+ II.getBuiltinID() != Builtin::ID::NotBuiltin ||
+ II.getObjCKeywordID() != tok::ObjCKeywordKind::objc_not_keyword;
return II.hadMacroDefinition() || II.isPoisoned() ||
- (!IsModule && II.getObjCOrBuiltinID()) ||
- II.hasRevertedTokenIDToIdentifier() ||
+ (!IsModule && IsInteresting) || II.hasRevertedTokenIDToIdentifier() ||
(!(IsModule && Reader.getPreprocessor().getLangOpts().CPlusPlus) &&
II.getFETokenInfo());
}
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index d79f194fd16c..3da44ffccc38 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2789,6 +2789,26 @@ void ASTStmtReader::VisitOMPTargetParallelGenericLoopDirective(
}
//===----------------------------------------------------------------------===//
+// OpenACC Constructs/Directives.
+//===----------------------------------------------------------------------===//
+void ASTStmtReader::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) {
+ S->Kind = Record.readEnum<OpenACCDirectiveKind>();
+ S->Range = Record.readSourceRange();
+ // TODO OpenACC: Deserialize Clauses.
+}
+
+void ASTStmtReader::VisitOpenACCAssociatedStmtConstruct(
+ OpenACCAssociatedStmtConstruct *S) {
+ VisitOpenACCConstructStmt(S);
+ S->setAssociatedStmt(Record.readSubStmt());
+}
+
+void ASTStmtReader::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) {
+ VisitStmt(S);
+ VisitOpenACCConstructStmt(S);
+}
+
+//===----------------------------------------------------------------------===//
// ASTReader Implementation
//===----------------------------------------------------------------------===//
@@ -4206,6 +4226,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
S = new (Context) ConceptSpecializationExpr(Empty);
break;
}
+ case STMT_OPENACC_COMPUTE_CONSTRUCT:
+ S = OpenACCComputeConstruct::CreateEmpty(Context, Empty);
+ break;
case EXPR_REQUIRES:
unsigned numLocalParameters = Record[ASTStmtReader::NumExprFields];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 83ad4cb5b560..7966b3175ec9 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -3597,8 +3597,13 @@ class ASTIdentifierTableTrait {
/// doesn't check whether the name has macros defined; use PublicMacroIterator
/// to check that.
bool isInterestingIdentifier(const IdentifierInfo *II, uint64_t MacroOffset) {
- if (MacroOffset || II->isPoisoned() ||
- (!IsModule && II->getObjCOrBuiltinID()) ||
+ II->getObjCOrBuiltinID();
+ bool IsInteresting =
+ II->getInterestingIdentifierID() !=
+ tok::InterestingIdentifierKind::not_interesting ||
+ II->getBuiltinID() != Builtin::ID::NotBuiltin ||
+ II->getObjCKeywordID() != tok::ObjCKeywordKind::objc_not_keyword;
+ if (MacroOffset || II->isPoisoned() || (!IsModule && IsInteresting) ||
II->hasRevertedTokenIDToIdentifier() ||
(NeedDecls && II->getFETokenInfo()))
return true;
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 5b0b90234c41..484621ae8130 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2839,6 +2839,27 @@ void ASTStmtWriter::VisitOMPTargetParallelGenericLoopDirective(
}
//===----------------------------------------------------------------------===//
+// OpenACC Constructs/Directives.
+//===----------------------------------------------------------------------===//
+void ASTStmtWriter::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) {
+ Record.writeEnum(S->Kind);
+ Record.AddSourceRange(S->Range);
+ // TODO OpenACC: Serialize Clauses.
+}
+
+void ASTStmtWriter::VisitOpenACCAssociatedStmtConstruct(
+ OpenACCAssociatedStmtConstruct *S) {
+ VisitOpenACCConstructStmt(S);
+ Record.AddStmt(S->getAssociatedStmt());
+}
+
+void ASTStmtWriter::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) {
+ VisitStmt(S);
+ VisitOpenACCConstructStmt(S);
+ Code = serialization::STMT_OPENACC_COMPUTE_CONSTRUCT;
+}
+
+//===----------------------------------------------------------------------===//
// ASTWriter Implementation
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index 0c6293e67a86..6b8ac2629453 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -2023,13 +2023,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
{{EOFv, EOFv}, {0, UCharRangeMax}},
"an unsigned char value or EOF")));
- // The getc() family of functions that returns either a char or an EOF.
- addToFunctionSummaryMap(
- {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}),
- Summary(NoEvalCall)
- .Case({ReturnValueCondition(WithinRange,
- {{EOFv, EOFv}, {0, UCharRangeMax}})},
- ErrnoIrrelevant));
addToFunctionSummaryMap(
"getchar", Signature(ArgTypes{}, RetType{IntTy}),
Summary(NoEvalCall)
@@ -2139,7 +2132,17 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
std::move(GetenvSummary));
}
- if (ModelPOSIX) {
+ if (!ModelPOSIX) {
+ // Without POSIX use of 'errno' is not specified (in these cases).
+ // Add these functions without 'errno' checks.
+ addToFunctionSummaryMap(
+ {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}),
+ Summary(NoEvalCall)
+ .Case({ReturnValueCondition(WithinRange,
+ {{EOFv, EOFv}, {0, UCharRangeMax}})},
+ ErrnoIrrelevant)
+ .ArgConstraint(NotNull(ArgNo(0))));
+ } else {
const auto ReturnsZeroOrMinusOne =
ConstraintSet{ReturnValueCondition(WithinRange, Range(-1, 0))};
const auto ReturnsZero =
@@ -2231,6 +2234,63 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
.Case(ReturnsMinusOne, ErrnoNEZeroIrrelevant, GenericFailureMsg)
.ArgConstraint(NotNull(ArgNo(0))));
+ std::optional<QualType> Off_tTy = lookupTy("off_t");
+ std::optional<RangeInt> Off_tMax = getMaxValue(Off_tTy);
+
+ // int fgetc(FILE *stream);
+ // 'getc' is the same as 'fgetc' but may be a macro
+ addToFunctionSummaryMap(
+ {"getc", "fgetc"}, Signature(ArgTypes{FilePtrTy}, RetType{IntTy}),
+ Summary(NoEvalCall)
+ .Case({ReturnValueCondition(WithinRange, {{0, UCharRangeMax}})},
+ ErrnoMustNotBeChecked, GenericSuccessMsg)
+ .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))},
+ ErrnoIrrelevant, GenericFailureMsg)
+ .ArgConstraint(NotNull(ArgNo(0))));
+
+ // int fputc(int c, FILE *stream);
+ // 'putc' is the same as 'fputc' but may be a macro
+ addToFunctionSummaryMap(
+ {"putc", "fputc"},
+ Signature(ArgTypes{IntTy, FilePtrTy}, RetType{IntTy}),
+ Summary(NoEvalCall)
+ .Case({ArgumentCondition(0, WithinRange, Range(0, UCharRangeMax)),
+ ReturnValueCondition(BO_EQ, ArgNo(0))},
+ ErrnoMustNotBeChecked, GenericSuccessMsg)
+ .Case({ArgumentCondition(0, OutOfRange, Range(0, UCharRangeMax)),
+ ReturnValueCondition(WithinRange, Range(0, UCharRangeMax))},
+ ErrnoMustNotBeChecked, GenericSuccessMsg)
+ .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))},
+ ErrnoNEZeroIrrelevant, GenericFailureMsg)
+ .ArgConstraint(NotNull(ArgNo(1))));
+
+ // char *fgets(char *restrict s, int n, FILE *restrict stream);
+ addToFunctionSummaryMap(
+ "fgets",
+ Signature(ArgTypes{CharPtrRestrictTy, IntTy, FilePtrRestrictTy},
+ RetType{CharPtrTy}),
+ Summary(NoEvalCall)
+ .Case({ReturnValueCondition(BO_EQ, ArgNo(0))},
+ ErrnoMustNotBeChecked, GenericSuccessMsg)
+ .Case({IsNull(Ret)}, ErrnoIrrelevant, GenericFailureMsg)
+ .ArgConstraint(NotNull(ArgNo(0)))
+ .ArgConstraint(ArgumentCondition(1, WithinRange, Range(0, IntMax)))
+ .ArgConstraint(
+ BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1)))
+ .ArgConstraint(NotNull(ArgNo(2))));
+
+ // int fputs(const char *restrict s, FILE *restrict stream);
+ addToFunctionSummaryMap(
+ "fputs",
+ Signature(ArgTypes{ConstCharPtrRestrictTy, FilePtrRestrictTy},
+ RetType{IntTy}),
+ Summary(NoEvalCall)
+ .Case(ReturnsNonnegative, ErrnoMustNotBeChecked, GenericSuccessMsg)
+ .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))},
+ ErrnoNEZeroIrrelevant, GenericFailureMsg)
+ .ArgConstraint(NotNull(ArgNo(0)))
+ .ArgConstraint(NotNull(ArgNo(1))));
+
// int ungetc(int c, FILE *stream);
addToFunctionSummaryMap(
"ungetc", Signature(ArgTypes{IntTy, FilePtrTy}, RetType{IntTy}),
@@ -2250,9 +2310,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
0, WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}}))
.ArgConstraint(NotNull(ArgNo(1))));
- std::optional<QualType> Off_tTy = lookupTy("off_t");
- std::optional<RangeInt> Off_tMax = getMaxValue(Off_tTy);
-
// int fseek(FILE *stream, long offset, int whence);
// FIXME: It can be possible to get the 'SEEK_' values (like EOFv) and use
// these for condition of arg 2.
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index d2b663410580..96784d42d09f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -84,6 +84,7 @@ std::optional<bool> isRefCountable(const CXXRecordDecl* R)
if (AnyInconclusiveBase)
return std::nullopt;
+ Paths.clear();
const auto hasPublicDerefInBase =
[&AnyInconclusiveBase](const CXXBaseSpecifier *Base, CXXBasePath &) {
auto hasDerefInBase = clang::hasPublicMethodInBase(Base, "deref");
@@ -154,6 +155,7 @@ std::optional<bool> isGetterOfRefCounted(const CXXMethodDecl* M)
if (((className == "Ref" || className == "RefPtr") &&
methodName == "get") ||
+ (className == "Ref" && methodName == "ptr") ||
((className == "String" || className == "AtomString" ||
className == "AtomStringImpl" || className == "UniqueString" ||
className == "UniqueStringImpl" || className == "Identifier") &&
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 31ccae8b097b..f4e6191cf05a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -91,6 +91,9 @@ public:
const auto *Arg = CE->getArg(ArgIdx);
+ if (auto *defaultArg = dyn_cast<CXXDefaultArgExpr>(Arg))
+ Arg = defaultArg->getExpr();
+
std::pair<const clang::Expr *, bool> ArgOrigin =
tryToFindPtrOrigin(Arg, true);
@@ -125,6 +128,16 @@ public:
// of object on LHS.
if (auto *MemberOp = dyn_cast<CXXOperatorCallExpr>(CE)) {
// Note: assignemnt to built-in type isn't derived from CallExpr.
+ if (MemberOp->getOperator() ==
+ OO_Equal) { // Ignore assignment to Ref/RefPtr.
+ auto *callee = MemberOp->getDirectCallee();
+ if (auto *calleeDecl = dyn_cast<CXXMethodDecl>(callee)) {
+ if (const CXXRecordDecl *classDecl = calleeDecl->getParent()) {
+ if (isRefCounted(classDecl))
+ return true;
+ }
+ }
+ }
if (MemberOp->isAssignmentOp())
return false;
}
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index ccc3c0f1e0c1..09c69f9612d9 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1821,6 +1821,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
case Stmt::OMPParallelGenericLoopDirectiveClass:
case Stmt::OMPTargetParallelGenericLoopDirectiveClass:
case Stmt::CapturedStmtClass:
+ case Stmt::OpenACCComputeConstructClass:
case Stmt::OMPUnrollDirectiveClass:
case Stmt::OMPMetaDirectiveClass: {
const ExplodedNode *node = Bldr.generateSink(S, Pred, Pred->getState());
diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
index 995d8b2899c8..5a9e563c2d5b 100644
--- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
+++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
@@ -430,14 +430,14 @@ void ModuleDepCollectorPP::LexedFileChanged(FileID FID,
void ModuleDepCollectorPP::InclusionDirective(
SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName,
bool IsAngled, CharSourceRange FilenameRange, OptionalFileEntryRef File,
- StringRef SearchPath, StringRef RelativePath, const Module *Imported,
- SrcMgr::CharacteristicKind FileType) {
- if (!File && !Imported) {
+ StringRef SearchPath, StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported, SrcMgr::CharacteristicKind FileType) {
+ if (!File && !ModuleImported) {
// This is a non-modular include that HeaderSearch failed to find. Add it
// here as `FileChanged` will never see it.
MDC.addFileDep(FileName);
}
- handleImport(Imported);
+ handleImport(SuggestedModule);
}
void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc,
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index e14ff34dd737..3c06ab5fbe36 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s
-// RUN: %clang_cc1 -verify=ref %s
-// RUN: %clang_cc1 -verify=ref -std=c++20 %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+// RUN: %clang_cc1 -verify=ref,both -std=c++20 %s
constexpr int m = 3;
constexpr const int *foo[][5] = {
@@ -73,53 +73,40 @@ static_assert(getElementFromEnd(data, 5, 0) == 1, "");
static_assert(getElementFromEnd(data, 5, 4) == 5, "");
constexpr int getFirstElem(const int *a) {
- return a[0]; // expected-note {{read of dereferenced null pointer}} \
- // ref-note {{read of dereferenced null pointer}}
+ return a[0]; // both-note {{read of dereferenced null pointer}}
}
-static_assert(getFirstElem(nullptr) == 1, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+static_assert(getFirstElem(nullptr) == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
constexpr static int arr[2] = {1,2};
constexpr static int arr2[2] = {3,4};
constexpr int *p1 = nullptr;
-constexpr int *p2 = p1 + 1; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot perform pointer arithmetic on null pointer}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot perform pointer arithmetic on null pointer}}
+constexpr int *p2 = p1 + 1; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot perform pointer arithmetic on null pointer}}
constexpr int *p3 = p1 + 0;
constexpr int *p4 = p1 - 0;
constexpr int *p5 = 0 + p1;
-constexpr int *p6 = 0 - p1; // expected-error {{invalid operands to binary expression}} \
- // ref-error {{invalid operands to binary expression}}
+constexpr int *p6 = 0 - p1; // both-error {{invalid operands to binary expression}}
constexpr int const * ap1 = &arr[0];
-constexpr int const * ap2 = ap1 + 3; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot refer to element 3 of array of 2}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot refer to element 3 of array of 2}}
-
-constexpr auto ap3 = arr - 1; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot refer to element -1}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot refer to element -1}}
+constexpr int const * ap2 = ap1 + 3; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot refer to element 3 of array of 2}}
+
+constexpr auto ap3 = arr - 1; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot refer to element -1}}
constexpr int k1 = &arr[1] - &arr[0];
static_assert(k1 == 1, "");
static_assert((&arr[0] - &arr[1]) == -1, "");
-constexpr int k2 = &arr2[1] - &arr[0]; // expected-error {{must be initialized by a constant expression}} \
- // ref-error {{must be initialized by a constant expression}}
+constexpr int k2 = &arr2[1] - &arr[0]; // both-error {{must be initialized by a constant expression}}
static_assert((arr + 0) == arr, "");
static_assert(&arr[0] == arr, "");
static_assert(*(&arr[0]) == 1, "");
static_assert(*(&arr[1]) == 2, "");
-constexpr const int *OOB = (arr + 3) - 3; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot refer to element 3 of array of 2}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot refer to element 3 of array of 2}}
+constexpr const int *OOB = (arr + 3) - 3; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot refer to element 3 of array of 2}}
template<typename T>
constexpr T getElementOf(T* array, int i) {
@@ -135,11 +122,8 @@ constexpr T& getElementOfArray(T (&array)[N], int I) {
static_assert(getElementOfArray(foo[2], 3) == &m, "");
-static_assert(data[0] == 4, ""); // expected-error{{failed}} \
- // expected-note{{5 == 4}} \
- // ref-error{{failed}} \
- // ref-note{{5 == 4}}
-
+static_assert(data[0] == 4, ""); // both-error{{failed}} \
+ // both-note{{5 == 4}}
constexpr int dynamic[] = {
f, 3, 2 + 5, data[3], *getElementOf(foo[2], 3)
@@ -185,21 +169,15 @@ struct fred y [] = { [0] = { .s[0] = 'q' } };
namespace indices {
constexpr int first[] = {1};
- constexpr int firstValue = first[2]; // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot refer to element 2 of array of 1}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot refer to element 2 of array of 1}}
+ constexpr int firstValue = first[2]; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot refer to element 2 of array of 1}}
constexpr int second[10] = {17};
- constexpr int secondValue = second[10];// ref-error {{must be initialized by a constant expression}} \
- // ref-note {{read of dereferenced one-past-the-end pointer}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{read of dereferenced one-past-the-end pointer}}
-
- constexpr int negative = second[-2]; // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot refer to element -2 of array of 10}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot refer to element -2 of array of 10}}
+ constexpr int secondValue = second[10];// both-error {{must be initialized by a constant expression}} \
+ // both-note {{read of dereferenced one-past-the-end pointer}} \
+
+ constexpr int negative = second[-2]; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot refer to element -2 of array of 10}}
};
namespace DefaultInit {
@@ -222,12 +200,9 @@ public:
class AU {
public:
int a;
- constexpr AU() : a(5 / 0) {} // expected-warning {{division by zero is undefined}} \
- // expected-note 2{{division by zero}} \
- // expected-error {{never produces a constant expression}} \
- // ref-error {{never produces a constant expression}} \
- // ref-note 2{{division by zero}} \
- // ref-warning {{division by zero is undefined}}
+ constexpr AU() : a(5 / 0) {} // both-warning {{division by zero is undefined}} \
+ // both-note 2{{division by zero}} \
+ // both-error {{never produces a constant expression}}
};
class B {
public:
@@ -241,13 +216,10 @@ static_assert(b.a[1].a == 12, "");
class BU {
public:
AU a[2];
- constexpr BU() {} // expected-note {{in call to 'AU()'}} \
- // ref-note {{in call to 'AU()'}}
+ constexpr BU() {} // both-note {{in call to 'AU()'}}
};
-constexpr BU bu; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{in call to 'BU()'}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{in call to 'BU()'}}
+constexpr BU bu; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{in call to 'BU()'}}
namespace IncDec {
constexpr int getNextElem(const int *A, int I) {
@@ -311,62 +283,43 @@ namespace IncDec {
}
static_assert(getSecondToLast2() == 3, "");
- constexpr int bad1() { // ref-error {{never produces a constant expression}} \
- // expected-error {{never produces a constant expression}}
+ constexpr int bad1() { // both-error {{never produces a constant expression}}
const int *e = E + 3;
e++; // This is fine because it's a one-past-the-end pointer
- return *e; // expected-note 2{{read of dereferenced one-past-the-end pointer}} \
- // ref-note 2{{read of dereferenced one-past-the-end pointer}}
+ return *e; // both-note 2{{read of dereferenced one-past-the-end pointer}}
}
- static_assert(bad1() == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+ static_assert(bad1() == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
- constexpr int bad2() { // ref-error {{never produces a constant expression}} \
- // expected-error {{never produces a constant expression}}
+ constexpr int bad2() { // both-error {{never produces a constant expression}}
const int *e = E + 4;
- e++; // expected-note 2{{cannot refer to element 5 of array of 4 elements}} \
- // ref-note 2{{cannot refer to element 5 of array of 4 elements}}
+ e++; // both-note 2{{cannot refer to element 5 of array of 4 elements}}
return *e; // This is UB as well
}
- static_assert(bad2() == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+ static_assert(bad2() == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
-
- constexpr int bad3() { // ref-error {{never produces a constant expression}} \
- // expected-error {{never produces a constant expression}}
+ constexpr int bad3() { // both-error {{never produces a constant expression}}
const int *e = E;
- e--; // expected-note 2{{cannot refer to element -1 of array of 4 elements}} \
- // ref-note 2{{cannot refer to element -1 of array of 4 elements}}
+ e--; // both-note 2{{cannot refer to element -1 of array of 4 elements}}
return *e; // This is UB as well
}
- static_assert(bad3() == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+ static_assert(bad3() == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
constexpr int nullptr1(bool Pre) {
int *a = nullptr;
if (Pre)
- ++a; // ref-note {{arithmetic on null pointer}} \
- // expected-note {{arithmetic on null pointer}}
+ ++a; // both-note {{arithmetic on null pointer}}
else
- a++; // ref-note {{arithmetic on null pointer}} \
- // expected-note {{arithmetic on null pointer}}
+ a++; // both-note {{arithmetic on null pointer}}
return 1;
}
- static_assert(nullptr1(true) == 1, ""); // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}} \
- // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}}
-
- static_assert(nullptr1(false) == 1, ""); // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}} \
- // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}}
+ static_assert(nullptr1(true) == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
+
+ static_assert(nullptr1(false) == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
};
namespace ZeroInit {
@@ -425,28 +378,20 @@ namespace NoInitMapLeak {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdivision-by-zero"
#pragma clang diagnostic ignored "-Wc++20-extensions"
- constexpr int testLeak() { // expected-error {{never produces a constant expression}} \
- // ref-error {{never produces a constant expression}}
+ constexpr int testLeak() { // both-error {{never produces a constant expression}}
int a[2];
a[0] = 1;
// interrupts interpretation.
- (void)(1 / 0); // expected-note 2{{division by zero}} \
- // ref-note 2{{division by zero}}
-
+ (void)(1 / 0); // both-note 2{{division by zero}}
return 1;
}
#pragma clang diagnostic pop
- static_assert(testLeak() == 1, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'testLeak()'}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'testLeak()'}}
-
+ static_assert(testLeak() == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'testLeak()'}}
- constexpr int a[] = {1,2,3,4/0,5}; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{division by zero}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{division by zero}} \
+ constexpr int a[] = {1,2,3,4/0,5}; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{division by zero}} \
// ref-note {{declared here}}
/// FIXME: This should fail in the new interpreter as well.
@@ -456,18 +401,13 @@ namespace NoInitMapLeak {
static_assert(b == 1, ""); // ref-error {{not an integral constant expression}} \
// ref-note {{not a constant expression}}
- constexpr int f() { // expected-error {{never produces a constant expression}} \
- // ref-error {{never produces a constant expression}}
- int a[] = {19,2,3/0,4}; // expected-note 2{{division by zero}} \
- // expected-warning {{is undefined}} \
- // ref-note 2{{division by zero}} \
- // ref-warning {{is undefined}}
+ constexpr int f() { // both-error {{never produces a constant expression}}
+ int a[] = {19,2,3/0,4}; // both-note 2{{division by zero}} \
+ // both-warning {{is undefined}}
return 1;
}
- static_assert(f() == 1, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+ static_assert(f() == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
}
namespace Incomplete {
@@ -477,38 +417,27 @@ namespace Incomplete {
};
constexpr Foo F{};
- constexpr const int *A = F.a; // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{array-to-pointer decay of array member without known bound}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{array-to-pointer decay of array member without known bound}}
-
- constexpr const int *B = F.a + 1; // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{array-to-pointer decay of array member without known bound}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{array-to-pointer decay of array member without known bound}}
-
- constexpr int C = *F.a; // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{array-to-pointer decay of array member without known bound}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{array-to-pointer decay of array member without known bound}}
+ constexpr const int *A = F.a; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{array-to-pointer decay of array member without known bound}}
+ constexpr const int *B = F.a + 1; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{array-to-pointer decay of array member without known bound}}
+ constexpr int C = *F.a; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{array-to-pointer decay of array member without known bound}}
/// These are from test/SemaCXX/constant-expression-cxx11.cpp
/// and are the only tests using the 'indexing of array without known bound' diagnostic.
/// We currently diagnose them differently.
extern int arr[]; // expected-note 3{{declared here}}
- constexpr int *c = &arr[1]; // ref-error {{must be initialized by a constant expression}} \
+ constexpr int *c = &arr[1]; // both-error {{must be initialized by a constant expression}} \
// ref-note {{indexing of array without known bound}} \
- // expected-error {{must be initialized by a constant expression}} \
// expected-note {{read of non-constexpr variable 'arr'}}
- constexpr int *d = &arr[1]; // ref-error {{must be initialized by a constant expression}} \
+ constexpr int *d = &arr[1]; // both-error {{must be initialized by a constant expression}} \
// ref-note {{indexing of array without known bound}} \
- // expected-error {{must be initialized by a constant expression}} \
// expected-note {{read of non-constexpr variable 'arr'}}
- constexpr int *e = arr + 1; // ref-error {{must be initialized by a constant expression}} \
+ constexpr int *e = arr + 1; // both-error {{must be initialized by a constant expression}} \
// ref-note {{indexing of array without known bound}} \
- // expected-error {{must be initialized by a constant expression}} \
// expected-note {{read of non-constexpr variable 'arr'}}
}
@@ -528,8 +457,7 @@ namespace GH69115 {
if (C)
return;
// Invalid in constexpr.
- (void)(1 / 0); // expected-warning {{undefined}} \
- // ref-warning {{undefined}}
+ (void)(1 / 0); // both-warning {{undefined}}
}
class F {
@@ -569,23 +497,15 @@ namespace GH69115 {
namespace NonConstReads {
#if __cplusplus >= 202002L
- void *p = nullptr; // ref-note {{declared here}} \
- // expected-note {{declared here}}
-
- int arr[!p]; // ref-error {{not allowed at file scope}} \
- // expected-error {{not allowed at file scope}} \
- // ref-warning {{variable length arrays}} \
- // ref-note {{read of non-constexpr variable 'p'}} \
- // expected-warning {{variable length arrays}} \
- // expected-note {{read of non-constexpr variable 'p'}}
- int z; // ref-note {{declared here}} \
- // expected-note {{declared here}}
- int a[z]; // ref-error {{not allowed at file scope}} \
- // expected-error {{not allowed at file scope}} \
- // ref-warning {{variable length arrays}} \
- // ref-note {{read of non-const variable 'z'}} \
- // expected-warning {{variable length arrays}} \
- // expected-note {{read of non-const variable 'z'}}
+ void *p = nullptr; // both-note {{declared here}}
+
+ int arr[!p]; // both-error {{not allowed at file scope}} \
+ // both-warning {{variable length arrays}} \
+ // both-note {{read of non-constexpr variable 'p'}}
+ int z; // both-note {{declared here}}
+ int a[z]; // both-error {{not allowed at file scope}} \
+ // both-warning {{variable length arrays}} \
+ // both-note {{read of non-const variable 'z'}}
#else
void *p = nullptr;
int arr[!p]; // ref-error {{not allowed at file scope}} \
@@ -598,3 +518,22 @@ namespace NonConstReads {
const int y = 0;
int yy[y];
}
+
+namespace SelfComparison {
+ struct S {
+ int field;
+ static int static_field;
+ int array[4];
+ };
+
+ struct T {
+ int field;
+ static int static_field;
+ int array[4];
+ S s;
+ };
+
+ int struct_test(S s1, S s2, S *s3, T t) {
+ return s3->array[t.field] == s3->array[t.field]; // both-warning {{self-comparison always evaluates to true}}
+ };
+}
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp
index d6ed2d862b09..3aa01d501a3e 100644
--- a/clang/test/AST/Interp/builtin-functions.cpp
+++ b/clang/test/AST/Interp/builtin-functions.cpp
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify
-// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify
-// RUN: %clang_cc1 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated
-// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify
-// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify
-// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated
-// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify
-// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref %s -Wno-constant-evaluated
+// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both
+// RUN: %clang_cc1 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify=expected,both
+// RUN: %clang_cc1 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated
+// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both
+// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter -triple i686 %s -verify=expected,both
+// RUN: %clang_cc1 -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated
+// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both
+// RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated
namespace strcmp {
@@ -23,23 +23,17 @@ namespace strcmp {
static_assert(__builtin_strcmp("abab\0banana", "abab") == 0, "");
static_assert(__builtin_strcmp("abab", "abab\0banana") == 0, "");
static_assert(__builtin_strcmp("abab\0banana", "abab\0canada") == 0, "");
- static_assert(__builtin_strcmp(0, "abab") == 0, ""); // expected-error {{not an integral constant}} \
- // expected-note {{dereferenced null}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant}} \
- // ref-note {{dereferenced null}}
- static_assert(__builtin_strcmp("abab", 0) == 0, ""); // expected-error {{not an integral constant}} \
- // expected-note {{dereferenced null}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant}} \
- // ref-note {{dereferenced null}}
+ static_assert(__builtin_strcmp(0, "abab") == 0, ""); // both-error {{not an integral constant}} \
+ // both-note {{dereferenced null}} \
+ // expected-note {{in call to}}
+ static_assert(__builtin_strcmp("abab", 0) == 0, ""); // both-error {{not an integral constant}} \
+ // both-note {{dereferenced null}} \
+ // expected-note {{in call to}}
static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar) == -1, "");
- static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // expected-error {{not an integral constant}} \
- // expected-note {{dereferenced one-past-the-end}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant}} \
- // ref-note {{dereferenced one-past-the-end}}
+ static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // both-error {{not an integral constant}} \
+ // both-note {{dereferenced one-past-the-end}} \
+ // expected-note {{in call to}}
}
/// Copied from constant-expression-cxx11.cpp
@@ -69,41 +63,27 @@ constexpr const char *a = "foo\0quux";
static_assert(check(b), "");
static_assert(check(c), "");
- constexpr int over1 = __builtin_strlen(a + 9); // expected-error {{constant expression}} \
- // expected-note {{one-past-the-end}} \
- // expected-note {{in call to}} \
- // ref-error {{constant expression}} \
- // ref-note {{one-past-the-end}}
- constexpr int over2 = __builtin_strlen(b + 9); // expected-error {{constant expression}} \
- // expected-note {{one-past-the-end}} \
- // expected-note {{in call to}} \
- // ref-error {{constant expression}} \
- // ref-note {{one-past-the-end}}
- constexpr int over3 = __builtin_strlen(c + 9); // expected-error {{constant expression}} \
- // expected-note {{one-past-the-end}} \
- // expected-note {{in call to}} \
- // ref-error {{constant expression}} \
- // ref-note {{one-past-the-end}}
-
- constexpr int under1 = __builtin_strlen(a - 1); // expected-error {{constant expression}} \
- // expected-note {{cannot refer to element -1}} \
- // ref-error {{constant expression}} \
- // ref-note {{cannot refer to element -1}}
- constexpr int under2 = __builtin_strlen(b - 1); // expected-error {{constant expression}} \
- // expected-note {{cannot refer to element -1}} \
- // ref-error {{constant expression}} \
- // ref-note {{cannot refer to element -1}}
- constexpr int under3 = __builtin_strlen(c - 1); // expected-error {{constant expression}} \
- // expected-note {{cannot refer to element -1}} \
- // ref-error {{constant expression}} \
- // ref-note {{cannot refer to element -1}}
+ constexpr int over1 = __builtin_strlen(a + 9); // both-error {{constant expression}} \
+ // both-note {{one-past-the-end}} \
+ // expected-note {{in call to}}
+ constexpr int over2 = __builtin_strlen(b + 9); // both-error {{constant expression}} \
+ // both-note {{one-past-the-end}} \
+ // expected-note {{in call to}}
+ constexpr int over3 = __builtin_strlen(c + 9); // both-error {{constant expression}} \
+ // both-note {{one-past-the-end}} \
+ // expected-note {{in call to}}
+
+ constexpr int under1 = __builtin_strlen(a - 1); // both-error {{constant expression}} \
+ // both-note {{cannot refer to element -1}}
+ constexpr int under2 = __builtin_strlen(b - 1); // both-error {{constant expression}} \
+ // both-note {{cannot refer to element -1}}
+ constexpr int under3 = __builtin_strlen(c - 1); // both-error {{constant expression}} \
+ // both-note {{cannot refer to element -1}}
constexpr char d[] = { 'f', 'o', 'o' }; // no nul terminator.
- constexpr int bad = __builtin_strlen(d); // expected-error {{constant expression}} \
- // expected-note {{one-past-the-end}} \
- // expected-note {{in call to}} \
- // ref-error {{constant expression}} \
- // ref-note {{one-past-the-end}}
+ constexpr int bad = __builtin_strlen(d); // both-error {{constant expression}} \
+ // both-note {{one-past-the-end}} \
+ // expected-note {{in call to}}
}
namespace nan {
@@ -115,8 +95,7 @@ namespace nan {
// expected-error@-2 {{must be initialized by a constant expression}}
#endif
- constexpr double NaN3 = __builtin_nan("foo"); // expected-error {{must be initialized by a constant expression}} \
- // ref-error {{must be initialized by a constant expression}}
+ constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
constexpr float NaN4 = __builtin_nanf("");
//constexpr long double NaN5 = __builtin_nanf128("");
@@ -126,8 +105,7 @@ namespace nan {
/// FIXME: Current interpreter misses diagnostics.
constexpr char f2[] = {'0', 'x', 'A', 'E'}; /// No trailing 0 byte.
- constexpr double NaN7 = __builtin_nan(f2); // ref-error {{must be initialized by a constant expression}} \
- // expected-error {{must be initialized by a constant expression}} \
+ constexpr double NaN7 = __builtin_nan(f2); // both-error {{must be initialized by a constant expression}} \
// expected-note {{read of dereferenced one-past-the-end pointer}} \
// expected-note {{in call to}}
static_assert(!__builtin_issignaling(__builtin_nan("")), "");
@@ -370,9 +348,6 @@ namespace EhReturnDataRegno {
case __builtin_eh_return_data_regno(0): // constant foldable.
break;
}
-
- __builtin_eh_return_data_regno(X); // expected-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}} \
- // ref-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}}
-
+ __builtin_eh_return_data_regno(X); // both-error {{argument to '__builtin_eh_return_data_regno' must be a constant integer}}
}
}
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 9ab271a82aee..85c195d33a96 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify -std=c11 %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected -std=c11 %s
-// RUN: %clang_cc1 -verify=ref -std=c11 %s
-// RUN: %clang_cc1 -pedantic -verify=pedantic-ref -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -verify=expected,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -pedantic -verify=pedantic-expected,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify=ref,all -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-linux -pedantic -verify=pedantic-ref,all -std=c11 %s
typedef __INTPTR_TYPE__ intptr_t;
typedef __PTRDIFF_TYPE__ ptrdiff_t;
@@ -22,11 +22,9 @@ _Static_assert(!!1.0, ""); // pedantic-ref-warning {{not an integer constant exp
_Static_assert(!!1, "");
int a = (1 == 1 ? 5 : 3);
-_Static_assert(a == 5, ""); // ref-error {{not an integral constant expression}} \
- // pedantic-ref-error {{not an integral constant expression}} \
- // expected-error {{not an integral constant expression}} \
- // pedantic-expected-error {{not an integral constant expression}}
+_Static_assert(a == 5, ""); // all-error {{not an integral constant expression}}
+const int DiscardedPtrToIntCast = ((intptr_t)((void*)0), 0); // all-warning {{left operand of comma operator has no effect}}
const int b = 3;
_Static_assert(b == 3, ""); // pedantic-ref-warning {{not an integer constant expression}} \
@@ -67,25 +65,17 @@ _Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extensio
/// extern variable of a composite type.
/// FIXME: The 'cast from void*' note is missing in the new interpreter.
extern struct Test50S Test50;
-_Static_assert(&Test50 != (void*)0, ""); // ref-warning {{always true}} \
- // pedantic-ref-warning {{always true}} \
+_Static_assert(&Test50 != (void*)0, ""); // all-warning {{always true}} \
// pedantic-ref-warning {{is a GNU extension}} \
// pedantic-ref-note {{cast from 'void *' is not allowed}} \
- // expected-warning {{always true}} \
- // pedantic-expected-warning {{always true}} \
// pedantic-expected-warning {{is a GNU extension}}
struct y {int x,y;};
-int a2[(intptr_t)&((struct y*)0)->y]; // expected-warning {{folded to constant array}} \
- // pedantic-expected-warning {{folded to constant array}} \
- // ref-warning {{folded to constant array}} \
- // pedantic-ref-warning {{folded to constant array}}
+int a2[(intptr_t)&((struct y*)0)->y]; // all-warning {{folded to constant array}}
const struct y *yy = (struct y*)0;
-const intptr_t L = (intptr_t)(&(yy->y)); // expected-error {{not a compile-time constant}} \
- // pedantic-expected-error {{not a compile-time constant}} \
- // ref-error {{not a compile-time constant}} \
- // pedantic-ref-error {{not a compile-time constant}}
+const intptr_t L = (intptr_t)(&(yy->y)); // all-error {{not a compile-time constant}}
+
const ptrdiff_t m = &m + 137 - &m;
_Static_assert(m == 137, ""); // pedantic-ref-warning {{GNU extension}} \
// pedantic-expected-warning {{GNU extension}}
@@ -93,10 +83,7 @@ _Static_assert(m == 137, ""); // pedantic-ref-warning {{GNU extension}} \
/// from test/Sema/switch.c, used to cause an assertion failure.
void f (int z) {
while (z) {
- default: z--; // expected-error {{'default' statement not in switch}} \
- // pedantic-expected-error {{'default' statement not in switch}} \
- // ref-error {{'default' statement not in switch}} \
- // pedantic-ref-error {{'default' statement not in switch}}
+ default: z--; // all-error {{'default' statement not in switch}}
}
}
@@ -104,15 +91,8 @@ int expr;
int chooseexpr[__builtin_choose_expr(1, 1, expr)];
int somefunc(int i) {
- return (i, 65537) * 65537; // expected-warning {{left operand of comma operator has no effect}} \
- // expected-warning {{overflow in expression; result is 131073}} \
- // pedantic-expected-warning {{left operand of comma operator has no effect}} \
- // pedantic-expected-warning {{overflow in expression; result is 131073}} \
- // ref-warning {{left operand of comma operator has no effect}} \
- // ref-warning {{overflow in expression; result is 131073}} \
- // pedantic-ref-warning {{left operand of comma operator has no effect}} \
- // pedantic-ref-warning {{overflow in expression; result is 131073}}
-
+ return (i, 65537) * 65537; // all-warning {{left operand of comma operator has no effect}} \
+ // all-warning {{overflow in expression; result is 131073}}
}
/// FIXME: The following test is incorrect in the new interpreter.
@@ -129,3 +109,19 @@ _Static_assert(sizeof(name2) == 0, ""); // expected-error {{failed}} \
// expected-note {{evaluates to}} \
// pedantic-expected-error {{failed}} \
// pedantic-expected-note {{evaluates to}}
+
+#ifdef __SIZEOF_INT128__
+void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // all-warning {{refers past the last possible element}}
+#endif
+
+extern float global_float;
+struct XX { int a, *b; };
+struct XY { int before; struct XX xx, *xp; float* after; } xy[] = {
+ 0, 0, &xy[0].xx.a, &xy[0].xx, &global_float,
+ [1].xx = 0, &xy[1].xx.a, &xy[1].xx, &global_float,
+ 0, // all-note {{previous initialization is here}}
+ 0, // all-note {{previous initialization is here}}
+ [2].before = 0, // all-warning {{initializer overrides prior initialization of this subobject}}
+ 0, // all-warning {{initializer overrides prior initialization of this subobject}}
+ &xy[2].xx.a, &xy[2].xx, &global_float
+};
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
new file mode 100644
index 000000000000..b07d0241da12
--- /dev/null
+++ b/clang/test/AST/Interp/complex.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both -Wno-unused-value %s
+// RUN: %clang_cc1 -verify=ref,both -Wno-unused-value %s
+
+// expected-no-diagnostics
+// ref-no-diagnostics
+
+void blah() {
+ __complex__ unsigned xx;
+ __complex__ signed yy;
+ __complex__ int result;
+
+ /// The following line calls into the constant interpreter.
+ result = xx * yy;
+}
diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp
index 7d625ab1f378..9fdaabd9081d 100644
--- a/clang/test/AST/Interp/complex.cpp
+++ b/clang/test/AST/Interp/complex.cpp
@@ -98,8 +98,9 @@ constexpr _Complex int I3 = {15};
static_assert(__real(I3) == 15, "");
static_assert(__imag(I3) == 0, "");
-/// FIXME: This should work in the new interpreter as well.
-// constexpr _Complex _BitInt(8) A = 0;// = {4};
+constexpr _Complex _BitInt(8) A = {4};
+static_assert(__real(A) == 4, "");
+static_assert(__imag(A) == 0, "");
constexpr _Complex double Doubles[4] = {{1.0, 2.0}};
diff --git a/clang/test/AST/Interp/lambda.cpp b/clang/test/AST/Interp/lambda.cpp
index f8400898acc0..a433e5666e4f 100644
--- a/clang/test/AST/Interp/lambda.cpp
+++ b/clang/test/AST/Interp/lambda.cpp
@@ -155,6 +155,19 @@ namespace StaticInvoker {
return fp(i).a;
}
static_assert(sv6(12) == 12);
+
+
+ /// A generic lambda.
+ auto GL = [](auto a) { return a; };
+ constexpr char (*fp2)(char) = GL;
+ static_assert(fp2('3') == '3', "");
+
+ struct GLS {
+ int a;
+ };
+ auto GL2 = [](auto a) { return GLS{a}; };
+ constexpr GLS (*fp3)(char) = GL2;
+ static_assert(fp3('3').a == '3', "");
}
namespace LambdasAsParams {
diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp
index f5b5f77ffc62..bc994c3191ce 100644
--- a/clang/test/AST/Interp/literals.cpp
+++ b/clang/test/AST/Interp/literals.cpp
@@ -915,6 +915,13 @@ static_assert(ignoredDecls() == 12, "");
namespace DiscardExprs {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-value"
+ typedef struct _GUID {
+ __UINT32_TYPE__ Data1;
+ __UINT16_TYPE__ Data2;
+ __UINT16_TYPE__ Data3;
+ __UINT8_TYPE__ Data4[8];
+ } GUID;
+ class __declspec(uuid("000000A0-0000-0000-C000-000000000049")) GuidType;
struct A{ int a; };
constexpr int ignoredExprs() {
@@ -951,6 +958,8 @@ namespace DiscardExprs {
(float)1;
(double)1.0f;
(signed)4u;
+ __uuidof(GuidType);
+ __uuidof(number); // both-error {{cannot call operator __uuidof on a type with no GUID}}
return 0;
}
@@ -1105,3 +1114,13 @@ namespace NonConstReads {
static_assert(z == 0, ""); // both-error {{not an integral constant expression}} \
// both-note {{read of non-const variable 'z'}}
}
+
+/// This test passes a MaterializedTemporaryExpr to evaluateAsRValue.
+/// That needs to return a null pointer after the lvalue-to-rvalue conversion.
+/// We used to fail to do that.
+namespace rdar8769025 {
+ __attribute__((nonnull)) void f1(int * const &p);
+ void test_f1() {
+ f1(0); // both-warning{{null passed to a callee that requires a non-null argument}}
+ }
+}
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 5ce1e6e09a0b..93da831f3bda 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple i686 -verify %s
-// RUN: %clang_cc1 -verify=ref %s
-// RUN: %clang_cc1 -verify=ref -std=c++14 %s
-// RUN: %clang_cc1 -verify=ref -std=c++20 %s
-// RUN: %clang_cc1 -verify=ref -triple i686 %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify=expected,both %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,both %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple i686 -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+// RUN: %clang_cc1 -verify=ref,both -std=c++14 %s
+// RUN: %clang_cc1 -verify=ref,both -std=c++20 %s
+// RUN: %clang_cc1 -verify=ref,both -triple i686 %s
/// Used to crash.
struct Empty {};
@@ -90,9 +90,8 @@ struct Ints2 {
int a = 10;
int b;
};
-constexpr Ints2 ints22; // expected-error {{without a user-provided default constructor}} \
- // expected-error {{must be initialized by a constant expression}} \
- // ref-error {{without a user-provided default constructor}}
+constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}} \
+ // expected-error {{must be initialized by a constant expression}}
constexpr Ints2 I2 = Ints2{12, 25};
static_assert(I2.a == 12, "");
@@ -164,17 +163,13 @@ constexpr C RVOAndParams(int a) {
}
constexpr C RVOAndParamsResult2 = RVOAndParams(12);
-class Bar { // expected-note {{definition of 'Bar' is not complete}} \
- // ref-note {{definition of 'Bar' is not complete}}
+class Bar { // both-note {{definition of 'Bar' is not complete}}
public:
constexpr Bar(){}
- constexpr Bar b; // expected-error {{cannot be constexpr}} \
- // expected-error {{has incomplete type 'const Bar'}} \
- // ref-error {{cannot be constexpr}} \
- // ref-error {{has incomplete type 'const Bar'}}
+ constexpr Bar b; // both-error {{cannot be constexpr}} \
+ // both-error {{has incomplete type 'const Bar'}}
};
-constexpr Bar B; // expected-error {{must be initialized by a constant expression}} \
- // ref-error {{must be initialized by a constant expression}}
+constexpr Bar B; // both-error {{must be initialized by a constant expression}}
constexpr Bar *pb = nullptr;
constexpr int locals() {
@@ -198,17 +193,13 @@ namespace thisPointer {
constexpr int get12() { return 12; }
};
- constexpr int foo() { // ref-error {{never produces a constant expression}} \
- // expected-error {{never produces a constant expression}}
+ constexpr int foo() { // both-error {{never produces a constant expression}}
S *s = nullptr;
- return s->get12(); // ref-note 2{{member call on dereferenced null pointer}} \
- // expected-note 2{{member call on dereferenced null pointer}}
+ return s->get12(); // both-note 2{{member call on dereferenced null pointer}}
}
- static_assert(foo() == 12, ""); // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'foo()'}} \
- // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'foo()'}}
+ static_assert(foo() == 12, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'foo()'}}
};
struct FourBoolPairs {
@@ -244,20 +235,16 @@ constexpr A a{};
static_assert(a.i == 100, "");
constexpr A a2{12};
static_assert(a2.i == 12, "");
-static_assert(a2.i == 200, ""); // ref-error {{static assertion failed}} \
- // ref-note {{evaluates to '12 == 200'}} \
- // expected-error {{static assertion failed}} \
- // expected-note {{evaluates to '12 == 200'}}
+static_assert(a2.i == 200, ""); // both-error {{static assertion failed}} \
+ // both-note {{evaluates to '12 == 200'}}
struct S {
int a = 0;
constexpr int get5() const { return 5; }
constexpr void fo() const {
- this; // expected-warning {{expression result unused}} \
- // ref-warning {{expression result unused}}
- this->a; // expected-warning {{expression result unused}} \
- // ref-warning {{expression result unused}}
+ this; // both-warning {{expression result unused}}
+ this->a; // both-warning {{expression result unused}}
get5();
getInts();
}
@@ -342,12 +329,9 @@ namespace InitializerTemporaries {
// Invalid destructor.
struct S {
constexpr S() {}
- constexpr ~S() noexcept(false) { throw 12; } // expected-error {{cannot use 'throw'}} \
- // expected-error {{never produces a constant expression}} \
- // expected-note 2{{subexpression not valid}} \
- // ref-error {{cannot use 'throw'}} \
- // ref-error {{never produces a constant expression}} \
- // ref-note 2{{subexpression not valid}}
+ constexpr ~S() noexcept(false) { throw 12; } // both-error {{cannot use 'throw'}} \
+ // both-error {{never produces a constant expression}} \
+ // both-note 2{{subexpression not valid}}
};
constexpr int f() {
@@ -355,10 +339,8 @@ namespace InitializerTemporaries {
/// FIXME: Wrong source location below.
return 12; // expected-note {{in call to '&S{}->~S()'}}
}
- static_assert(f() == 12); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'f()'}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'f()'}}
+ static_assert(f() == 12); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'f()'}}
#endif
@@ -423,7 +405,8 @@ namespace MI {
namespace DeriveFailures {
#if __cplusplus < 202002L
- struct Base { // ref-note 2{{declared here}} expected-note {{declared here}}
+ struct Base { // both-note {{declared here}} \
+ // ref-note {{declared here}}
int Val;
};
@@ -431,35 +414,29 @@ namespace DeriveFailures {
int OtherVal;
constexpr Derived(int i) : OtherVal(i) {} // ref-error {{never produces a constant expression}} \
- // ref-note 2{{non-constexpr constructor 'Base' cannot be used in a constant expression}} \
- // expected-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}}
+ // both-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} \
+ // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}}
};
- constexpr Derived D(12); // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{in call to 'Derived(12)'}} \
- // ref-note {{declared here}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{in call to 'Derived(12)'}}
+ constexpr Derived D(12); // both-error {{must be initialized by a constant expression}} \
+ // both-note {{in call to 'Derived(12)'}} \
+ // ref-note {{declared here}}
- static_assert(D.Val == 0, ""); // ref-error {{not an integral constant expression}} \
+ static_assert(D.Val == 0, ""); // both-error {{not an integral constant expression}} \
// ref-note {{initializer of 'D' is not a constant expression}} \
- // expected-error {{not an integral constant expression}} \
// expected-note {{read of uninitialized object}}
#endif
struct AnotherBase {
int Val;
- constexpr AnotherBase(int i) : Val(12 / i) {} //ref-note {{division by zero}} \
- //expected-note {{division by zero}}
+ constexpr AnotherBase(int i) : Val(12 / i) {} // both-note {{division by zero}}
};
struct AnotherDerived : AnotherBase {
constexpr AnotherDerived(int i) : AnotherBase(i) {}
};
- constexpr AnotherBase Derp(0); // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{in call to 'AnotherBase(0)'}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{in call to 'AnotherBase(0)'}}
+ constexpr AnotherBase Derp(0); // both-error {{must be initialized by a constant expression}} \
+ // both-note {{in call to 'AnotherBase(0)'}}
struct YetAnotherBase {
int Val;
@@ -467,17 +444,14 @@ namespace DeriveFailures {
};
struct YetAnotherDerived : YetAnotherBase {
- using YetAnotherBase::YetAnotherBase; // ref-note {{declared here}} \
- // expected-note {{declared here}}
+ using YetAnotherBase::YetAnotherBase; // both-note {{declared here}}
int OtherVal;
constexpr bool doit() const { return Val == OtherVal; }
};
- constexpr YetAnotherDerived Oops(0); // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}} \
- // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}}
+ constexpr YetAnotherDerived Oops(0); // both-error {{must be initialized by a constant expression}} \
+ // both-note {{constructor inherited from base class 'YetAnotherBase' cannot be used in a constant expression}}
};
namespace EmptyCtor {
@@ -543,18 +517,10 @@ namespace PointerArith {
constexpr B *b1 = &b + 1;
constexpr B *b2 = &b + 0;
-#if 0
- constexpr A *a2 = &b + 1; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot access base class of pointer past the end of object}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot access base class of pointer past the end of object}}
-
-#endif
- constexpr const int *pn = &(&b + 1)->n; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot access field of pointer past the end of object}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot access field of pointer past the end of object}}
-
+ constexpr A *a2 = &b + 1; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot access base class of pointer past the end of object}}
+ constexpr const int *pn = &(&b + 1)->n; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot access field of pointer past the end of object}}
}
#if __cplusplus >= 202002L
@@ -632,12 +598,9 @@ namespace Destructors {
struct S {
constexpr S() {}
- constexpr ~S() { // expected-error {{never produces a constant expression}} \
- // ref-error {{never produces a constant expression}}
- int i = 1 / 0; // expected-warning {{division by zero}} \
- // expected-note 2{{division by zero}} \
- // ref-warning {{division by zero}} \
- // ref-note 2{{division by zero}}
+ constexpr ~S() { // both-error {{never produces a constant expression}}
+ int i = 1 / 0; // both-warning {{division by zero}} \
+ // both-note 2{{division by zero}}
}
};
constexpr int testS() {
@@ -645,10 +608,8 @@ namespace Destructors {
return 1; // expected-note {{in call to '&S{}->~S()'}}
// FIXME: ^ Wrong line
}
- static_assert(testS() == 1); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'testS()'}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'testS()'}}
+ static_assert(testS() == 1); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'testS()'}}
}
namespace BaseToDerived {
@@ -657,10 +618,8 @@ namespace A {
struct B : A { int n; };
struct C : B {};
C c = {};
- constexpr C *pb = (C*)((A*)&c + 1); // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{cannot access derived class of pointer past the end of object}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{cannot access derived class of pointer past the end of object}}
+ constexpr C *pb = (C*)((A*)&c + 1); // both-error {{must be initialized by a constant expression}} \
+ // both-note {{cannot access derived class of pointer past the end of object}}
}
namespace B {
struct A {};
@@ -894,10 +853,8 @@ namespace VirtualFromBase {
// Virtual f(), not OK.
constexpr X<X<S2>> xxs2;
constexpr X<S2> *q = const_cast<X<X<S2>>*>(&xxs2);
- static_assert(q->f() == sizeof(X<S2>), ""); // ref-error {{not an integral constant expression}} \
- // ref-note {{cannot evaluate call to virtual function}} \
- // expected-error {{not an integral constant expression}} \
- // expected-note {{cannot evaluate call to virtual function}}
+ static_assert(q->f() == sizeof(X<S2>), ""); // both-error {{not an integral constant expression}} \
+ // both-note {{cannot evaluate call to virtual function}}
}
#endif
@@ -1070,14 +1027,10 @@ namespace ParenInit {
/// Not constexpr!
O o1(0);
- constinit O o2(0); // ref-error {{variable does not have a constant initializer}} \
- // ref-note {{required by 'constinit' specifier}} \
- // ref-note {{reference to temporary is not a constant expression}} \
- // ref-note {{temporary created here}} \
- // expected-error {{variable does not have a constant initializer}} \
- // expected-note {{required by 'constinit' specifier}} \
- // expected-note {{reference to temporary is not a constant expression}} \
- // expected-note {{temporary created here}}
+ constinit O o2(0); // both-error {{variable does not have a constant initializer}} \
+ // both-note {{required by 'constinit' specifier}} \
+ // both-note {{reference to temporary is not a constant expression}} \
+ // both-note {{temporary created here}}
}
#endif
@@ -1109,32 +1062,24 @@ namespace AccessOnNullptr {
int a;
};
- constexpr int a() { // expected-error {{never produces a constant expression}} \
- // ref-error {{never produces a constant expression}}
+ constexpr int a() { // both-error {{never produces a constant expression}}
F *f = nullptr;
- f->a = 0; // expected-note 2{{cannot access field of null pointer}} \
- // ref-note 2{{cannot access field of null pointer}}
+ f->a = 0; // both-note 2{{cannot access field of null pointer}}
return f->a;
}
- static_assert(a() == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'a()'}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'a()'}}
+ static_assert(a() == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'a()'}}
- constexpr int a2() { // expected-error {{never produces a constant expression}} \
- // ref-error {{never produces a constant expression}}
+ constexpr int a2() { // both-error {{never produces a constant expression}}
F *f = nullptr;
- const int *a = &(f->a); // expected-note 2{{cannot access field of null pointer}} \
- // ref-note 2{{cannot access field of null pointer}}
+ const int *a = &(f->a); // both-note 2{{cannot access field of null pointer}}
return f->a;
}
- static_assert(a2() == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'a2()'}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'a2()'}}
+ static_assert(a2() == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'a2()'}}
}
namespace IndirectFieldInit {
@@ -1223,3 +1168,74 @@ namespace IndirectFieldInit {
#endif
}
+
+namespace InheritedConstructor {
+ namespace PR47555 {
+ struct A {
+ int c;
+ int d;
+ constexpr A(int c, int d) : c(c), d(d){}
+ };
+ struct B : A { using A::A; };
+
+ constexpr B b = {13, 1};
+ static_assert(b.c == 13, "");
+ static_assert(b.d == 1, "");
+ }
+
+ namespace PR47555_2 {
+ struct A {
+ int c;
+ int d;
+ double e;
+ constexpr A(int c, int &d, double e) : c(c), d(++d), e(e){}
+ };
+ struct B : A { using A::A; };
+
+ constexpr int f() {
+ int a = 10;
+ B b = {10, a, 40.0};
+ return a;
+ }
+ static_assert(f() == 11, "");
+ }
+
+ namespace AaronsTest {
+ struct T {
+ constexpr T(float) {}
+ };
+
+ struct Base {
+ constexpr Base(T t = 1.0f) {}
+ constexpr Base(float) {}
+ };
+
+ struct FirstMiddle : Base {
+ using Base::Base;
+ constexpr FirstMiddle() : Base(2.0f) {}
+ };
+
+ struct SecondMiddle : Base {
+ constexpr SecondMiddle() : Base(3.0f) {}
+ constexpr SecondMiddle(T t) : Base(t) {}
+ };
+
+ struct S : FirstMiddle, SecondMiddle {
+ using FirstMiddle::FirstMiddle;
+ constexpr S(int i) : S(4.0f) {}
+ };
+
+ constexpr S s(1);
+ }
+}
+
+namespace InvalidCtorInitializer {
+ struct X {
+ int Y;
+ constexpr X() // expected-note {{declared here}}
+ : Y(fo_o_()) {} // both-error {{use of undeclared identifier 'fo_o_'}}
+ };
+ // no crash on evaluating the constexpr ctor.
+ constexpr int Z = X().Y; // both-error {{constexpr variable 'Z' must be initialized by a constant expression}} \
+ // expected-note {{undefined constructor 'X'}}
+}
diff --git a/clang/test/AST/ast-print-method-decl.cpp b/clang/test/AST/ast-print-method-decl.cpp
index 9f5d11260994..75dea0cac16b 100644
--- a/clang/test/AST/ast-print-method-decl.cpp
+++ b/clang/test/AST/ast-print-method-decl.cpp
@@ -32,8 +32,7 @@ struct DelegatingCtor2 {
// CHECK: struct DelegatingCtor3 {
struct DelegatingCtor3 {
- // FIXME: template <> should not be output
- // CHECK: template <> DelegatingCtor3(auto);
+ // CHECK: DelegatingCtor3(auto);
DelegatingCtor3(auto);
// FIXME: Implicitly specialized method should not be output
diff --git a/clang/test/Analysis/Checkers/WebKit/assignment-to-refptr.cpp b/clang/test/Analysis/Checkers/WebKit/assignment-to-refptr.cpp
new file mode 100644
index 000000000000..8b2b4671ed96
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/assignment-to-refptr.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+#include "mock-types.h"
+
+class Node {
+public:
+ Node* nextSibling() const;
+
+ void ref() const;
+ void deref() const;
+};
+
+static void removeDetachedChildren(Node* firstChild)
+{
+ for (RefPtr child = firstChild; child; child = child->nextSibling());
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/implicit-cast-to-base-class-with-deref-in-superclass.cpp b/clang/test/Analysis/Checkers/WebKit/implicit-cast-to-base-class-with-deref-in-superclass.cpp
new file mode 100644
index 000000000000..176238f31bd2
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/implicit-cast-to-base-class-with-deref-in-superclass.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+#include "mock-types.h"
+
+class Base {
+public:
+ virtual ~Base();
+ void ref() const;
+ void deref() const;
+};
+
+class Event : public Base {
+protected:
+ explicit Event();
+};
+
+class SubEvent : public Event {
+public:
+ static Ref<SubEvent> create();
+private:
+ SubEvent() = default;
+};
+
+void someFunction(Base&);
+
+static void test()
+{
+ someFunction(SubEvent::create());
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index 5f570b8bee8c..cc40487614a8 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -2,13 +2,14 @@
#define mock_types_1103988513531
template <typename T> struct Ref {
- T t;
+ T *t;
Ref() : t{} {};
Ref(T *) {}
- T *get() { return nullptr; }
- operator const T &() const { return t; }
- operator T &() { return t; }
+ T *get() { return t; }
+ T *ptr() { return t; }
+ operator const T &() const { return *t; }
+ operator T &() { return *t; }
};
template <typename T> struct RefPtr {
@@ -20,6 +21,7 @@ template <typename T> struct RefPtr {
T *operator->() { return t; }
T &operator*() { return *t; }
RefPtr &operator=(T *) { return *this; }
+ operator bool() { return t; }
};
template <typename T> bool operator==(const RefPtr<T> &, const RefPtr<T> &) {
@@ -39,6 +41,7 @@ template <typename T> bool operator!=(const RefPtr<T> &, T *) { return false; }
template <typename T> bool operator!=(const RefPtr<T> &, T &) { return false; }
struct RefCountable {
+ static Ref<RefCountable> create();
void ref() {}
void deref() {}
};
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-countable-default-arg-nullptr.cpp b/clang/test/Analysis/Checkers/WebKit/ref-countable-default-arg-nullptr.cpp
new file mode 100644
index 000000000000..a1860a5434c8
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/ref-countable-default-arg-nullptr.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+
+#include "mock-types.h"
+
+class Obj {
+public:
+ static Obj* get();
+ static RefPtr<Obj> create();
+ void ref() const;
+ void deref() const;
+};
+
+void someFunction(Obj*, Obj* = nullptr);
+void otherFunction(Obj*, Obj* = Obj::get());
+// expected-warning@-1{{Call argument is uncounted and unsafe [alpha.webkit.UncountedCallArgsChecker]}}
+void anotherFunction(Obj*, Obj* = Obj::create().get());
+
+void otherFunction() {
+ someFunction(nullptr);
+ someFunction(Obj::get());
+ // expected-warning@-1{{Call argument is uncounted and unsafe [alpha.webkit.UncountedCallArgsChecker]}}
+ someFunction(Obj::create().get());
+ otherFunction(nullptr);
+ anotherFunction(nullptr);
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-ptr-accessor.cpp b/clang/test/Analysis/Checkers/WebKit/ref-ptr-accessor.cpp
new file mode 100644
index 000000000000..560dcfd4bdb0
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/ref-ptr-accessor.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+#include "mock-types.h"
+
+void someFunction(RefCountable*);
+
+void testFunction()
+{
+ Ref item = RefCountable::create();
+ someFunction(item.ptr());
+}
diff --git a/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h b/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h
index 63e22ebdb306..b146068eedb0 100644
--- a/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h
+++ b/clang/test/Analysis/Inputs/std-c-library-functions-POSIX.h
@@ -11,6 +11,7 @@ typedef unsigned long int pthread_t;
typedef unsigned long time_t;
typedef unsigned long clockid_t;
typedef __INT64_TYPE__ off64_t;
+typedef __INT64_TYPE__ fpos_t;
typedef struct {
int a;
@@ -42,9 +43,22 @@ FILE *fopen(const char *restrict pathname, const char *restrict mode);
FILE *tmpfile(void);
FILE *freopen(const char *restrict pathname, const char *restrict mode,
FILE *restrict stream);
+FILE *fdopen(int fd, const char *mode);
int fclose(FILE *stream);
+int putc(int c, FILE *stream);
+int fputc(int c, FILE *stream);
+char *fgets(char *restrict s, int n, FILE *restrict stream);
+int fputs(const char *restrict s, FILE *restrict stream);
int fseek(FILE *stream, long offset, int whence);
+int fgetpos(FILE *restrict stream, fpos_t *restrict pos);
+int fsetpos(FILE *stream, const fpos_t *pos);
+int fflush(FILE *stream);
+long ftell(FILE *stream);
int fileno(FILE *stream);
+void rewind(FILE *stream);
+void clearerr(FILE *stream);
+int feof(FILE *stream);
+int ferror(FILE *stream);
long a64l(const char *str64);
char *l64a(long value);
int open(const char *path, int oflag, ...);
@@ -100,7 +114,6 @@ int pclose(FILE *stream);
int close(int fildes);
long fpathconf(int fildes, int name);
long pathconf(const char *path, int name);
-FILE *fdopen(int fd, const char *mode);
void rewinddir(DIR *dir);
void seekdir(DIR *dirp, long loc);
int rand_r(unsigned int *seedp);
diff --git a/clang/test/Analysis/bitwise-shift-common.c b/clang/test/Analysis/bitwise-shift-common.c
index 39108bc838bf..5f37d9976263 100644
--- a/clang/test/Analysis/bitwise-shift-common.c
+++ b/clang/test/Analysis/bitwise-shift-common.c
@@ -154,7 +154,7 @@ int expression_tracked_back(void) {
//===----------------------------------------------------------------------===//
int allow_overflows_and_negative_operands(void) {
- // These are all legal under C++ 20 and many compilers accept them under
+ // These are all legal under C++20 and many compilers accept them under
// earlier standards as well.
int int_min = 1 << 31; // no-warning
int this_overflows = 1027 << 30; // no-warning
diff --git a/clang/test/Analysis/std-c-library-functions-POSIX.c b/clang/test/Analysis/std-c-library-functions-POSIX.c
index 03aa8e2e00a7..b53f3132b868 100644
--- a/clang/test/Analysis/std-c-library-functions-POSIX.c
+++ b/clang/test/Analysis/std-c-library-functions-POSIX.c
@@ -23,10 +23,22 @@
// CHECK: Loaded summary for: FILE *popen(const char *command, const char *type)
// CHECK: Loaded summary for: int fclose(FILE *stream)
// CHECK: Loaded summary for: int pclose(FILE *stream)
+// CHECK: Loaded summary for: int getc(FILE *)
+// CHECK: Loaded summary for: int fgetc(FILE *)
+// CHECK: Loaded summary for: int putc(int c, FILE *stream)
+// CHECK: Loaded summary for: int fputc(int c, FILE *stream)
+// CHECK: Loaded summary for: char *fgets(char *restrict s, int n, FILE *restrict stream)
+// CHECK: Loaded summary for: int fputs(const char *restrict s, FILE *restrict stream)
// CHECK: Loaded summary for: int fseek(FILE *stream, long offset, int whence)
-// CHECK: Loaded summary for: int fseeko(FILE *stream, off_t offset, int whence)
-// CHECK: Loaded summary for: off_t ftello(FILE *stream)
+// CHECK: Loaded summary for: int fgetpos(FILE *restrict stream, fpos_t *restrict pos)
+// CHECK: Loaded summary for: int fsetpos(FILE *stream, const fpos_t *pos)
+// CHECK: Loaded summary for: int fflush(FILE *stream)
+// CHECK: Loaded summary for: long ftell(FILE *stream)
// CHECK: Loaded summary for: int fileno(FILE *stream)
+// CHECK: Loaded summary for: void rewind(FILE *stream)
+// CHECK: Loaded summary for: void clearerr(FILE *stream)
+// CHECK: Loaded summary for: int feof(FILE *stream)
+// CHECK: Loaded summary for: int ferror(FILE *stream)
// CHECK: Loaded summary for: long a64l(const char *str64)
// CHECK: Loaded summary for: char *l64a(long value)
// CHECK: Loaded summary for: int open(const char *path, int oflag, ...)
diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c
index b7eb6b284460..e6564e2bae76 100644
--- a/clang/test/Analysis/std-c-library-functions.c
+++ b/clang/test/Analysis/std-c-library-functions.c
@@ -53,8 +53,6 @@
// CHECK-NEXT: Loaded summary for: int toupper(int)
// CHECK-NEXT: Loaded summary for: int tolower(int)
// CHECK-NEXT: Loaded summary for: int toascii(int)
-// CHECK-NEXT: Loaded summary for: int getc(FILE *)
-// CHECK-NEXT: Loaded summary for: int fgetc(FILE *)
// CHECK-NEXT: Loaded summary for: int getchar(void)
// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict)
// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict)
@@ -63,6 +61,8 @@
// CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict)
// CHECK-NEXT: Loaded summary for: ssize_t getdelim(char **restrict, size_t *restrict, int, FILE *restrict)
// CHECK-NEXT: Loaded summary for: char *getenv(const char *)
+// CHECK-NEXT: Loaded summary for: int getc(FILE *)
+// CHECK-NEXT: Loaded summary for: int fgetc(FILE *)
#include "Inputs/std-c-library-functions.h"
diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c
index cd4b0093cfcb..4bab07577ccd 100644
--- a/clang/test/Analysis/stream-error.c
+++ b/clang/test/Analysis/stream-error.c
@@ -491,32 +491,6 @@ void error_ftello(void) {
fclose(F);
}
-void error_fflush_after_fclose(void) {
- FILE *F = tmpfile();
- int Ret;
- fflush(NULL); // no-warning
- if (!F)
- return;
- if ((Ret = fflush(F)) != 0)
- clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
- fclose(F);
- fflush(F); // expected-warning {{Stream might be already closed}}
-}
-
-void error_fflush_on_open_failed_stream(void) {
- FILE *F = tmpfile();
- if (!F) {
- fflush(F); // no-warning
- return;
- }
- fclose(F);
-}
-
-void error_fflush_on_unknown_stream(FILE *F) {
- fflush(F); // no-warning
- fclose(F); // no-warning
-}
-
void error_fflush_on_non_null_stream_clear_error_states(void) {
FILE *F0 = tmpfile(), *F1 = tmpfile();
// `fflush` clears a non-EOF stream's error state.
diff --git a/clang/test/Analysis/stream-noopen.c b/clang/test/Analysis/stream-noopen.c
index 8ad101ee1e8c..8bd01a90cf85 100644
--- a/clang/test/Analysis/stream-noopen.c
+++ b/clang/test/Analysis/stream-noopen.c
@@ -57,6 +57,95 @@ void test_fwrite(FILE *F) {
clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
}
+void test_fgetc(FILE *F) {
+ int Ret = fgetc(F);
+ clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
+ if (Ret != EOF) {
+ if (errno) {} // expected-warning {{undefined}}
+ } else {
+ clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
+ // expected-warning@-1 {{FALSE}}
+ }
+ clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}}
+ clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
+}
+
+void test_fputc(FILE *F) {
+ int Ret = fputc('a', F);
+ clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
+ if (Ret != EOF) {
+ clang_analyzer_eval(Ret == 'a'); // expected-warning {{TRUE}}
+ if (errno) {} // expected-warning {{undefined}}
+ } else {
+ clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
+ }
+ clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}}
+ clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
+}
+
+void test_fgets(char *Buf, int N, FILE *F) {
+ char *Ret = fgets(Buf, N, F);
+ clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
+ clang_analyzer_eval(Buf != NULL); // expected-warning {{TRUE}}
+ clang_analyzer_eval(N >= 0); // expected-warning {{TRUE}}
+ if (Ret == Buf) {
+ if (errno) {} // expected-warning {{undefined}}
+ } else {
+ clang_analyzer_eval(Ret == 0); // expected-warning {{TRUE}}
+ clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
+ // expected-warning@-1 {{FALSE}}
+ }
+ clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}}
+ clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
+
+ char Buf1[10];
+ Ret = fgets(Buf1, 11, F); // expected-warning {{The 1st argument to 'fgets' is a buffer with size 10}}
+}
+
+void test_fgets_bufsize(FILE *F) {
+ char Buf[10];
+ fgets(Buf, 11, F); // expected-warning {{The 1st argument to 'fgets' is a buffer with size 10}}
+}
+
+void test_fputs(char *Buf, FILE *F) {
+ int Ret = fputs(Buf, F);
+ clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
+ clang_analyzer_eval(Buf != NULL); // expected-warning {{TRUE}}
+ if (Ret >= 0) {
+ if (errno) {} // expected-warning {{undefined}}
+ } else {
+ clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
+ clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
+ }
+ clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}}
+ clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
+}
+
+void test_ungetc(FILE *F) {
+ int Ret = ungetc('X', F);
+ clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
+ if (Ret == 'X') {
+ if (errno) {} // expected-warning {{undefined}}
+ } else {
+ clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
+ clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
+ }
+ clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}}
+ clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
+}
+
+void test_ungetc_EOF(FILE *F, int C) {
+ int Ret = ungetc(EOF, F);
+ clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
+ clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
+ clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
+ Ret = ungetc(C, F);
+ if (Ret == EOF) {
+ clang_analyzer_eval(C == EOF); // expected-warning {{TRUE}}
+ // expected-warning@-1{{FALSE}}
+ }
+}
+
void test_fclose(FILE *F) {
int Ret = fclose(F);
clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
@@ -138,28 +227,17 @@ void test_rewind(FILE *F) {
rewind(F);
}
-void test_ungetc(FILE *F) {
- int Ret = ungetc('X', F);
- clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
- if (Ret == 'X') {
- if (errno) {} // expected-warning {{undefined}}
- } else {
- clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
- clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
- }
- clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}}
- clang_analyzer_eval(ferror(F)); // expected-warning {{UNKNOWN}}
-}
-
-void test_ungetc_EOF(FILE *F, int C) {
- int Ret = ungetc(EOF, F);
- clang_analyzer_eval(F != NULL); // expected-warning {{TRUE}}
- clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
- clang_analyzer_eval(errno != 0); // expected-warning {{TRUE}}
- Ret = ungetc(C, F);
+void test_fflush(FILE *F) {
+ errno = 0;
+ int Ret = fflush(F);
+ clang_analyzer_eval(F != NULL); // expected-warning{{TRUE}}
+ // expected-warning@-1{{FALSE}}
if (Ret == EOF) {
- clang_analyzer_eval(C == EOF); // expected-warning {{TRUE}}
- // expected-warning@-1{{FALSE}}
+ clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}}
+ } else {
+ clang_analyzer_eval(Ret == 0); // expected-warning{{TRUE}}
+ clang_analyzer_eval(errno == 0); // expected-warning{{TRUE}}
+ // expected-warning@-1{{FALSE}}
}
}
diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c
index 36a9b4e26b07..378c9154f8f6 100644
--- a/clang/test/Analysis/stream.c
+++ b/clang/test/Analysis/stream.c
@@ -1,7 +1,9 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify %s
#include "Inputs/system-header-simulator.h"
+void clang_analyzer_eval(int);
+
void check_fread(void) {
FILE *fp = tmpfile();
fread(0, 0, 0, fp); // expected-warning {{Stream pointer might be NULL}}
@@ -316,3 +318,24 @@ void check_leak_noreturn_2(void) {
} // expected-warning {{Opened stream never closed. Potential resource leak}}
// FIXME: This warning should be placed at the `return` above.
// See https://reviews.llvm.org/D83120 about details.
+
+void fflush_after_fclose(void) {
+ FILE *F = tmpfile();
+ int Ret;
+ fflush(NULL); // no-warning
+ if (!F)
+ return;
+ if ((Ret = fflush(F)) != 0)
+ clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
+ fclose(F);
+ fflush(F); // expected-warning {{Stream might be already closed}}
+}
+
+void fflush_on_open_failed_stream(void) {
+ FILE *F = tmpfile();
+ if (!F) {
+ fflush(F); // no-warning
+ return;
+ }
+ fclose(F);
+}
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp
new file mode 100644
index 000000000000..469c4e091953
--- /dev/null
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p23.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify %s
+
+// FIXME: This should be an error with -pedantic-errors.
+template<> // expected-warning {{extraneous template parameter list in template specialization}}
+void f(auto);
+
+template<typename>
+void f(auto);
+
+template<typename T>
+struct A {
+ void g(auto);
+};
+
+template<typename T>
+void A<T>::g(auto) { }
+
+template<>
+void A<int>::g(auto) { }
+
+// FIXME: This should be an error with -pedantic-errors.
+template<>
+template<> // expected-warning {{extraneous template parameter list in template specialization}}
+void A<long>::g(auto) { }
diff --git a/clang/test/CXX/drs/dr11xx.cpp b/clang/test/CXX/drs/dr11xx.cpp
index 86e726ae8c74..a71a105c7eb2 100644
--- a/clang/test/CXX/drs/dr11xx.cpp
+++ b/clang/test/CXX/drs/dr11xx.cpp
@@ -70,3 +70,5 @@ namespace dr1113 { // dr1113: partial
}
void g() { f(); }
}
+
+// dr1150: na
diff --git a/clang/test/CXX/drs/dr124.cpp b/clang/test/CXX/drs/dr124.cpp
new file mode 100644
index 000000000000..c07beb11709c
--- /dev/null
+++ b/clang/test/CXX/drs/dr124.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+#if __cplusplus == 199711L
+#define NOTHROW throw()
+#else
+#define NOTHROW noexcept(true)
+#endif
+
+namespace dr124 { // dr124: 2.7
+
+extern void full_expr_fence() NOTHROW;
+
+struct A {
+ A() NOTHROW {}
+ ~A() NOTHROW {}
+};
+
+struct B {
+ B(A = A()) NOTHROW {}
+ ~B() NOTHROW {}
+};
+
+void f() {
+ full_expr_fence();
+ B b[2];
+ full_expr_fence();
+}
+
+// CHECK-LABEL: define {{.*}} void @dr124::f()()
+// CHECK: call void @dr124::full_expr_fence()
+// CHECK: br label %arrayctor.loop
+// CHECK-LABEL: arrayctor.loop:
+// CHECK: call void @dr124::A::A()
+// CHECK: call void @dr124::B::B(dr124::A)
+// CHECK: call void @dr124::A::~A()
+// CHECK: br {{.*}}, label %arrayctor.cont, label %arrayctor.loop
+// CHECK-LABEL: arrayctor.cont:
+// CHECK: call void @dr124::full_expr_fence()
+// CHECK: br label %arraydestroy.body
+// CHECK-LABEL: arraydestroy.body:
+// CHECK: call void @dr124::B::~B()
+// CHECK-LABEL: }
+
+
+} // namespace dr124
diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp
index d262f6f9dcab..58a2b3a0d027 100644
--- a/clang/test/CXX/drs/dr14xx.cpp
+++ b/clang/test/CXX/drs/dr14xx.cpp
@@ -614,6 +614,30 @@ enum E2 : S<E2>::I { e };
#endif
} // namespace dr1482
+namespace dr1487 { // dr1487: 3.3
+#if __cplusplus >= 201103L
+struct A { // #dr1482-A
+ struct B {
+ using A::A;
+ // since-cxx11-error@-1 {{using declaration refers into 'A::', which is not a base class of 'B'}}
+ };
+
+ struct C : A {
+ // since-cxx11-error@-1 {{base class has incomplete type}}
+ // since-cxx11-note@#dr1482-A {{definition of 'dr1487::A' is not complete until the closing '}'}}
+ using A::A;
+ // since-cxx11-error@-1 {{using declaration refers into 'A::', which is not a base class of 'C'}}
+ };
+
+ struct D;
+};
+
+struct D : A {
+ using A::A;
+};
+#endif
+} // namespace dr1487
+
namespace dr1490 { // dr1490: 3.7 c++11
#if __cplusplus >= 201103L
// List-initialization from a string literal
diff --git a/clang/test/CXX/drs/dr15xx.cpp b/clang/test/CXX/drs/dr15xx.cpp
index 3d4050a5713f..ac503db625ba 100644
--- a/clang/test/CXX/drs/dr15xx.cpp
+++ b/clang/test/CXX/drs/dr15xx.cpp
@@ -360,6 +360,45 @@ namespace dr1563 { // dr1563: yes
#endif
}
+namespace dr1567 { // dr1567: 3.3
+#if __cplusplus >= 201103L
+struct B;
+struct A {
+ A(const A&);
+ A(const B&) = delete;
+ A(A&&);
+ A(B&&) = delete;
+ A(int); // #dr1567-A-int
+};
+
+struct B: A { // #dr1567-B
+ using A::A; // #dr1567-using-A
+ B(double); // #dr1567-B-double
+};
+
+A a{0};
+B b{1.0};
+// Good, deleted converting ctors are not inherited as copy/move ctors
+B b2{b};
+B b3{B{1.0}};
+// Good, copy/move ctors are not inherited
+B b4{a};
+// since-cxx11-error@-1 {{no matching constructor for initialization of 'B'}}
+// since-cxx11-note@#dr1567-A-int {{candidate inherited constructor not viable: no known conversion from 'A' to 'int' for 1st argument}}
+// since-cxx11-note@#dr1567-using-A {{constructor from base class 'A' inherited here}}
+// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'A' to 'const B' for 1st argument}}
+// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}}
+// since-cxx11-note@#dr1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}}
+B b5{A{0}};
+// since-cxx11-error@-1 {{no matching constructor for initialization of 'B'}}
+// since-cxx11-note@#dr1567-A-int {{candidate inherited constructor not viable: no known conversion from 'A' to 'int' for 1st argument}}
+// since-cxx11-note@#dr1567-using-A {{constructor from base class 'A' inherited here}}
+// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'A' to 'const B' for 1st argument}}
+// since-cxx11-note@#dr1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}}
+// since-cxx11-note@#dr1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}}
+#endif
+}
+
namespace dr1573 { // dr1573: 3.9
#if __cplusplus >= 201103L
// ellipsis is inherited (p0136r1 supersedes this part).
diff --git a/clang/test/CXX/drs/dr17xx.cpp b/clang/test/CXX/drs/dr17xx.cpp
index 885ed00ace0f..2f7e62da7bb6 100644
--- a/clang/test/CXX/drs/dr17xx.cpp
+++ b/clang/test/CXX/drs/dr17xx.cpp
@@ -89,6 +89,23 @@ S s(q); // #dr1736-s
#endif
}
+namespace dr1738 { // dr1738: sup P0136R1
+#if __cplusplus >= 201103L
+struct A {
+ template <typename T>
+ A(int, T) {}
+};
+
+struct B : A {
+ using A::A;
+};
+
+// FIXME: this is well-formed since P0136R1
+template B::B(int, double);
+// since-cxx11-error@-1 {{explicit instantiation of 'B' does not refer to a function template, variable template, member function, member class, or static data member}}
+#endif
+}
+
// dr1748 is in dr1748.cpp
namespace dr1753 { // dr1753: 11
diff --git a/clang/test/CXX/drs/dr185.cpp b/clang/test/CXX/drs/dr185.cpp
new file mode 100644
index 000000000000..aff00f1a8764
--- /dev/null
+++ b/clang/test/CXX/drs/dr185.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+namespace dr185 { // dr185: 2.7
+struct A {
+ mutable int value;
+ explicit A(int i) : value(i) {}
+ void mutate(int i) const { value = i; }
+};
+
+int foo() {
+ A const& t = A(1);
+ A n(t);
+ t.mutate(2);
+ return n.value;
+}
+
+// CHECK-LABEL: define {{.*}} i32 @dr185::foo()
+// CHECK: call void @dr185::A::A(int)(ptr {{[^,]*}} %ref.tmp, {{.*}})
+// CHECK: store ptr %ref.tmp, ptr %t
+// CHECK-NOT: %t =
+// CHECK: [[DR185_T:%.+]] = load ptr, ptr %t
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{[^,]*}} %n, ptr {{[^,]*}} [[DR185_T]], {{.*}})
+// CHECK-LABEL: }
+} // namespace dr185
diff --git a/clang/test/CXX/drs/dr193.cpp b/clang/test/CXX/drs/dr193.cpp
new file mode 100644
index 000000000000..c010dad50e40
--- /dev/null
+++ b/clang/test/CXX/drs/dr193.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+#if __cplusplus == 199711L
+#define NOTHROW throw()
+#else
+#define NOTHROW noexcept(true)
+#endif
+
+namespace dr193 { // dr193: 2.7
+struct A {
+ ~A() NOTHROW {}
+};
+
+struct B {
+ ~B() NOTHROW {}
+};
+
+struct C {
+ ~C() NOTHROW {}
+};
+
+struct D : A {
+ B b;
+ ~D() NOTHROW { C c; }
+};
+
+void foo() {
+ D d;
+}
+
+// skipping over D1 (complete object destructor)
+// CHECK-LABEL: define {{.*}} void @dr193::D::~D(){{.*}}
+// CHECK-LABEL: define {{.*}} void @dr193::D::~D(){{.*}}
+// CHECK-NOT: call void @dr193::A::~A()
+// CHECK-NOT: call void @dr193::B::~B()
+// CHECK: call void @dr193::C::~C()
+// CHECK: call void @dr193::B::~B()
+// CHECK: call void @dr193::A::~A()
+// CHECK-LABEL: }
+} // namespace dr193
diff --git a/clang/test/CXX/drs/dr199.cpp b/clang/test/CXX/drs/dr199.cpp
new file mode 100644
index 000000000000..7517d79680c6
--- /dev/null
+++ b/clang/test/CXX/drs/dr199.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+#if __cplusplus == 199711L
+#define NOTHROW throw()
+#else
+#define NOTHROW noexcept(true)
+#endif
+
+namespace dr199 { // dr199: 2.8
+struct A {
+ ~A() NOTHROW {}
+};
+
+struct B {
+ ~B() NOTHROW {}
+};
+
+void foo() {
+ A(), B();
+}
+
+// CHECK-LABEL: define {{.*}} void @dr199::foo()
+// CHECK-NOT: call void @dr199::A::~A()
+// CHECK: call void @dr199::B::~B()
+// CHECK: call void @dr199::A::~A()
+// CHECK-LABEL: }
+} // namespace dr199
diff --git a/clang/test/CXX/drs/dr1xx.cpp b/clang/test/CXX/drs/dr1xx.cpp
index 1930de2f070a..d55033cef1b6 100644
--- a/clang/test/CXX/drs/dr1xx.cpp
+++ b/clang/test/CXX/drs/dr1xx.cpp
@@ -306,7 +306,7 @@ namespace dr122 { // dr122: yes
}
// dr123: na
-// dr124: dup 201
+// dr124 is in dr124.cpp
// dr125: yes
struct dr125_A { struct dr125_B {}; }; // #dr125_B
@@ -1169,7 +1169,7 @@ namespace dr184 { // dr184: yes
void h() { A<B>().g(); }
}
-// dr185 FIXME: add codegen test
+// dr185 is in dr185.cpp
namespace dr187 { // dr187: sup 481
const int Z = 1;
@@ -1184,6 +1184,7 @@ namespace dr188 { // dr188: yes
}
// dr190 FIXME: add codegen test for tbaa
+// or implement C++20 std::is_layout_compatible and test it this way
int dr191_j;
namespace dr191 { // dr191: yes
@@ -1215,7 +1216,7 @@ namespace dr191 { // dr191: yes
}
}
-// dr193 FIXME: add codegen test
+// dr193 is in dr193.cpp
namespace dr194 { // dr194: yes
struct A {
@@ -1290,4 +1291,4 @@ namespace dr198 { // dr198: yes
};
}
-// dr199 FIXME: add codegen test
+// dr199 is in dr199.cpp
diff --git a/clang/test/CXX/drs/dr201.cpp b/clang/test/CXX/drs/dr201.cpp
new file mode 100644
index 000000000000..7e864981e13b
--- /dev/null
+++ b/clang/test/CXX/drs/dr201.cpp
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+#if __cplusplus == 199711L
+#define NOTHROW throw()
+#else
+#define NOTHROW noexcept(true)
+#endif
+
+namespace dr201 { // dr201: 2.8
+
+extern void full_expr_fence() NOTHROW;
+
+struct A {
+ ~A() NOTHROW {}
+};
+
+struct B {
+ B(A) NOTHROW {}
+ ~B() NOTHROW {}
+};
+
+void foo() {
+ full_expr_fence();
+ B b = A();
+ full_expr_fence();
+}
+
+// CHECK-LABEL: define {{.*}} void @dr201::foo()
+// CHECK: call void @dr201::full_expr_fence()
+// CHECK: call void @dr201::B::B(dr201::A)
+// CHECK: call void @dr201::A::~A()
+// CHECK: call void @dr201::full_expr_fence()
+// CHECK: call void @dr201::B::~B()
+// CHECK-LABEL: }
+
+} // namespace dr201
diff --git a/clang/test/CXX/drs/dr210.cpp b/clang/test/CXX/drs/dr210.cpp
new file mode 100644
index 000000000000..156ee81093b4
--- /dev/null
+++ b/clang/test/CXX/drs/dr210.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+#if __cplusplus == 199711L
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wvariadic-macros"
+#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
+#pragma clang diagnostic pop
+#endif
+
+namespace dr210 { // dr210: 2.7
+struct B {
+ long i;
+ B();
+ virtual ~B();
+};
+
+static_assert(sizeof(B) == 16, "");
+
+struct D : B {
+ long j;
+ D();
+};
+
+static_assert(sizeof(D) == 24, "");
+
+void toss(const B* b) {
+ throw *b;
+}
+
+// CHECK-LABEL: define {{.*}} void @dr210::toss(dr210::B const*)
+// CHECK: %[[EXCEPTION:.*]] = call ptr @__cxa_allocate_exception(i64 16)
+// CHECK: call void @__cxa_throw(ptr %[[EXCEPTION]], ptr @typeinfo for dr210::B, ptr @dr210::B::~B())
+// CHECK-LABEL: }
+
+} // namespace dr210
diff --git a/clang/test/CXX/drs/dr22xx.cpp b/clang/test/CXX/drs/dr22xx.cpp
index 19518247b528..3a13cb0471a7 100644
--- a/clang/test/CXX/drs/dr22xx.cpp
+++ b/clang/test/CXX/drs/dr22xx.cpp
@@ -154,6 +154,47 @@ const D &d3(c); // FIXME ill-formed
#endif
}
+namespace dr2273 { // dr2273: 3.3
+#if __cplusplus >= 201103L
+struct A {
+ A(int = 0) = delete; // #dr2273-A
+};
+
+struct B : A { // #dr2273-B
+ using A::A;
+};
+
+B b;
+// since-cxx11-error@-1 {{call to implicitly-deleted default constructor of 'B'}}
+// since-cxx11-note@#dr2273-B {{default constructor of 'B' is implicitly deleted because base class 'A' has a deleted default constructor}}
+// since-cxx11-note@#dr2273-A {{'A' has been explicitly marked deleted here}}
+#endif
+}
+
+namespace dr2277 { // dr2277: partial
+#if __cplusplus >= 201103L
+struct A {
+ A(int, int = 0);
+ void f(int, int = 0); // #dr2277-A-f
+};
+struct B : A {
+ B(int);
+ using A::A;
+
+ void f(int); // #dr2277-B-f
+ using A::f;
+};
+
+void g() {
+ B b{0};
+ b.f(0); // FIXME: this is well-formed for the same reason as initialization of 'b' above
+ // since-cxx11-error@-1 {{call to member function 'f' is ambiguous}}
+ // since-cxx11-note@#dr2277-A-f {{candidate function}}
+ // since-cxx11-note@#dr2277-B-f {{candidate function}}
+}
+#endif
+}
+
namespace dr2292 { // dr2292: 9
#if __cplusplus >= 201103L
template<typename T> using id = T;
diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp
index 3f8c476427eb..c0463730b6a2 100644
--- a/clang/test/CXX/drs/dr23xx.cpp
+++ b/clang/test/CXX/drs/dr23xx.cpp
@@ -147,6 +147,31 @@ enum struct alignas(64) B {};
#endif
} // namespace dr2354
+namespace dr2356 { // dr2356: 4
+#if __cplusplus >= 201103L
+struct A {
+ A();
+ A(A &&); // #1
+ template<typename T> A(T &&); // #2
+};
+struct B : A {
+ using A::A;
+ B(const B &); // #3
+ B(B &&) = default; // #4, implicitly deleted
+ // since-cxx11-warning@-1 {{explicitly defaulted move constructor is implicitly deleted}}
+ // since-cxx11-note@#dr2356-X {{move constructor of 'B' is implicitly deleted because field 'x' has a deleted move constructor}}
+ // since-cxx11-note@#dr2356-X {{'X' has been explicitly marked deleted here}}
+ // since-cxx11-note@-4 {{replace 'default' with 'delete'}}
+
+ struct X { X(X &&) = delete; } x; // #dr2356-X
+};
+extern B b1;
+B b2 = static_cast<B&&>(b1); // calls #3: #1, #2, and #4 are not viable
+struct C { operator B&&(); };
+B b3 = C(); // calls #3
+#endif
+}
+
#if __cplusplus >= 201402L
namespace dr2358 { // dr2358: 16
void f2() {
diff --git a/clang/test/CXX/drs/dr2504.cpp b/clang/test/CXX/drs/dr2504.cpp
new file mode 100644
index 000000000000..686ea73cd6a0
--- /dev/null
+++ b/clang/test/CXX/drs/dr2504.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+
+namespace dr2504 { // dr2504: no
+#if __cplusplus >= 201103L
+struct V { V() = default; V(int); };
+struct Q { Q(); };
+struct A : virtual V, Q {
+ using V::V;
+ A() = delete;
+};
+int bar() { return 42; }
+struct B : A {
+ B() : A(bar()) {} // ok
+};
+struct C : B {};
+void foo() { C c; } // bar is not invoked, because the V subobject is not initialized as part of B
+#endif
+}
+
+// FIXME: As specified in the comment above (which comes from an example in the Standard),
+// we are not supposed to unconditionally call `bar()` and call a constructor
+// inherited from `V`.
+
+// SINCE-CXX11-LABEL: define linkonce_odr void @dr2504::B::B()
+// SINCE-CXX11-NOT: br
+// SINCE-CXX11: call noundef i32 @dr2504::bar()
+// SINCE-CXX11-NOT: br
+// SINCE-CXX11: call void @dr2504::A::A(int)
+// SINCE-CXX11-LABEL: }
+
+// CHECK: {{.*}}
diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp
index 502f03271d9a..b1e54804fc89 100644
--- a/clang/test/CXX/drs/dr25xx.cpp
+++ b/clang/test/CXX/drs/dr25xx.cpp
@@ -10,6 +10,8 @@
// expected-no-diagnostics
#endif
+// dr2504 is in dr2504.cpp
+
namespace dr2516 { // dr2516: 3.0
// NB: reusing 1482 test
#if __cplusplus >= 201103L
diff --git a/clang/test/CXX/drs/dr292.cpp b/clang/test/CXX/drs/dr292.cpp
new file mode 100644
index 000000000000..19caeef291fa
--- /dev/null
+++ b/clang/test/CXX/drs/dr292.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+
+namespace dr292 { // dr292: 2.9
+
+extern int g();
+
+struct A {
+ A(int) throw() {}
+};
+
+void f() {
+ new A(g());
+}
+
+// CHECK-LABEL: define {{.*}} void @dr292::f()()
+// CHECK: %[[CALL:.+]] = call {{.*}} @operator new(unsigned long)({{.*}})
+// CHECK: invoke {{.*}} i32 @dr292::g()()
+// CHECK-NEXT: to {{.*}} unwind label %lpad
+// CHECK-LABEL: lpad:
+// CHECK: call void @operator delete(void*)(ptr {{.*}} %[[CALL]])
+// CHECK-LABEL: eh.resume:
+// CHECK-LABEL: }
+
+} // namespace dr292
diff --git a/clang/test/CXX/drs/dr2xx.cpp b/clang/test/CXX/drs/dr2xx.cpp
index 1a3ac532f93b..cbb8734e10c6 100644
--- a/clang/test/CXX/drs/dr2xx.cpp
+++ b/clang/test/CXX/drs/dr2xx.cpp
@@ -26,7 +26,7 @@ namespace dr200 { // dr200: dup 214
}
}
-// dr201 FIXME: write codegen test
+// dr201 is in dr201.cpp
namespace dr202 { // dr202: 3.1
template<typename T> T f();
@@ -76,7 +76,7 @@ namespace dr209 { // dr209: 3.2
};
}
-// dr210 FIXME: write codegen test
+// dr210 is in dr210.cpp
namespace dr211 { // dr211: yes
struct A {
@@ -1188,7 +1188,7 @@ namespace dr289 { // dr289: yes
// dr290: na
// dr291: dup 391
-// dr292 FIXME: write a codegen test
+// dr292 is in dr292.cpp
namespace dr294 { // dr294: no
void f() throw(int);
diff --git a/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp
index dae1ba760cc2..db3e3e3bc859 100644
--- a/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp
+++ b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp
@@ -97,13 +97,16 @@ namespace non_template
static_assert(is_same_v<decltype(baz<int>()), int>); // expected-error {{call to 'baz' is ambiguous}}
static_assert(is_same_v<decltype(bar<int>()), void>); // expected-error {{call to 'bar' is ambiguous}}
+ // Top-level cv-qualifiers are ignored in template partial ordering per [dcl.fct]/p5.
+ // After producing the list of parameter types, any top-level cv-qualifiers modifying
+ // a parameter type are deleted when forming the function type.
template<typename T>
- constexpr int goo(int a) requires AtLeast2<int> && true { // expected-note {{candidate function}}
+ constexpr int goo(T a) requires AtLeast2<T> && true {
return 1;
}
template<typename T>
- constexpr int goo(const int b) requires AtLeast2<int> { // expected-note {{candidate function}}
+ constexpr int goo(const T b) requires AtLeast2<T> {
return 2;
}
@@ -122,7 +125,6 @@ namespace non_template
return 2;
}
- // By temp.func.order-6.2.2, this is ambiguous because parameter a and b have different types.
- static_assert(goo<int>(1) == 1); // expected-error {{call to 'goo' is ambiguous}}
+ static_assert(goo<int>(1) == 1);
static_assert(doo<int>(2) == 1);
}
diff --git a/clang/test/CodeGen/LoongArch/atomics.c b/clang/test/CodeGen/LoongArch/atomics.c
index edc58d30db18..bd51fea661be 100644
--- a/clang/test/CodeGen/LoongArch/atomics.c
+++ b/clang/test/CodeGen/LoongArch/atomics.c
@@ -11,10 +11,10 @@
void test_i8_atomics(_Atomic(int8_t) * a, int8_t b) {
// LA32: load atomic i8, ptr %a seq_cst, align 1
// LA32: store atomic i8 %b, ptr %a seq_cst, align 1
- // LA32: atomicrmw add ptr %a, i8 %b seq_cst
+ // LA32: atomicrmw add ptr %a, i8 %b seq_cst, align 1
// LA64: load atomic i8, ptr %a seq_cst, align 1
// LA64: store atomic i8 %b, ptr %a seq_cst, align 1
- // LA64: atomicrmw add ptr %a, i8 %b seq_cst
+ // LA64: atomicrmw add ptr %a, i8 %b seq_cst, align 1
__c11_atomic_load(a, memory_order_seq_cst);
__c11_atomic_store(a, b, memory_order_seq_cst);
__c11_atomic_fetch_add(a, b, memory_order_seq_cst);
@@ -23,22 +23,22 @@ void test_i8_atomics(_Atomic(int8_t) * a, int8_t b) {
void test_i32_atomics(_Atomic(int32_t) * a, int32_t b) {
// LA32: load atomic i32, ptr %a seq_cst, align 4
// LA32: store atomic i32 %b, ptr %a seq_cst, align 4
- // LA32: atomicrmw add ptr %a, i32 %b seq_cst
+ // LA32: atomicrmw add ptr %a, i32 %b seq_cst, align 4
// LA64: load atomic i32, ptr %a seq_cst, align 4
// LA64: store atomic i32 %b, ptr %a seq_cst, align 4
- // LA64: atomicrmw add ptr %a, i32 %b seq_cst
+ // LA64: atomicrmw add ptr %a, i32 %b seq_cst, align 4
__c11_atomic_load(a, memory_order_seq_cst);
__c11_atomic_store(a, b, memory_order_seq_cst);
__c11_atomic_fetch_add(a, b, memory_order_seq_cst);
}
void test_i64_atomics(_Atomic(int64_t) * a, int64_t b) {
- // LA32: call i64 @__atomic_load_8
- // LA32: call void @__atomic_store_8
- // LA32: call i64 @__atomic_fetch_add_8
+ // LA32: load atomic i64, ptr %a seq_cst, align 8
+ // LA32: store atomic i64 %b, ptr %a seq_cst, align 8
+ // LA32: atomicrmw add ptr %a, i64 %b seq_cst, align 8
// LA64: load atomic i64, ptr %a seq_cst, align 8
// LA64: store atomic i64 %b, ptr %a seq_cst, align 8
- // LA64: atomicrmw add ptr %a, i64 %b seq_cst
+ // LA64: atomicrmw add ptr %a, i64 %b seq_cst, align 8
__c11_atomic_load(a, memory_order_seq_cst);
__c11_atomic_store(a, b, memory_order_seq_cst);
__c11_atomic_fetch_add(a, b, memory_order_seq_cst);
diff --git a/clang/test/CodeGen/PowerPC/quadword-atomics.c b/clang/test/CodeGen/PowerPC/quadword-atomics.c
index bff03b25d27e..dc04423060a0 100644
--- a/clang/test/CodeGen/PowerPC/quadword-atomics.c
+++ b/clang/test/CodeGen/PowerPC/quadword-atomics.c
@@ -1,14 +1,18 @@
// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64le-linux-gnu \
-// RUN: -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64-QUADWORD-ATOMICS
+// RUN: -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s \
+// RUN: --check-prefixes=PPC64,PPC64-QUADWORD-ATOMICS
// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64le-linux-gnu \
-// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN: -emit-llvm -o - %s | FileCheck %s \
+// RUN: --check-prefixes=PPC64,PPC64-NO-QUADWORD-ATOMICS
// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
-// RUN: -target-cpu pwr7 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN: -target-cpu pwr7 -emit-llvm -o - %s | FileCheck %s \
+// RUN: --check-prefixes=PPC64,PPC64-NO-QUADWORD-ATOMICS
// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
-// RUN: -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN: -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s \
+// RUN: --check-prefixes=PPC64,PPC64-NO-QUADWORD-ATOMICS
// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
-// RUN: -mabi=quadword-atomics -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s \
-// RUN: --check-prefix=PPC64-QUADWORD-ATOMICS
+// RUN: -mabi=quadword-atomics -target-cpu pwr8 -emit-llvm -o - %s | \
+// RUN: FileCheck %s --check-prefixes=PPC64,PPC64-QUADWORD-ATOMICS
typedef struct {
@@ -19,66 +23,48 @@ typedef _Atomic(Q) AtomicQ;
typedef __int128_t int128_t;
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_load(
-// PPC64-QUADWORD-ATOMICS: [[TMP3:%.*]] = load atomic i128, ptr [[TMP1:%.*]] acquire, align 16
-//
// PPC64-LABEL: @test_load(
-// PPC64: call void @__atomic_load(i64 noundef 16, ptr noundef [[TMP3:%.*]], ptr noundef [[TMP4:%.*]], i32 noundef signext 2)
+// PPC64: [[TMP3:%.*]] = load atomic i128, ptr [[TMP1:%.*]] acquire, align 16
//
Q test_load(AtomicQ *ptr) {
// expected-no-diagnostics
return __c11_atomic_load(ptr, __ATOMIC_ACQUIRE);
}
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_store(
-// PPC64-QUADWORD-ATOMICS: store atomic i128 [[TMP6:%.*]], ptr [[TMP4:%.*]] release, align 16
-//
// PPC64-LABEL: @test_store(
-// PPC64: call void @__atomic_store(i64 noundef 16, ptr noundef [[TMP6:%.*]], ptr noundef [[TMP7:%.*]], i32 noundef signext 3)
+// PPC64: store atomic i128 [[TMP6:%.*]], ptr [[TMP4:%.*]] release, align 16
//
void test_store(Q val, AtomicQ *ptr) {
// expected-no-diagnostics
__c11_atomic_store(ptr, val, __ATOMIC_RELEASE);
}
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_add(
-// PPC64-QUADWORD-ATOMICS: [[TMP3:%.*]] = atomicrmw add ptr [[TMP0:%.*]], i128 [[TMP2:%.*]] monotonic, align 16
-//
// PPC64-LABEL: @test_add(
-// PPC64: [[CALL:%.*]] = call i128 @__atomic_fetch_add_16(ptr noundef [[TMP2:%.*]], i128 noundef [[TMP3:%.*]], i32 noundef signext 0)
+// PPC64: [[ATOMICRMW:%.*]] = atomicrmw add ptr [[TMP0:%.*]], i128 [[TMP2:%.*]] monotonic, align 16
//
void test_add(_Atomic(int128_t) *ptr, int128_t x) {
// expected-no-diagnostics
__c11_atomic_fetch_add(ptr, x, __ATOMIC_RELAXED);
}
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_xchg(
-// PPC64-QUADWORD-ATOMICS: [[TMP8:%.*]] = atomicrmw xchg ptr [[TMP4:%.*]], i128 [[TMP7:%.*]] seq_cst, align 16
-//
// PPC64-LABEL: @test_xchg(
-// PPC64: call void @__atomic_exchange(i64 noundef 16, ptr noundef [[TMP7:%.*]], ptr noundef [[TMP8:%.*]], ptr noundef [[TMP9:%.*]], i32 noundef signext 5)
+// PPC64: [[TMP8:%.*]] = atomicrmw xchg ptr [[TMP4:%.*]], i128 [[TMP7:%.*]] seq_cst, align 16
//
Q test_xchg(AtomicQ *ptr, Q new) {
// expected-no-diagnostics
return __c11_atomic_exchange(ptr, new, __ATOMIC_SEQ_CST);
}
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_cmpxchg(
-// PPC64-QUADWORD-ATOMICS: [[TMP10:%.*]] = cmpxchg ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
-//
// PPC64-LABEL: @test_cmpxchg(
-// PPC64: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef [[TMP8:%.*]], ptr noundef [[TMP9:%.*]], ptr noundef [[TMP10:%.*]], i32 noundef signext 5, i32 noundef signext 0)
+// PPC64: [[TMP10:%.*]] = cmpxchg ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
//
int test_cmpxchg(AtomicQ *ptr, Q *cmp, Q new) {
// expected-no-diagnostics
return __c11_atomic_compare_exchange_strong(ptr, cmp, new, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
}
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_cmpxchg_weak(
-// PPC64-QUADWORD-ATOMICS: [[TMP10:%.*]] = cmpxchg weak ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
-//
// PPC64-LABEL: @test_cmpxchg_weak(
-// PPC64: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef [[TMP8:%.*]], ptr noundef [[TMP9:%.*]], ptr noundef [[TMP10:%.*]], i32 noundef signext 5, i32 noundef signext 0)
+// PPC64: [[TMP10:%.*]] = cmpxchg weak ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
//
int test_cmpxchg_weak(AtomicQ *ptr, Q *cmp, Q new) {
// expected-no-diagnostics
@@ -88,8 +74,8 @@ int test_cmpxchg_weak(AtomicQ *ptr, Q *cmp, Q new) {
// PPC64-QUADWORD-ATOMICS-LABEL: @is_lock_free(
// PPC64-QUADWORD-ATOMICS: ret i32 1
//
-// PPC64-LABEL: @is_lock_free(
-// PPC64: [[CALL:%.*]] = call zeroext i1 @__atomic_is_lock_free(i64 noundef 16, ptr noundef null)
+// PPC64-NO-QUADWORD-ATOMICS-LABEL: @is_lock_free(
+// PPC64-NO-QUADWORD-ATOMICS: [[CALL:%.*]] = call zeroext i1 @__atomic_is_lock_free(i64 noundef 16, ptr noundef null)
//
int is_lock_free() {
AtomicQ q;
diff --git a/clang/test/CodeGen/RISCV/riscv-atomics.c b/clang/test/CodeGen/RISCV/riscv-atomics.c
index f629ad7d72ea..437cb949bbb0 100644
--- a/clang/test/CodeGen/RISCV/riscv-atomics.c
+++ b/clang/test/CodeGen/RISCV/riscv-atomics.c
@@ -1,68 +1,34 @@
// RUN: %clang_cc1 -triple riscv32 -O1 -emit-llvm %s -o - \
-// RUN: | FileCheck %s -check-prefix=RV32I
+// RUN: -verify=no-atomics
// RUN: %clang_cc1 -triple riscv32 -target-feature +a -O1 -emit-llvm %s -o - \
-// RUN: | FileCheck %s -check-prefix=RV32IA
+// RUN: -verify=small-atomics
// RUN: %clang_cc1 -triple riscv64 -O1 -emit-llvm %s -o - \
-// RUN: | FileCheck %s -check-prefix=RV64I
+// RUN: -verify=no-atomics
// RUN: %clang_cc1 -triple riscv64 -target-feature +a -O1 -emit-llvm %s -o - \
-// RUN: | FileCheck %s -check-prefix=RV64IA
+// RUN: -verify=all-atomics
-// This test demonstrates that MaxAtomicInlineWidth is set appropriately when
-// the atomics instruction set extension is enabled.
+// all-atomics-no-diagnostics
#include <stdatomic.h>
#include <stdint.h>
void test_i8_atomics(_Atomic(int8_t) * a, int8_t b) {
- // RV32I: call zeroext i8 @__atomic_load_1
- // RV32I: call void @__atomic_store_1
- // RV32I: call zeroext i8 @__atomic_fetch_add_1
- // RV32IA: load atomic i8, ptr %a seq_cst, align 1
- // RV32IA: store atomic i8 %b, ptr %a seq_cst, align 1
- // RV32IA: atomicrmw add ptr %a, i8 %b seq_cst, align 1
- // RV64I: call zeroext i8 @__atomic_load_1
- // RV64I: call void @__atomic_store_1
- // RV64I: call zeroext i8 @__atomic_fetch_add_1
- // RV64IA: load atomic i8, ptr %a seq_cst, align 1
- // RV64IA: store atomic i8 %b, ptr %a seq_cst, align 1
- // RV64IA: atomicrmw add ptr %a, i8 %b seq_cst, align 1
- __c11_atomic_load(a, memory_order_seq_cst);
- __c11_atomic_store(a, b, memory_order_seq_cst);
- __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
+ __c11_atomic_load(a, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (1 bytes) exceeds the max lock-free size (0 bytes)}}
+ __c11_atomic_store(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (1 bytes) exceeds the max lock-free size (0 bytes)}}
+ __c11_atomic_fetch_add(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (1 bytes) exceeds the max lock-free size (0 bytes)}}
}
void test_i32_atomics(_Atomic(int32_t) * a, int32_t b) {
- // RV32I: call i32 @__atomic_load_4
- // RV32I: call void @__atomic_store_4
- // RV32I: call i32 @__atomic_fetch_add_4
- // RV32IA: load atomic i32, ptr %a seq_cst, align 4
- // RV32IA: store atomic i32 %b, ptr %a seq_cst, align 4
- // RV32IA: atomicrmw add ptr %a, i32 %b seq_cst, align 4
- // RV64I: call signext i32 @__atomic_load_4
- // RV64I: call void @__atomic_store_4
- // RV64I: call signext i32 @__atomic_fetch_add_4
- // RV64IA: load atomic i32, ptr %a seq_cst, align 4
- // RV64IA: store atomic i32 %b, ptr %a seq_cst, align 4
- // RV64IA: atomicrmw add ptr %a, i32 %b seq_cst, align 4
- __c11_atomic_load(a, memory_order_seq_cst);
- __c11_atomic_store(a, b, memory_order_seq_cst);
- __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
+ __c11_atomic_load(a, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0 bytes)}}
+ __c11_atomic_store(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0 bytes)}}
+ __c11_atomic_fetch_add(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0 bytes)}}
}
void test_i64_atomics(_Atomic(int64_t) * a, int64_t b) {
- // RV32I: call i64 @__atomic_load_8
- // RV32I: call void @__atomic_store_8
- // RV32I: call i64 @__atomic_fetch_add_8
- // RV32IA: call i64 @__atomic_load_8
- // RV32IA: call void @__atomic_store_8
- // RV32IA: call i64 @__atomic_fetch_add_8
- // RV64I: call i64 @__atomic_load_8
- // RV64I: call void @__atomic_store_8
- // RV64I: call i64 @__atomic_fetch_add_8
- // RV64IA: load atomic i64, ptr %a seq_cst, align 8
- // RV64IA: store atomic i64 %b, ptr %a seq_cst, align 8
- // RV64IA: atomicrmw add ptr %a, i64 %b seq_cst, align 8
- __c11_atomic_load(a, memory_order_seq_cst);
- __c11_atomic_store(a, b, memory_order_seq_cst);
- __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
+ __c11_atomic_load(a, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (0 bytes)}}
+ // small-atomics-warning@28 {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (4 bytes)}}
+ __c11_atomic_store(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (0 bytes)}}
+ // small-atomics-warning@30 {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (4 bytes)}}
+ __c11_atomic_fetch_add(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (0 bytes)}}
+ // small-atomics-warning@32 {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (4 bytes)}}
}
diff --git a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
index 4f6dcbc2c01e..8759df7b19c6 100644
--- a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
+++ b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
@@ -20,7 +20,8 @@ __int128 Des;
// CHECK-LABEL: @f1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @__atomic_load(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull [[AGG_RESULT:%.*]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2:![0-9]+]]
// CHECK-NEXT: ret void
//
__int128 f1() {
@@ -29,8 +30,8 @@ __int128 f1() {
// CHECK-LABEL: @f2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @__atomic_load(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Ret, i32 noundef signext 5)
-// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Ret, align 8, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP0]], ptr @Ret, align 8
// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
@@ -41,10 +42,8 @@ __int128 f2() {
// CHECK-LABEL: @f3(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[DOTATOMICTMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_store(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull [[DOTATOMICTMP]], i32 noundef signext 5)
+// CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8
// CHECK-NEXT: ret void
//
void f3() {
@@ -53,7 +52,8 @@ void f3() {
// CHECK-LABEL: @f4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @__atomic_store(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Val, i32 noundef signext 5)
+// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8
+// CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8
// CHECK-NEXT: ret void
//
void f4() {
@@ -62,10 +62,9 @@ void f4() {
// CHECK-LABEL: @f5(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[DOTATOMICTMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull [[DOTATOMICTMP]], ptr noundef nonnull [[AGG_RESULT:%.*]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f5() {
@@ -74,9 +73,10 @@ __int128 f5() {
// CHECK-LABEL: @f6(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @__atomic_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Val, ptr noundef nonnull @Ret, i32 noundef signext 5)
-// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Ret, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr @Ret, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f6() {
@@ -86,11 +86,17 @@ __int128 f6() {
// CHECK-LABEL: @f7(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[DOTATOMICTMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Exp, ptr noundef nonnull [[DOTATOMICTMP]], i32 noundef signext 5, i32 noundef signext 5)
-// CHECK-NEXT: ret i1 [[CALL]]
+// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Exp, align 8
+// CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP1]], i128 [[TMP0]] seq_cst seq_cst, align 8
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1
+// CHECK-NEXT: br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]]
+// CHECK: cmpxchg.store_expected:
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0
+// CHECK-NEXT: store i128 [[TMP4]], ptr @Exp, align 8
+// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]]
+// CHECK: cmpxchg.continue:
+// CHECK-NEXT: ret i1 [[TMP3]]
//
_Bool f7() {
return __atomic_compare_exchange_n(&Ptr, &Exp, Des, 0,
@@ -99,8 +105,17 @@ _Bool f7() {
// CHECK-LABEL: @f8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Exp, ptr noundef nonnull @Des, i32 noundef signext 5, i32 noundef signext 5)
-// CHECK-NEXT: ret i1 [[CALL]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Exp, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Des, align 8
+// CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 8
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1
+// CHECK-NEXT: br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]]
+// CHECK: cmpxchg.store_expected:
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0
+// CHECK-NEXT: store i128 [[TMP4]], ptr @Exp, align 8
+// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]]
+// CHECK: cmpxchg.continue:
+// CHECK-NEXT: ret i1 [[TMP3]]
//
_Bool f8() {
return __atomic_compare_exchange(&Ptr, &Exp, &Des, 0,
@@ -109,12 +124,8 @@ _Bool f8() {
// CHECK-LABEL: @f9(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_add_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
// CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]]
// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
@@ -125,12 +136,8 @@ __int128 f9() {
// CHECK-LABEL: @f10(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_sub_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
// CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]]
// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
@@ -141,12 +148,8 @@ __int128 f10() {
// CHECK-LABEL: @f11(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_and_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
// CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]]
// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
@@ -157,12 +160,8 @@ __int128 f11() {
// CHECK-LABEL: @f12(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_xor_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
// CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]]
// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
@@ -173,12 +172,8 @@ __int128 f12() {
// CHECK-LABEL: @f13(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_or_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
// CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]]
// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
@@ -189,12 +184,8 @@ __int128 f13() {
// CHECK-LABEL: @f14(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_nand_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
// CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]]
// CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1
// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
@@ -206,10 +197,9 @@ __int128 f14() {
// CHECK-LABEL: @f15(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_add_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f15() {
@@ -218,10 +208,9 @@ __int128 f15() {
// CHECK-LABEL: @f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_sub_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f16() {
@@ -230,10 +219,9 @@ __int128 f16() {
// CHECK-LABEL: @f17(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_and_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f17() {
@@ -242,10 +230,9 @@ __int128 f17() {
// CHECK-LABEL: @f18(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_xor_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f18() {
@@ -254,10 +241,9 @@ __int128 f18() {
// CHECK-LABEL: @f19(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_or_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f19() {
@@ -266,10 +252,9 @@ __int128 f19() {
// CHECK-LABEL: @f20(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT: call void @__atomic_fetch_nand_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
// CHECK-NEXT: ret void
//
__int128 f20() {
diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c b/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
index e6eb98b027bf..5ac8fd138914 100644
--- a/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
+++ b/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
@@ -1,5 +1,5 @@
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-eabi -target-feature +neon -S -O2 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64 -target-feature +neon -S -O2 -o - %s | FileCheck %s
#include <stdarg.h>
#include <arm_neon.h>
diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed.c b/clang/test/CodeGen/aarch64-ABI-align-packed.c
index 03f4834ccd0b..93b81d50261b 100644
--- a/clang/test/CodeGen/aarch64-ABI-align-packed.c
+++ b/clang/test/CodeGen/aarch64-ABI-align-packed.c
@@ -1,5 +1,5 @@
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-eabi -target-feature +neon -emit-llvm -O2 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64 -target-feature +neon -emit-llvm -O2 -o - %s | FileCheck %s
#include <stdarg.h>
#include <arm_neon.h>
diff --git a/clang/test/CodeGen/aarch64-fix-cortex-a53-835769.c b/clang/test/CodeGen/aarch64-fix-cortex-a53-835769.c
index e5d70564d57b..baef74b4c18c 100644
--- a/clang/test/CodeGen/aarch64-fix-cortex-a53-835769.c
+++ b/clang/test/CodeGen/aarch64-fix-cortex-a53-835769.c
@@ -1,8 +1,8 @@
-// RUN: %clang -O3 -target aarch64-linux-eabi %s -S -o- \
+// RUN: %clang -O3 --target=aarch64 %s -S -o- \
// RUN: | FileCheck --check-prefix=CHECK-NO --check-prefix=CHECK %s
-// RUN: %clang -O3 -target aarch64-linux-eabi -mfix-cortex-a53-835769 %s -S -o- 2>&1 \
+// RUN: %clang -O3 --target=aarch64 -mfix-cortex-a53-835769 %s -S -o- 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-YES --check-prefix=CHECK %s
-// RUN: %clang -O3 -target aarch64-linux-eabi -mno-fix-cortex-a53-835769 %s -S -o- 2>&1 \
+// RUN: %clang -O3 --target=aarch64 -mno-fix-cortex-a53-835769 %s -S -o- 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NO --check-prefix=CHECK %s
// RUN: %clang -O3 --target=aarch64-linux-androideabi %s -S -o- \
diff --git a/clang/test/CodeGen/aarch64-ls64-inline-asm.c b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
index 744d6919b05e..0ba12ab47ae5 100644
--- a/clang/test/CodeGen/aarch64-ls64-inline-asm.c
+++ b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
struct foo { unsigned long long x[8]; };
diff --git a/clang/test/CodeGen/aarch64-ls64.c b/clang/test/CodeGen/aarch64-ls64.c
index 8a61a9643dd3..c20be13ed13c 100644
--- a/clang/test/CodeGen/aarch64-ls64.c
+++ b/clang/test/CodeGen/aarch64-ls64.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck --check-prefixes=CHECK-C %s
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck --check-prefixes=CHECK-CXX %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck --check-prefixes=CHECK-C %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck --check-prefixes=CHECK-CXX %s
// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck --check-prefixes=CHECK-C %s
// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck --check-prefixes=CHECK-CXX %s
diff --git a/clang/test/CodeGen/aarch64-matmul.cpp b/clang/test/CodeGen/aarch64-matmul.cpp
index 677d8bf9735b..58deda1c612c 100644
--- a/clang/test/CodeGen/aarch64-matmul.cpp
+++ b/clang/test/CodeGen/aarch64-matmul.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
#ifdef __ARM_FEATURE_MATMUL_INT8
extern "C" void arm_feature_matmulint8_defined() {}
diff --git a/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c b/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c
index ab7d75292318..40c5a0a598d6 100644
--- a/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c
+++ b/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon \
// RUN: -target-feature +rcpc3 -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg | FileCheck %s
diff --git a/clang/test/CodeGen/aarch64-targetattr-arch.c b/clang/test/CodeGen/aarch64-targetattr-arch.c
index 86ddeac0b9e6..ed731d037862 100644
--- a/clang/test/CodeGen/aarch64-targetattr-arch.c
+++ b/clang/test/CodeGen/aarch64-targetattr-arch.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +v8a -verify -DHAS8 -S %s -o -
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +v8.1a -verify -DHAS81 -S %s -o -
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +v9a -verify -DHAS9 -S %s -o -
+// RUN: %clang_cc1 -triple aarch64 -target-feature +v8a -verify -DHAS8 -S %s -o -
+// RUN: %clang_cc1 -triple aarch64 -target-feature +v8.1a -verify -DHAS81 -S %s -o -
+// RUN: %clang_cc1 -triple aarch64 -target-feature +v9a -verify -DHAS9 -S %s -o -
// REQUIRES: aarch64-registered-target
#ifdef HAS9
diff --git a/clang/test/CodeGen/aarch64-targetattr-crypto.c b/clang/test/CodeGen/aarch64-targetattr-crypto.c
index d3609240fbd5..006a394be777 100644
--- a/clang/test/CodeGen/aarch64-targetattr-crypto.c
+++ b/clang/test/CodeGen/aarch64-targetattr-crypto.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +v8a -verify -S %s -o -
+// RUN: %clang_cc1 -triple aarch64 -target-feature +v8a -verify -S %s -o -
// REQUIRES: aarch64-registered-target
#include <arm_neon.h>
diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c
index 1a3a84a73dba..bf4c1476d881 100644
--- a/clang/test/CodeGen/aarch64-targetattr.c
+++ b/clang/test/CodeGen/aarch64-targetattr.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-eabi -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -S -emit-llvm %s -o - | FileCheck %s
// CHECK-LABEL: @v82() #0
__attribute__((target("arch=armv8.2-a")))
diff --git a/clang/test/CodeGen/aarch64-tme.cpp b/clang/test/CodeGen/aarch64-tme.cpp
index 5004751cb300..096a8e4248f6 100644
--- a/clang/test/CodeGen/aarch64-tme.cpp
+++ b/clang/test/CodeGen/aarch64-tme.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +tme -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -DUSE_ACLE -triple aarch64-eabi -target-feature +tme -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +tme -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -DUSE_ACLE -triple aarch64 -target-feature +tme -S -emit-llvm %s -o - | FileCheck %s
#define A -1
constexpr int f() { return 65536; }
diff --git a/clang/test/CodeGen/arm-atomics-m.c b/clang/test/CodeGen/arm-atomics-m.c
index b9cc72bc6b98..6087fd9d6a66 100644
--- a/clang/test/CodeGen/arm-atomics-m.c
+++ b/clang/test/CodeGen/arm-atomics-m.c
@@ -22,14 +22,14 @@ void test_presence(void)
r = 0;
__atomic_store(&i, &r, memory_order_seq_cst);
- // CHECK: __atomic_fetch_add_8
+ // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
__atomic_fetch_add(&l, 1, memory_order_seq_cst);
- // CHECK: __atomic_fetch_sub_8
+ // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
__atomic_fetch_sub(&l, 1, memory_order_seq_cst);
- // CHECK: __atomic_load_8
+ // CHECK: load atomic i64, ptr {{.*}} seq_cst, align 8
long long rl;
__atomic_load(&l, &rl, memory_order_seq_cst);
- // CHECK: __atomic_store_8
+ // CHECK: store atomic i64 {{.*}}, ptr {{.*}} seq_cst, align 8
rl = 0;
__atomic_store(&l, &rl, memory_order_seq_cst);
}
diff --git a/clang/test/CodeGen/arm-atomics-m0.c b/clang/test/CodeGen/arm-atomics-m0.c
index 335a1d2711f8..94e344cf608d 100644
--- a/clang/test/CodeGen/arm-atomics-m0.c
+++ b/clang/test/CodeGen/arm-atomics-m0.c
@@ -11,25 +11,25 @@ typedef enum memory_order {
void test_presence(void)
{
// CHECK-LABEL: @test_presence
- // CHECK: __atomic_fetch_add_4
+ // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
__atomic_fetch_add(&i, 1, memory_order_seq_cst);
- // CHECK: __atomic_fetch_sub_4
+ // CHECK: atomicrmw sub {{.*}} seq_cst, align 4
__atomic_fetch_sub(&i, 1, memory_order_seq_cst);
- // CHECK: __atomic_load_4
+ // CHECK: load atomic i32, ptr {{.*}} seq_cst, align 4
int r;
__atomic_load(&i, &r, memory_order_seq_cst);
- // CHECK: __atomic_store_4
+ // CHECK: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
r = 0;
__atomic_store(&i, &r, memory_order_seq_cst);
- // CHECK: __atomic_fetch_add_8
+ // CHECK: atomicrmw add {{.*}} seq_cst, align 8
__atomic_fetch_add(&l, 1, memory_order_seq_cst);
- // CHECK: __atomic_fetch_sub_8
+ // CHECK: atomicrmw sub {{.*}} seq_cst, align 8
__atomic_fetch_sub(&l, 1, memory_order_seq_cst);
- // CHECK: __atomic_load_8
+ // CHECK: load atomic i64, ptr {{.*}} seq_cst, align 8
long long rl;
__atomic_load(&l, &rl, memory_order_seq_cst);
- // CHECK: __atomic_store_8
+ // CHECK: store atomic i64 {{.*}}, ptr {{.*}} seq_cst, align 8
rl = 0;
__atomic_store(&l, &rl, memory_order_seq_cst);
}
diff --git a/clang/test/CodeGen/arm64-mte.c b/clang/test/CodeGen/arm64-mte.c
index 1c65d6a626dd..7dde23cfd8e7 100644
--- a/clang/test/CodeGen/arm64-mte.c
+++ b/clang/test/CodeGen/arm64-mte.c
@@ -1,6 +1,6 @@
// Test memory tagging extension intrinsics
-// RUN: %clang_cc1 -triple aarch64-none-linux-eabi -target-feature +mte -O3 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-eabi -DMTE -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +mte -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -DMTE -O3 -S -emit-llvm -o - %s | FileCheck %s
#include <stddef.h>
#include <arm_acle.h>
diff --git a/clang/test/CodeGen/atomic-ops-libcall.c b/clang/test/CodeGen/atomic-ops-libcall.c
index 745ccd22bf33..38a23f7236ce 100644
--- a/clang/test/CodeGen/atomic-ops-libcall.c
+++ b/clang/test/CodeGen/atomic-ops-libcall.c
@@ -1,120 +1,338 @@
-// RUN: %clang_cc1 < %s -triple armv5e-none-linux-gnueabi -emit-llvm -O1 | FileCheck %s
-
-// FIXME: This file should not be checking -O1 output.
-// Ie, it is testing many IR optimizer passes as part of front-end verification.
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple armv5e-none-linux-gnueabi -emit-llvm %s -o - | FileCheck %s
enum memory_order {
memory_order_relaxed, memory_order_consume, memory_order_acquire,
memory_order_release, memory_order_acq_rel, memory_order_seq_cst
};
+// CHECK-LABEL: define dso_local ptr @test_c11_atomic_fetch_add_int_ptr(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 12, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret ptr [[TMP3]]
+//
int *test_c11_atomic_fetch_add_int_ptr(_Atomic(int *) *p) {
- // CHECK: test_c11_atomic_fetch_add_int_ptr
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 12, i32 noundef 5)
return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local ptr @test_c11_atomic_fetch_sub_int_ptr(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 20, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret ptr [[TMP3]]
+//
int *test_c11_atomic_fetch_sub_int_ptr(_Atomic(int *) *p) {
- // CHECK: test_c11_atomic_fetch_sub_int_ptr
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 20, i32 noundef 5)
return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_c11_atomic_fetch_add_int(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 3, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_c11_atomic_fetch_add_int(_Atomic(int) *p) {
- // CHECK: test_c11_atomic_fetch_add_int
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 3, i32 noundef 5)
return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_c11_atomic_fetch_sub_int(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 5, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_c11_atomic_fetch_sub_int(_Atomic(int) *p) {
- // CHECK: test_c11_atomic_fetch_sub_int
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 5, i32 noundef 5)
return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local ptr @fp2a(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 4, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret ptr [[TMP3]]
+//
int *fp2a(int **p) {
- // CHECK: @fp2a
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 4, i32 noundef 0)
// Note, the GNU builtins do not multiply by sizeof(T)!
return __atomic_fetch_sub(p, 4, memory_order_relaxed);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_add(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_atomic_fetch_add(int *p) {
- // CHECK: test_atomic_fetch_add
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
return __atomic_fetch_add(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_sub(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_atomic_fetch_sub(int *p) {
- // CHECK: test_atomic_fetch_sub
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
return __atomic_fetch_sub(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_and(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_atomic_fetch_and(int *p) {
- // CHECK: test_atomic_fetch_and
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_and_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
return __atomic_fetch_and(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_or(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw or ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_atomic_fetch_or(int *p) {
- // CHECK: test_atomic_fetch_or
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_or_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
return __atomic_fetch_or(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_xor(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw xor ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_atomic_fetch_xor(int *p) {
- // CHECK: test_atomic_fetch_xor
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_xor_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
return __atomic_fetch_xor(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_nand(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw nand ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
int test_atomic_fetch_nand(int *p) {
- // CHECK: test_atomic_fetch_nand
- // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_nand_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
return __atomic_fetch_nand(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_add_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP4]]
+//
int test_atomic_add_fetch(int *p) {
- // CHECK: test_atomic_add_fetch
- // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
- // CHECK: {{%[^ ]*}} = add i32 [[CALL]], 55
return __atomic_add_fetch(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_sub_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP4]]
+//
int test_atomic_sub_fetch(int *p) {
- // CHECK: test_atomic_sub_fetch
- // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
- // CHECK: {{%[^ ]*}} = add i32 [[CALL]], -55
return __atomic_sub_fetch(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_and_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP4]]
+//
int test_atomic_and_fetch(int *p) {
- // CHECK: test_atomic_and_fetch
- // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_and_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
- // CHECK: {{%[^ ]*}} = and i32 [[CALL]], 55
return __atomic_and_fetch(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_or_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw or ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP4]]
+//
int test_atomic_or_fetch(int *p) {
- // CHECK: test_atomic_or_fetch
- // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_or_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
- // CHECK: {{%[^ ]*}} = or i32 [[CALL]], 55
return __atomic_or_fetch(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_xor_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw xor ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP4]]
+//
int test_atomic_xor_fetch(int *p) {
- // CHECK: test_atomic_xor_fetch
- // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_xor_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
- // CHECK: {{%[^ ]*}} = xor i32 [[CALL]], 55
return __atomic_xor_fetch(p, 55, memory_order_seq_cst);
}
+// CHECK-LABEL: define dso_local i32 @test_atomic_nand_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT: store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw nand ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP3]], -1
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: ret i32 [[TMP5]]
+//
int test_atomic_nand_fetch(int *p) {
- // CHECK: test_atomic_nand_fetch
- // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_nand_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
- // FIXME: We should not be checking optimized IR. It changes independently of clang.
- // FIXME-CHECK: [[AND:%[^ ]*]] = and i32 [[CALL]], 55
- // FIXME-CHECK: {{%[^ ]*}} = xor i32 [[AND]], -1
return __atomic_nand_fetch(p, 55, memory_order_seq_cst);
}
diff --git a/clang/test/CodeGen/atomic-ops.c b/clang/test/CodeGen/atomic-ops.c
index 9ac05d270b97..b6060dcc540f 100644
--- a/clang/test/CodeGen/atomic-ops.c
+++ b/clang/test/CodeGen/atomic-ops.c
@@ -198,7 +198,8 @@ struct S implicit_load(_Atomic(struct S) *a) {
struct S fd1(struct S *a) {
// CHECK-LABEL: @fd1
// CHECK: [[RETVAL:%.*]] = alloca %struct.S, align 4
- // CHECK: call void @__atomic_load(i32 noundef 8, ptr noundef {{.*}}, ptr noundef [[RETVAL]], i32 noundef 5)
+ // CHECK: [[TMP1:%.*]] = load atomic i64, ptr {{%.*}} seq_cst, align 4
+ // CHECK-NEXT: store i64 [[TMP1]], ptr [[RETVAL]], align 4
// CHECK: ret
struct S ret;
__atomic_load(a, &ret, memory_order_seq_cst);
@@ -213,7 +214,8 @@ void fd2(struct S *a, struct S *b) {
// CHECK-NEXT: store ptr %b, ptr [[B_ADDR]], align 4
// CHECK-NEXT: [[LOAD_A_PTR:%.*]] = load ptr, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load ptr, ptr [[B_ADDR]], align 4
- // CHECK-NEXT: call void @__atomic_store(i32 noundef 8, ptr noundef [[LOAD_A_PTR]], ptr noundef [[LOAD_B_PTR]],
+ // CHECK-NEXT: [[LOAD_B:%.*]] = load i64, ptr [[LOAD_B_PTR]], align 4
+ // CHECK-NEXT: store atomic i64 [[LOAD_B]], ptr [[LOAD_A_PTR]] seq_cst, align 4
// CHECK-NEXT: ret void
__atomic_store(a, b, memory_order_seq_cst);
}
@@ -229,7 +231,9 @@ void fd3(struct S *a, struct S *b, struct S *c) {
// CHECK-NEXT: [[LOAD_A_PTR:%.*]] = load ptr, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load ptr, ptr [[B_ADDR]], align 4
// CHECK-NEXT: [[LOAD_C_PTR:%.*]] = load ptr, ptr [[C_ADDR]], align 4
- // CHECK-NEXT: call void @__atomic_exchange(i32 noundef 8, ptr noundef [[LOAD_A_PTR]], ptr noundef [[LOAD_B_PTR]], ptr noundef [[LOAD_C_PTR]],
+ // CHECK-NEXT: [[LOAD_B:%.*]] = load i64, ptr [[LOAD_B_PTR]], align 4
+ // CHECK-NEXT: [[RESULT:%.*]] = atomicrmw xchg ptr [[LOAD_A_PTR]], i64 [[LOAD_B]] seq_cst, align 4
+ // CHECK-NEXT: store i64 [[RESULT]], ptr [[LOAD_C_PTR]], align 4
__atomic_exchange(a, b, c, memory_order_seq_cst);
}
@@ -245,8 +249,9 @@ _Bool fd4(struct S *a, struct S *b, struct S *c) {
// CHECK-NEXT: [[LOAD_A_PTR:%.*]] = load ptr, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load ptr, ptr [[B_ADDR]], align 4
// CHECK-NEXT: [[LOAD_C_PTR:%.*]] = load ptr, ptr [[C_ADDR]], align 4
- // CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef [[LOAD_A_PTR]], ptr noundef [[LOAD_B_PTR]], ptr noundef [[LOAD_C_PTR]],
- // CHECK-NEXT: ret i1 [[CALL]]
+ // CHECK-NEXT: [[LOAD_B:%.*]] = load i64, ptr [[LOAD_B_PTR]], align 4
+ // CHECK-NEXT: [[LOAD_C:%.*]] = load i64, ptr [[LOAD_C_PTR]], align 4
+ // CHECK-NEXT: {{.*}} = cmpxchg weak ptr [[LOAD_A_PTR]], i64 [[LOAD_B]], i64 [[LOAD_C]] seq_cst seq_cst, align 4
return __atomic_compare_exchange(a, b, c, 1, 5, 5);
}
@@ -682,13 +687,13 @@ void test_underaligned(void) {
// CHECK-LABEL: @test_underaligned
struct Underaligned { char c[8]; } underaligned_a, underaligned_b, underaligned_c;
- // CHECK: call void @__atomic_load(i32 noundef 8,
+ // CHECK: load atomic i64, {{.*}}, align 1
__atomic_load(&underaligned_a, &underaligned_b, memory_order_seq_cst);
- // CHECK: call void @__atomic_store(i32 noundef 8,
+ // CHECK: store atomic i64 {{.*}}, align 1
__atomic_store(&underaligned_a, &underaligned_b, memory_order_seq_cst);
- // CHECK: call void @__atomic_exchange(i32 noundef 8,
+ // CHECK: atomicrmw xchg ptr {{.*}}, align 1
__atomic_exchange(&underaligned_a, &underaligned_b, &underaligned_c, memory_order_seq_cst);
- // CHECK: call {{.*}} @__atomic_compare_exchange(i32 noundef 8,
+ // CHECK: cmpxchg weak ptr {{.*}}, align 1
__atomic_compare_exchange(&underaligned_a, &underaligned_b, &underaligned_c, 1, memory_order_seq_cst, memory_order_seq_cst);
__attribute__((aligned)) struct Underaligned aligned_a, aligned_b, aligned_c;
@@ -747,7 +752,7 @@ void test_minmax_postop(int *si, unsigned *ui, unsigned short *us, signed char *
// CHECK: [[NEW:%.*]] = select i1 [[TST]], i32 [[OLD]], i32 [[RHS]]
// CHECK: store i32 [[NEW]], ptr
*si = __atomic_min_fetch(si, 42, memory_order_release);
-
+
// CHECK: [[OLD:%.*]] = atomicrmw umax ptr [[PTR:%.*]], i32 [[RHS:%.*]] release, align 4
// CHECK: [[TST:%.*]] = icmp ugt i32 [[OLD]], [[RHS]]
// CHECK: [[NEW:%.*]] = select i1 [[TST]], i32 [[OLD]], i32 [[RHS]]
@@ -772,7 +777,7 @@ void test_minmax_postop(int *si, unsigned *ui, unsigned short *us, signed char *
// CHECK: store i8 [[NEW]], ptr
*sc = __atomic_min_fetch(sc, 42, memory_order_release);
- // CHECK: [[OLD:%.*]] = call i64 @__atomic_fetch_umin_8(ptr noundef {{%.*}}, i64 noundef [[RHS:%.*]],
+ // CHECK: [[OLD:%.*]] = atomicrmw umin ptr {{%.*}}, i64 [[RHS:%.*]] release, align 4
// CHECK: [[TST:%.*]] = icmp ult i64 [[OLD]], [[RHS]]
// CHECK: [[NEW:%.*]] = select i1 [[TST]], i64 [[OLD]], i64 [[RHS]]
// CHECK: store i64 [[NEW]], ptr
diff --git a/clang/test/CodeGen/atomics-inlining.c b/clang/test/CodeGen/atomics-inlining.c
index 862c63076b2d..217a294ee84a 100644
--- a/clang/test/CodeGen/atomics-inlining.c
+++ b/clang/test/CodeGen/atomics-inlining.c
@@ -38,14 +38,14 @@ void test1(void) {
(void)__atomic_store(&a1, &a2, memory_order_seq_cst);
// ARM-LABEL: define{{.*}} void @test1
-// ARM: = call{{.*}} zeroext i8 @__atomic_load_1(ptr noundef @c1
-// ARM: call{{.*}} void @__atomic_store_1(ptr noundef @c1, i8 noundef zeroext
-// ARM: = call{{.*}} zeroext i16 @__atomic_load_2(ptr noundef @s1
-// ARM: call{{.*}} void @__atomic_store_2(ptr noundef @s1, i16 noundef zeroext
-// ARM: = call{{.*}} i32 @__atomic_load_4(ptr noundef @i1
-// ARM: call{{.*}} void @__atomic_store_4(ptr noundef @i1, i32 noundef
-// ARM: = call{{.*}} i64 @__atomic_load_8(ptr noundef @ll1
-// ARM: call{{.*}} void @__atomic_store_8(ptr noundef @ll1, i64 noundef
+// ARM: = load atomic i8, ptr @c1 seq_cst, align 1
+// ARM: store atomic i8 {{.*}}, ptr @c1 seq_cst, align 1
+// ARM: = load atomic i16, ptr @s1 seq_cst, align 2
+// ARM: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
+// ARM: = load atomic i32, ptr @i1 seq_cst, align 4
+// ARM: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
+// ARM: = load atomic i64, ptr @ll1 seq_cst, align 8
+// ARM: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
// ARM: call{{.*}} void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
// ARM: call{{.*}} void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
@@ -56,8 +56,8 @@ void test1(void) {
// PPC32: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
// PPC32: = load atomic i32, ptr @i1 seq_cst, align 4
// PPC32: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
-// PPC32: = call i64 @__atomic_load_8(ptr noundef @ll1
-// PPC32: call void @__atomic_store_8(ptr noundef @ll1, i64
+// PPC32: = load atomic i64, ptr @ll1 seq_cst, align 8
+// PPC32: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
// PPC32: call void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
// PPC32: call void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
@@ -80,8 +80,8 @@ void test1(void) {
// MIPS32: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
// MIPS32: = load atomic i32, ptr @i1 seq_cst, align 4
// MIPS32: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
-// MIPS32: call i64 @__atomic_load_8(ptr noundef @ll1
-// MIPS32: call void @__atomic_store_8(ptr noundef @ll1, i64
+// MIPS32: = load atomic i64, ptr @ll1 seq_cst, align 8
+// MIPS32: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
// MIPS32: call void @__atomic_load(i32 noundef signext 100, ptr noundef @a1, ptr noundef @a2
// MIPS32: call void @__atomic_store(i32 noundef signext 100, ptr noundef @a1, ptr noundef @a2
@@ -94,7 +94,7 @@ void test1(void) {
// MIPS64: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
// MIPS64: = load atomic i64, ptr @ll1 seq_cst, align 8
// MIPS64: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
-// MIPS64: call void @__atomic_load(i64 noundef zeroext 100, ptr noundef @a1
+// MIPS64: call void @__atomic_load(i64 noundef zeroext 100, ptr noundef @a1, ptr noundef @a2
// MIPS64: call void @__atomic_store(i64 noundef zeroext 100, ptr noundef @a1, ptr noundef @a2
// SPARC-LABEL: define{{.*}} void @test1
@@ -104,12 +104,12 @@ void test1(void) {
// SPARC: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
// SPARC: = load atomic i32, ptr @i1 seq_cst, align 4
// SPARC: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
-// SPARCV8: call i64 @__atomic_load_8(ptr noundef @ll1
-// SPARCV8: call void @__atomic_store_8(ptr noundef @ll1, i64
-// SPARCV9: load atomic i64, ptr @ll1 seq_cst, align 8
-// SPARCV9: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
+// SPARC: load atomic i64, ptr @ll1 seq_cst, align 8
+// SPARC: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
// SPARCV8: call void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
// SPARCV8: call void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
+// SPARCV9: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
+// SPARCV9: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
// NVPTX-LABEL: define{{.*}} void @test1
// NVPTX: = load atomic i8, ptr @c1 seq_cst, align 1
@@ -120,7 +120,7 @@ void test1(void) {
// NVPTX: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
// NVPTX: = load atomic i64, ptr @ll1 seq_cst, align 8
// NVPTX: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
-// NVPTX: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2, i32 noundef 5)
-// NVPTX: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2, i32 noundef 5)
+// NVPTX: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
+// NVPTX: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
}
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
index a7b3123e61cd..20fb4a04564c 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
@@ -177,29 +177,26 @@ void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) {
// CHECK-64-LABEL: @read_bool1(
// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8
// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT: [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> undef, <8 x i8> [[TMP0]], i64 0)
+// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
// CHECK-64-NEXT: ret <vscale x 64 x i1> [[TMP1]]
//
// CHECK-128-LABEL: @read_bool1(
// CHECK-128-NEXT: entry:
-// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i8>, align 16
// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16
// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]]
-// CHECK-128-NEXT: store <16 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]]
-// CHECK-128-NEXT: [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]]
+// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v16i8(<vscale x 8 x i8> undef, <16 x i8> [[TMP0]], i64 0)
+// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
// CHECK-128-NEXT: ret <vscale x 64 x i1> [[TMP1]]
//
// CHECK-256-LABEL: @read_bool1(
// CHECK-256-NEXT: entry:
-// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32
// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32
// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[TBAA4]]
-// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
-// CHECK-256-NEXT: [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
+// CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
// CHECK-256-NEXT: ret <vscale x 64 x i1> [[TMP1]]
//
vbool1_t read_bool1(struct struct_bool1 *s) {
@@ -208,29 +205,26 @@ vbool1_t read_bool1(struct struct_bool1 *s) {
// CHECK-64-LABEL: @write_bool1(
// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-64-NEXT: store <vscale x 64 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[X:%.*]] to <vscale x 8 x i8>
+// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8
-// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
// CHECK-64-NEXT: ret void
//
// CHECK-128-LABEL: @write_bool1(
// CHECK-128-NEXT: entry:
-// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 16
-// CHECK-128-NEXT: store <vscale x 64 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SAVED_VALUE]], align 16, !tbaa [[TBAA4]]
+// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[X:%.*]] to <vscale x 8 x i8>
+// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16
-// CHECK-128-NEXT: store <16 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
+// CHECK-128-NEXT: store <16 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
// CHECK-128-NEXT: ret void
//
// CHECK-256-LABEL: @write_bool1(
// CHECK-256-NEXT: entry:
-// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-256-NEXT: store <vscale x 64 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[X:%.*]] to <vscale x 8 x i8>
+// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32
-// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]]
// CHECK-256-NEXT: ret void
//
void write_bool1(struct struct_bool1 *s, vbool1_t x) {
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
index 888abe1a7bc3..1824d97d04dd 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
@@ -70,13 +70,7 @@ fixed_float64m1_t call_float64_ff(fixed_float64m1_t op1, fixed_float64m1_t op2)
// CHECK-LABEL: @call_bool1_ff(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SAVED_VALUE4:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1_COERCE:%.*]], <vscale x 64 x i1> [[OP2_COERCE:%.*]], i64 256)
-// CHECK-NEXT: store <vscale x 64 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE4]], align 8, !tbaa [[TBAA8:![0-9]+]]
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0:%.*]], <vscale x 64 x i1> [[TMP1:%.*]], i64 256)
// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP2]]
//
fixed_bool1_t call_bool1_ff(fixed_bool1_t op1, fixed_bool1_t op2) {
@@ -116,14 +110,8 @@ fixed_float64m1_t call_float64_fs(fixed_float64m1_t op1, vfloat64m1_t op2) {
// CHECK-LABEL: @call_bool1_fs(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SAVED_VALUE2:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1_COERCE:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
-// CHECK-NEXT: store <vscale x 64 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA4]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE2]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP1]]
//
fixed_bool1_t call_bool1_fs(fixed_bool1_t op1, vbool1_t op2) {
return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
@@ -162,14 +150,8 @@ fixed_float64m1_t call_float64_ss(vfloat64m1_t op1, vfloat64m1_t op2) {
// CHECK-LABEL: @call_bool1_ss(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[OP1:%.*]], <vscale x 64 x i1> [[OP2:%.*]], i64 256)
-// CHECK-NEXT: store <vscale x 64 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]]
//
fixed_bool1_t call_bool1_ss(vbool1_t op1, vbool1_t op2) {
return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen);
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
index fe278174bf68..3806c3e1b30b 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
@@ -65,13 +65,7 @@ fixed_float64m1_t from_vfloat64m1_t(vfloat64m1_t type) {
// CHECK-LABEL: @from_vbool1_t(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-NEXT: store <vscale x 64 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA8:![0-9]+]]
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP1]]
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TYPE:%.*]]
//
fixed_bool1_t from_vbool1_t(vbool1_t type) {
return type;
@@ -79,7 +73,7 @@ fixed_bool1_t from_vbool1_t(vbool1_t type) {
// CHECK-LABEL: @to_vbool1_t(
// CHECK-NEXT: entry:
-// CHECK-NEXT: ret <vscale x 64 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0:%.*]]
//
vbool1_t to_vbool1_t(fixed_bool1_t type) {
return type;
@@ -105,8 +99,8 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT: store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8]]
+// CHECK-NEXT: store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
// CHECK-NEXT: ret <vscale x 2 x i1> [[TMP1]]
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
index ac22bdce0da3..eb769fadda9a 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
@@ -53,25 +53,24 @@ fixed_bool32_t global_bool32;
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca <vscale x 64 x i1>, align 1
// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca <vscale x 64 x i8>, align 1
// CHECK-NEXT: [[MASK:%.*]] = alloca <vscale x 64 x i1>, align 1
-// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32
// CHECK-NEXT: store <vscale x 64 x i1> [[M:%.*]], ptr [[M_ADDR]], align 1
// CHECK-NEXT: store <vscale x 64 x i8> [[VEC:%.*]], ptr [[VEC_ADDR]], align 1
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 64 x i1>, ptr [[M_ADDR]], align 1
// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @global_bool1, align 8
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 32
-// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 32
+// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 64 x i1> @llvm.riscv.vmand.nxv64i1.i64(<vscale x 64 x i1> [[TMP0]], <vscale x 64 x i1> [[TMP2]], i64 256)
// CHECK-NEXT: store <vscale x 64 x i1> [[TMP3]], ptr [[MASK]], align 1
// CHECK-NEXT: [[TMP4:%.*]] = load <vscale x 64 x i1>, ptr [[MASK]], align 1
// CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 64 x i8>, ptr [[VEC_ADDR]], align 1
// CHECK-NEXT: [[TMP6:%.*]] = load <256 x i8>, ptr @global_vec_int8m8, align 8
-// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP6]], i64 0)
-// CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vadd.mask.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[TMP5]], <vscale x 64 x i8> [[CAST_SCALABLE]], <vscale x 64 x i1> [[TMP4]], i64 256, i64 3)
+// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP6]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vadd.mask.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[TMP5]], <vscale x 64 x i8> [[CAST_SCALABLE1]], <vscale x 64 x i1> [[TMP4]], i64 256, i64 3)
// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <256 x i8> @llvm.vector.extract.v256i8.nxv64i8(<vscale x 64 x i8> [[TMP7]], i64 0)
// CHECK-NEXT: store <256 x i8> [[CAST_FIXED]], ptr [[RETVAL]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load <256 x i8>, ptr [[RETVAL]], align 8
-// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP8]], i64 0)
-// CHECK-NEXT: ret <vscale x 64 x i8> [[CAST_SCALABLE1]]
+// CHECK-NEXT: [[CAST_SCALABLE2:%.*]] = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.v256i8(<vscale x 64 x i8> undef, <256 x i8> [[TMP8]], i64 0)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[CAST_SCALABLE2]]
//
fixed_int8m8_t test_bool1(vbool1_t m, vint8m8_t vec) {
vbool1_t mask = __riscv_vmand(m, global_bool1, __riscv_v_fixed_vlen);
@@ -181,15 +180,15 @@ fixed_int32m1_t array_arg(fixed_int32m1_t arr[]) {
// CHECK-NEXT: [[RETVAL:%.*]] = alloca <32 x i8>, align 8
// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <32 x i8>], align 8
// CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 64 x i1>, align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <32 x i8>], ptr [[ARR]], i64 0, i64 0
// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 8
// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 8 [[RETVAL]], i64 32, i1 false)
-// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 64 x i1>, ptr [[RETVAL_COERCE]], align 8
-// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP2]]
+// CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
+// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP3]]
//
fixed_bool1_t address_of_array_idx_bool1() {
fixed_bool1_t arr[3];
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
index d7df1a24bbfb..31a245dcb224 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
@@ -56,18 +56,16 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; }
// CHECK-64-LABEL: @write_global_bool1(
// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-64-NEXT: store <vscale x 64 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[V:%.*]] to <vscale x 8 x i8>
+// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
+// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
// CHECK-64-NEXT: ret void
//
// CHECK-256-LABEL: @write_global_bool1(
// CHECK-256-NEXT: entry:
-// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 64 x i1>, align 8
-// CHECK-256-NEXT: store <vscale x 64 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA7:![0-9]+]]
-// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i1> [[V:%.*]] to <vscale x 8 x i8>
+// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8(<vscale x 8 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[TBAA4]]
// CHECK-256-NEXT: ret void
//
void write_global_bool1(vbool1_t v) { global_bool1 = v; }
@@ -92,7 +90,7 @@ void write_global_bool4(vbool4_t v) { global_bool4 = v; }
// CHECK-256-LABEL: @write_global_bool32(
// CHECK-256-NEXT: entry:
// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-256-NEXT: store <vscale x 2 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
+// CHECK-256-NEXT: store <vscale x 2 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA7:![0-9]+]]
// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4]]
// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA4]]
// CHECK-256-NEXT: ret void
@@ -120,18 +118,16 @@ vint64m1_t read_global_i64() { return global_i64; }
// CHECK-64-LABEL: @read_global_bool1(
// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT: store <8 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
-// CHECK-64-NEXT: [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> undef, <8 x i8> [[TMP0]], i64 0)
+// CHECK-64-NEXT: [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
// CHECK-64-NEXT: ret <vscale x 64 x i1> [[TMP1]]
//
// CHECK-256-LABEL: @read_global_bool1(
// CHECK-256-NEXT: entry:
-// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <32 x i8>, align 32
// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[TBAA4]]
-// CHECK-256-NEXT: store <32 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
-// CHECK-256-NEXT: [[TMP1:%.*]] = load <vscale x 64 x i1>, ptr [[SAVED_VALUE]], align 32, !tbaa [[TBAA4]]
+// CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v32i8(<vscale x 8 x i8> undef, <32 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast <vscale x 8 x i8> [[CAST_SCALABLE]] to <vscale x 64 x i1>
// CHECK-256-NEXT: ret <vscale x 64 x i1> [[TMP1]]
//
vbool1_t read_global_bool1() { return global_bool1; }
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 2a96697e4291..c27d48f3ecf6 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -90,13 +90,20 @@ int hoo(void) {
//.
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
-// CHECK: @fmv.ifunc = weak_odr ifunc i32 (), ptr @fmv.resolver
-// CHECK: @fmv_one.ifunc = weak_odr ifunc i32 (), ptr @fmv_one.resolver
-// CHECK: @fmv_two.ifunc = weak_odr ifunc i32 (), ptr @fmv_two.resolver
-// CHECK: @fmv_e.ifunc = weak_odr ifunc i32 (), ptr @fmv_e.resolver
-// CHECK: @fmv_c.ifunc = weak_odr ifunc void (), ptr @fmv_c.resolver
-// CHECK: @fmv_inline.ifunc = weak_odr ifunc i32 (), ptr @fmv_inline.resolver
-// CHECK: @fmv_d.ifunc = internal ifunc i32 (), ptr @fmv_d.resolver
+// CHECK: @fmv.ifunc = weak_odr alias i32 (), ptr @fmv
+// CHECK: @fmv_one.ifunc = weak_odr alias i32 (), ptr @fmv_one
+// CHECK: @fmv_two.ifunc = weak_odr alias i32 (), ptr @fmv_two
+// CHECK: @fmv_e.ifunc = weak_odr alias i32 (), ptr @fmv_e
+// CHECK: @fmv_inline.ifunc = weak_odr alias i32 (), ptr @fmv_inline
+// CHECK: @fmv_d.ifunc = internal alias i32 (), ptr @fmv_d
+// CHECK: @fmv_c.ifunc = weak_odr alias void (), ptr @fmv_c
+// CHECK: @fmv = weak_odr ifunc i32 (), ptr @fmv.resolver
+// CHECK: @fmv_one = weak_odr ifunc i32 (), ptr @fmv_one.resolver
+// CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver
+// CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver
+// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver
+// CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver
+// CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver
//.
// CHECK: Function Attrs: noinline nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@fmv._MrngMflagmMfp16fml
@@ -105,6 +112,32 @@ int hoo(void) {
// CHECK-NEXT: ret i32 1
//
//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret i32 1
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret i32 1
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@foo
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv()
+// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one()
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two()
+// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
+// CHECK-NEXT: ret i32 [[ADD3]]
+//
+//
// CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat {
// CHECK-NEXT: resolver_entry:
// CHECK-NEXT: call void @__init_cpu_features_resolver()
@@ -183,42 +216,16 @@ int hoo(void) {
// CHECK-NEXT: ret ptr @fmv.default
//
//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64
-// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret i32 1
-//
-//
// CHECK-LABEL: define {{[^@]+}}@fmv_one.resolver() comdat {
// CHECK-NEXT: resolver_entry:
// CHECK-NEXT: ret ptr @fmv_one._MsimdMls64
//
//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret i32 1
-//
-//
// CHECK-LABEL: define {{[^@]+}}@fmv_two.resolver() comdat {
// CHECK-NEXT: resolver_entry:
// CHECK-NEXT: ret ptr @fmv_two._MsimdMfp16
//
//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@foo
-// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv.ifunc()
-// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one.ifunc()
-// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
-// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two.ifunc()
-// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
-// CHECK-NEXT: ret i32 [[ADD3]]
-//
-//
// CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat {
// CHECK-NEXT: resolver_entry:
// CHECK-NEXT: ret ptr @fmv_e._Mls64
@@ -238,28 +245,14 @@ int hoo(void) {
// CHECK-NEXT: ret void
//
//
-// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat {
-// CHECK-NEXT: resolver_entry:
-// CHECK-NEXT: call void @__init_cpu_features_resolver()
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656
-// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656
-// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK: resolver_return:
-// CHECK-NEXT: ret ptr @fmv_c._Mssbs
-// CHECK: resolver_else:
-// CHECK-NEXT: ret ptr @fmv_c.default
-//
-//
// CHECK: Function Attrs: noinline nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@goo
// CHECK-SAME: () #[[ATTR2]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline.ifunc()
-// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e.ifunc()
-// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d.ifunc()
-// CHECK-NEXT: call void @fmv_c.ifunc()
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline()
+// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e()
+// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d()
+// CHECK-NEXT: call void @fmv_c()
// CHECK-NEXT: [[CALL3:%.*]] = call i32 @fmv_default()
// CHECK-NEXT: ret i32 [[CALL3]]
//
@@ -412,6 +405,20 @@ int hoo(void) {
// CHECK-NEXT: ret ptr @fmv_d.default
//
//
+// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @fmv_c._Mssbs
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @fmv_c.default
+//
+//
// CHECK: Function Attrs: noinline nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@recur
// CHECK-SAME: () #[[ATTR2]] {
@@ -437,9 +444,9 @@ int hoo(void) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: call void @f(ptr noundef @fmv.ifunc)
-// CHECK-NEXT: store ptr @fmv.ifunc, ptr [[FP1]], align 8
-// CHECK-NEXT: store ptr @fmv.ifunc, ptr [[FP2]], align 8
+// CHECK-NEXT: call void @f(ptr noundef @fmv)
+// CHECK-NEXT: store ptr @fmv, ptr [[FP1]], align 8
+// CHECK-NEXT: store ptr @fmv, ptr [[FP2]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8
// CHECK-NEXT: [[CALL:%.*]] = call i32 [[TMP0]]()
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8
@@ -561,13 +568,6 @@ int hoo(void) {
//
//
// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_c.default
-// CHECK-SAME: () #[[ATTR2]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msha1MpmullMf64mm
// CHECK-SAME: () #[[ATTR12:[0-9]+]] {
// CHECK-NEXT: entry:
@@ -700,6 +700,13 @@ int hoo(void) {
// CHECK-NEXT: ret i32 1
//
//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@fmv_c.default
+// CHECK-SAME: () #[[ATTR2]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv
// CHECK-NOFMV-SAME: () #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index ad7c27f2d60d..4dba7670b5c4 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -44,6 +44,14 @@ __device__ int read_tid() {
}
+__device__ bool reflect() {
+
+// CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}})
+
+ unsigned x = __nvvm_reflect("__CUDA_ARCH");
+ return x >= 700;
+}
+
__device__ int read_ntid() {
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c
index ed03233b6f1a..88282120283b 100644
--- a/clang/test/CodeGen/builtins.c
+++ b/clang/test/CodeGen/builtins.c
@@ -496,6 +496,12 @@ long long test_builtin_readcyclecounter(void) {
return __builtin_readcyclecounter();
}
+// CHECK-LABEL: define{{.*}} i64 @test_builtin_readsteadycounter
+long long test_builtin_readsteadycounter(void) {
+ // CHECK: call i64 @llvm.readsteadycounter()
+ return __builtin_readsteadycounter();
+}
+
/// __builtin_launder should be a NOP in C since there are no vtables.
// CHECK-LABEL: define{{.*}} void @test_builtin_launder
void test_builtin_launder(int *p) {
diff --git a/clang/test/CodeGen/c11atomics.c b/clang/test/CodeGen/c11atomics.c
index dd1f52f70ae0..4da36ad4da0f 100644
--- a/clang/test/CodeGen/c11atomics.c
+++ b/clang/test/CodeGen/c11atomics.c
@@ -343,10 +343,9 @@ PS test_promoted_load(_Atomic(PS) *addr) {
// CHECK: [[ATOMIC_RES:%.*]] = alloca { %struct.PS, [2 x i8] }, align 8
// CHECK: store ptr %addr, ptr [[ADDR_ARG]], align 4
// CHECK: [[ADDR:%.*]] = load ptr, ptr [[ADDR_ARG]], align 4
- // CHECK: [[RES:%.*]] = call arm_aapcscc i64 @__atomic_load_8(ptr noundef [[ADDR]], i32 noundef 5)
- // CHECK: store i64 [[RES]], ptr [[ATOMIC_RES]], align 8
- // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES]], i32 6, i1 false)
-
+ // CHECK: [[ATOMIC_RES:%.*]] = load atomic i64, ptr [[ADDR]] seq_cst, align 8
+ // CHECK: store i64 [[ATOMIC_RES]], ptr [[ATOMIC_RES_ADDR:%.*]], align 8
+ // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES_ADDR]], i32 6, i1 false)
return __c11_atomic_load(addr, 5);
}
@@ -362,8 +361,8 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) {
// CHECK: [[VAL:%.*]] = load ptr, ptr [[VAL_ARG]], align 4
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[NONATOMIC_TMP]], ptr align 2 [[VAL]], i32 6, i1 false)
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_VAL]], ptr align 2 [[NONATOMIC_TMP]], i64 6, i1 false)
- // CHECK: [[VAL64:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 2
- // CHECK: call arm_aapcscc void @__atomic_store_8(ptr noundef [[ADDR]], i64 noundef [[VAL64]], i32 noundef 5)
+ // CHECK: [[ATOMIC:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 8
+ // CHECK: store atomic i64 [[ATOMIC]], ptr [[ADDR]] seq_cst, align 8
__c11_atomic_store(addr, *val, 5);
}
@@ -380,10 +379,10 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
// CHECK: [[VAL:%.*]] = load ptr, ptr [[VAL_ARG]], align 4
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[NONATOMIC_TMP]], ptr align 2 [[VAL]], i32 6, i1 false)
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_VAL]], ptr align 2 [[NONATOMIC_TMP]], i64 6, i1 false)
- // CHECK: [[VAL64:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 2
- // CHECK: [[RES:%.*]] = call arm_aapcscc i64 @__atomic_exchange_8(ptr noundef [[ADDR]], i64 noundef [[VAL64]], i32 noundef 5)
- // CHECK: store i64 [[RES]], ptr [[ATOMIC_RES]], align 8
- // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES]], i32 6, i1 false)
+ // CHECK: [[ATOMIC:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 8
+ // CHECK: [[ATOMIC_RES:%.*]] = atomicrmw xchg ptr [[ADDR]], i64 [[ATOMIC]] seq_cst, align 8
+ // CHECK: store i64 [[ATOMIC_RES]], ptr [[ATOMIC_RES_PTR:%.*]], align 8
+ // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES_PTR]], i32 6, i1 false)
return __c11_atomic_exchange(addr, *val, 5);
}
@@ -404,9 +403,10 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[NONATOMIC_TMP]], ptr align 2 [[NEW]], i32 6, i1 false)
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_DESIRED]], ptr align 2 [[DESIRED]], i64 6, i1 false)
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_NEW]], ptr align 2 [[NONATOMIC_TMP]], i64 6, i1 false)
- // CHECK: [[NEW64:%.*]] = load i64, ptr [[ATOMIC_NEW]], align 2
- // CHECK: [[RES:%.*]] = call arm_aapcscc zeroext i1 @__atomic_compare_exchange_8(ptr noundef [[ADDR]], ptr noundef [[ATOMIC_DESIRED]], i64 noundef [[NEW64]], i32 noundef 5, i32 noundef 5)
- // CHECK: ret i1 [[RES]]
+ // CHECK: [[VAL1:%.*]] = load i64, ptr [[ATOMIC_DESIRED]], align 8
+ // CHECK: [[VAL2:%.*]] = load i64, ptr [[ATOMIC_NEW]], align 8
+ // CHECK: [[RES_PAIR:%.*]] = cmpxchg ptr [[ADDR]], i64 [[VAL1]], i64 [[VAL2]] seq_cst seq_cst, align 8
+ // CHECK: [[RES:%.*]] = extractvalue { i64, i1 } [[RES_PAIR]], 1
return __c11_atomic_compare_exchange_strong(addr, desired, *new, 5, 5);
}
@@ -414,12 +414,12 @@ struct Empty {};
struct Empty test_empty_struct_load(_Atomic(struct Empty)* empty) {
// CHECK-LABEL: @test_empty_struct_load(
- // CHECK: call arm_aapcscc zeroext i8 @__atomic_load_1(ptr noundef %{{.*}}, i32 noundef 5)
+ // CHECK: load atomic i8, ptr {{.*}}, align 1
return __c11_atomic_load(empty, 5);
}
void test_empty_struct_store(_Atomic(struct Empty)* empty, struct Empty value) {
// CHECK-LABEL: @test_empty_struct_store(
- // CHECK: call arm_aapcscc void @__atomic_store_1(ptr noundef %{{.*}}, i8 noundef zeroext %{{.*}}, i32 noundef 5)
+ // CHECK: store atomic i8 {{.*}}, ptr {{.*}}, align 1
__c11_atomic_store(empty, value, 5);
}
diff --git a/clang/test/CodeGen/fp128_complex.c b/clang/test/CodeGen/fp128_complex.c
index 0e87cbe8ce81..d1593fae9c9b 100644
--- a/clang/test/CodeGen/fp128_complex.c
+++ b/clang/test/CodeGen/fp128_complex.c
@@ -1,4 +1,4 @@
-// RUN: %clang -target aarch64-linux-gnueabi %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang --target=aarch64 %s -S -emit-llvm -o - | FileCheck %s
_Complex long double a, b, c, d;
void test_fp128_compound_assign(void) {
diff --git a/clang/test/CodeGen/fp16-ops-strictfp.c b/clang/test/CodeGen/fp16-ops-strictfp.c
index aa096fc79695..25753e5b98be 100644
--- a/clang/test/CodeGen/fp16-ops-strictfp.c
+++ b/clang/test/CodeGen/fp16-ops-strictfp.c
@@ -1,10 +1,10 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - -triple arm-none-linux-gnueabi %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK -vv -dump-input=fail
-// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -emit-llvm -o - -triple aarch64-none-linux-gnueabi %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
+// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -emit-llvm -o - -triple aarch64 %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - -triple x86_64-linux-gnu %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - -triple arm-none-linux-gnueabi -fnative-half-type %s \
// RUN: | FileCheck %s --check-prefix=NATIVE-HALF --check-prefix=CHECK
-// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -emit-llvm -o - -triple aarch64-none-linux-gnueabi -fnative-half-type %s \
+// RUN: %clang_cc1 -ffp-exception-behavior=maytrap -emit-llvm -o - -triple aarch64 -fnative-half-type %s \
// RUN: | FileCheck %s --check-prefix=NATIVE-HALF --check-prefix=CHECK
//
// Test that the constrained intrinsics are picking up the exception
diff --git a/clang/test/CodeGen/fp16-ops.c b/clang/test/CodeGen/fp16-ops.c
index 0626e0aaed9d..bfa2a2f7f6c8 100644
--- a/clang/test/CodeGen/fp16-ops.c
+++ b/clang/test/CodeGen/fp16-ops.c
@@ -1,10 +1,10 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -emit-llvm -o - -triple arm-none-linux-gnueabi %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
-// RUN: %clang_cc1 -emit-llvm -o - -triple aarch64-none-linux-gnueabi %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
+// RUN: %clang_cc1 -emit-llvm -o - -triple aarch64 %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
// RUN: %clang_cc1 -emit-llvm -o - -triple x86_64-linux-gnu %s | FileCheck %s --check-prefix=NOTNATIVE --check-prefix=CHECK
// RUN: %clang_cc1 -emit-llvm -o - -triple arm-none-linux-gnueabi -fnative-half-type %s \
// RUN: | FileCheck %s --check-prefix=NATIVE-HALF
-// RUN: %clang_cc1 -emit-llvm -o - -triple aarch64-none-linux-gnueabi -fnative-half-type %s \
+// RUN: %clang_cc1 -emit-llvm -o - -triple aarch64 -fnative-half-type %s \
// RUN: | FileCheck %s --check-prefix=NATIVE-HALF
// RUN: %clang_cc1 -emit-llvm -o - -x renderscript %s \
// RUN: | FileCheck %s --check-prefix=NATIVE-HALF
diff --git a/clang/test/CodeGenCXX/atomic-inline.cpp b/clang/test/CodeGenCXX/atomic-inline.cpp
index 701bbd57b485..c8fa877a37be 100644
--- a/clang/test/CodeGenCXX/atomic-inline.cpp
+++ b/clang/test/CodeGenCXX/atomic-inline.cpp
@@ -42,7 +42,7 @@ AM16 m16;
AM16 load16() {
AM16 am;
// CHECK-LABEL: @_Z6load16v
- // CHECK: call void @__atomic_load
+ // CHECK: load atomic i128, {{.*}} monotonic, align 16
// CORE2-LABEL: @_Z6load16v
// CORE2: load atomic i128, {{.*}} monotonic, align 16
__atomic_load(&m16, &am, 0);
@@ -52,7 +52,7 @@ AM16 load16() {
AM16 s16;
void store16() {
// CHECK-LABEL: @_Z7store16v
- // CHECK: call void @__atomic_store
+ // CHECK: store atomic i128 {{.*}} monotonic, align 16
// CORE2-LABEL: @_Z7store16v
// CORE2: store atomic i128 {{.*}} monotonic, align 16
__atomic_store(&m16, &s16, 0);
@@ -61,7 +61,7 @@ void store16() {
bool cmpxchg16() {
AM16 am;
// CHECK-LABEL: @_Z9cmpxchg16v
- // CHECK: call noundef zeroext i1 @__atomic_compare_exchange
+ // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align 16
// CORE2-LABEL: @_Z9cmpxchg16v
// CORE2: cmpxchg ptr {{.*}} monotonic monotonic, align 16
return __atomic_compare_exchange(&m16, &s16, &am, 0, 0, 0);
diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp
index 68dd7be1180b..b63815db7e40 100644
--- a/clang/test/CodeGenCXX/attr-target-version.cpp
+++ b/clang/test/CodeGenCXX/attr-target-version.cpp
@@ -26,9 +26,12 @@ int bar() {
//.
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
-// CHECK: @_Z3fooi.ifunc = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver
-// CHECK: @_Z3foov.ifunc = weak_odr ifunc i32 (), ptr @_Z3foov.resolver
-// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver
+// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr alias i32 (ptr, i32), ptr @_ZN7MyClass3gooEi
+// CHECK: @_Z3fooi.ifunc = weak_odr alias i32 (i32), ptr @_Z3fooi
+// CHECK: @_Z3foov.ifunc = weak_odr alias i32 (), ptr @_Z3foov
+// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver
+// CHECK: @_Z3fooi = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver
+// CHECK: @_Z3foov = weak_odr ifunc i32 (), ptr @_Z3foov.resolver
//.
// CHECK-LABEL: @_Z3fooi._Mbf16Msme-f64f64(
// CHECK-NEXT: entry:
@@ -37,39 +40,11 @@ int bar() {
// CHECK-NEXT: ret i32 1
//
//
-// CHECK-LABEL: @_Z3fooi.resolver(
-// CHECK-NEXT: resolver_entry:
-// CHECK-NEXT: call void @__init_cpu_features_resolver()
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696
-// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696
-// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK: resolver_return:
-// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64
-// CHECK: resolver_else:
-// CHECK-NEXT: ret ptr @_Z3fooi.default
-//
-//
// CHECK-LABEL: @_Z3foov._Msm4Mebf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: ret i32 3
//
//
-// CHECK-LABEL: @_Z3foov.resolver(
-// CHECK-NEXT: resolver_entry:
-// CHECK-NEXT: call void @__init_cpu_features_resolver()
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488
-// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488
-// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK: resolver_return:
-// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16
-// CHECK: resolver_else:
-// CHECK-NEXT: ret ptr @_Z3foov.default
-//
-//
// CHECK-LABEL: @_ZN7MyClass3gooEi.resolver(
// CHECK-NEXT: resolver_entry:
// CHECK-NEXT: call void @__init_cpu_features_resolver()
@@ -95,24 +70,40 @@ int bar() {
// CHECK-LABEL: @_Z3barv(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1
-// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1)
-// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi.ifunc(i32 noundef 1)
+// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1)
+// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi(i32 noundef 1)
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
-// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov.ifunc()
+// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov()
// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
// CHECK-NEXT: ret i32 [[ADD3]]
//
//
-// CHECK-LABEL: @_Z3fooi.default(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
-// CHECK-NEXT: ret i32 2
+// CHECK-LABEL: @_Z3fooi.resolver(
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @_Z3fooi.default
//
//
-// CHECK-LABEL: @_Z3foov.default(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret i32 4
+// CHECK-LABEL: @_Z3foov.resolver(
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @_Z3foov.default
//
//
// CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod(
@@ -144,6 +135,18 @@ int bar() {
// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
// CHECK-NEXT: ret i32 1
//
+//
+// CHECK-LABEL: @_Z3fooi.default(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT: ret i32 2
+//
+//
+// CHECK-LABEL: @_Z3foov.default(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret i32 4
+//
//.
// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" }
// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" }
diff --git a/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp b/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp
index 0234e41009f6..d9f5e3eacac3 100644
--- a/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp
+++ b/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone -triple aarch64-arm-none-eabi %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone -triple aarch64 %s -o - | FileCheck %s
struct S0 {
unsigned int x : 16;
diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
index 2b9c66d8fc17..74b3f59bf760 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
RWBuffer<float> Buf;
@@ -7,3 +8,6 @@ RWBuffer<float> Buf;
// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
// CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl
new file mode 100644
index 000000000000..d046efaf9c1f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/shift-mask.hlsl
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: dxil-pc-shadermodel6.3-library %s \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+int shl32(int V, int S) {
+ return V << S;
+}
+
+// CHECK: define noundef i32 @"?shl32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31
+// CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]]
+
+int shr32(int V, int S) {
+ return V >> S;
+}
+
+// CHECK: define noundef i32 @"?shr32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31
+// CHECK-DAG: %{{.*}} = ashr i32 %{{.*}}, %[[Masked]]
+
+int64_t shl64(int64_t V, int64_t S) {
+ return V << S;
+}
+
+// CHECK: define noundef i64 @"?shl64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63
+// CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]]
+
+int64_t shr64(int64_t V, int64_t S) {
+ return V >> S;
+}
+
+// CHECK: define noundef i64 @"?shr64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63
+// CHECK-DAG: %{{.*}} = ashr i64 %{{.*}}, %[[Masked]]
diff --git a/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl b/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl
index 2f020c210821..d615ff6bec41 100644
--- a/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl
+++ b/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl
@@ -20,63 +20,60 @@ typedef enum memory_scope {
void f(atomic_int *i, global atomic_int *gi, local atomic_int *li, private atomic_int *pi, atomic_uint *ui, int cmp, int order, int scope) {
int x;
- // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_load_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_load_4(ptr noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: load atomic i32, ptr addrspace(4) {{.*}} seq_cst, align 4
+ // ARM: load atomic i32, ptr {{.*}} seq_cst, align 4
x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: store atomic i32 {{.*}}, ptr addrspace(4) {{.*}} seq_cst, align 4
+ // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
__opencl_atomic_store(i, 1, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: %[[GP:[0-9]+]] = addrspacecast ptr addrspace(1) {{%[0-9]+}} to ptr addrspace(4)
- // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef %[[GP]], i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: store atomic i32 {{.*}}, ptr addrspace(1) {{.*}} seq_cst, align 4
+ // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
__opencl_atomic_store(gi, 1, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: %[[GP:[0-9]+]] = addrspacecast ptr addrspace(3) {{%[0-9]+}} to ptr addrspace(4)
- // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef %[[GP]], i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: store atomic i32 {{.*}}, ptr addrspace(3) {{.*}} seq_cst, align 4
+ // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
__opencl_atomic_store(li, 1, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: %[[GP:[0-9]+]] = addrspacecast ptr {{%[0-9]+}} to ptr addrspace(4)
- // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef %[[GP]], i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
+ // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
__opencl_atomic_store(pi, 1, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_add_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_add_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: atomicrmw add ptr addrspace(4) {{.*}}, i32 {{.*}} seq_cst, align 4
+ // ARM: atomicrmw add ptr {{.*}}, i32 {{.*}} seq_cst, align 4
x = __opencl_atomic_fetch_add(i, 3, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_min_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_min_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: atomicrmw min ptr addrspace(4) {{.*}}, i32 {{.*}} seq_cst, align 4
+ // ARM: atomicrmw min ptr {{.*}}, i32 {{.*}} seq_cst, align 4
x = __opencl_atomic_fetch_min(i, 3, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_umin_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
- // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_umin_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+ // SPIR: atomicrmw umin ptr addrspace(4) {{.*}}, i32 {{.*}} seq_cst, align 4
+ // ARM: atomicrmw umin ptr {{.*}}, i32 {{.*}} seq_cst, align 4
x = __opencl_atomic_fetch_min(ui, 3, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
- // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
+ // SPIR: cmpxchg ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+ // ARM: cmpxchg ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
x = __opencl_atomic_compare_exchange_strong(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
- // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
+ // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+ // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group);
- // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 2)
- // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 2)
+ // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+ // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
- // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 3)
- // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 3)
+ // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+ // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_all_svm_devices);
#ifdef cl_khr_subgroups
- // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 4)
+ // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_sub_group);
#endif
- // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}})
- // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}})
+ // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+ // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, order, order, scope);
}
diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl
new file mode 100644
index 000000000000..9ae4a5f027d3
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/reflect.cl
@@ -0,0 +1,28 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -emit-llvm -O0 -o - | FileCheck %s
+
+// CHECK-LABEL: define dso_local zeroext i1 @device_function(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.reflect(ptr addrspacecast (ptr addrspace(4) @.str to ptr))
+// CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP0]], 700
+// CHECK-NEXT: ret i1 [[CMP]]
+//
+bool device_function() {
+ return __nvvm_reflect("__CUDA_ARCH") >= 700;
+}
+
+// CHECK-LABEL: define dso_local spir_kernel void @kernel_function(
+// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4
+// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CALL]] to i32
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4
+// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: ret void
+//
+__kernel void kernel_function(__global int *i) {
+ *i = device_function();
+}
diff --git a/clang/test/Driver/aarch64-cssc.c b/clang/test/Driver/aarch64-cssc.c
index a3e18663279b..5df0ea79d7c8 100644
--- a/clang/test/Driver/aarch64-cssc.c
+++ b/clang/test/Driver/aarch64-cssc.c
@@ -9,6 +9,7 @@
// RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a %s 2>&1 | FileCheck %s
// RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+cssc %s 2>&1 | FileCheck %s
// RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -march=armv9.4-a+nocssc %s 2>&1 | FileCheck %s --check-prefix=NO_CSSC
+// RUN: %clang -S -o - -emit-llvm --target=aarch64-none-elf -mcpu=ampere1b %s 2>&1 | FileCheck %s
// CHECK: "target-features"="{{.*}},+cssc
// NO_CSSC: "target-features"="{{.*}},-cssc
diff --git a/clang/test/Driver/aarch64-fix-cortex-a53-835769.c b/clang/test/Driver/aarch64-fix-cortex-a53-835769.c
index d7a2ad911261..84d8c1dde7a7 100644
--- a/clang/test/Driver/aarch64-fix-cortex-a53-835769.c
+++ b/clang/test/Driver/aarch64-fix-cortex-a53-835769.c
@@ -1,8 +1,8 @@
-// RUN: %clang --target=aarch64-linux-eabi %s -### 2>&1 \
+// RUN: %clang --target=aarch64 %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-DEF %s
-// RUN: %clang --target=aarch64-linux-eabi -mfix-cortex-a53-835769 %s -### 2>&1 \
+// RUN: %clang --target=aarch64 -mfix-cortex-a53-835769 %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-YES %s
-// RUN: %clang --target=aarch64-linux-eabi -mno-fix-cortex-a53-835769 %s -### 2>&1 \
+// RUN: %clang --target=aarch64 -mno-fix-cortex-a53-835769 %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NO %s
// RUN: %clang --target=aarch64-linux-androideabi %s -### 2>&1 \
diff --git a/clang/test/Driver/aarch64-mgeneral_regs_only.c b/clang/test/Driver/aarch64-mgeneral_regs_only.c
index 17da9c614a8a..93f1ca95826d 100644
--- a/clang/test/Driver/aarch64-mgeneral_regs_only.c
+++ b/clang/test/Driver/aarch64-mgeneral_regs_only.c
@@ -1,6 +1,6 @@
// Test the -mgeneral-regs-only option
-// RUN: %clang --target=aarch64-linux-eabi -mgeneral-regs-only %s -### 2>&1 \
+// RUN: %clang --target=aarch64 -mgeneral-regs-only %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NO-FP %s
// RUN: %clang --target=arm64-linux-eabi -mgeneral-regs-only %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NO-FP %s
diff --git a/clang/test/Driver/aarch64-pauth-lr.c b/clang/test/Driver/aarch64-pauth-lr.c
index 2e1b530fc989..00281fabba55 100644
--- a/clang/test/Driver/aarch64-pauth-lr.c
+++ b/clang/test/Driver/aarch64-pauth-lr.c
@@ -1,15 +1,15 @@
// Check the -cc1 flags for the various forms of -mbranch-protection=pac-ret+pc.
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI
-// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+b-key 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+leaf 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+b-key -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+leaf -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI
+// RUN: %clang --target=aarch64 -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI
// PAUTH-LR: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr"
// PAUTH-LR-B-KEY: "-msign-return-address=non-leaf" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr"
diff --git a/clang/test/Driver/aarch64-target-as-march.s b/clang/test/Driver/aarch64-target-as-march.s
index 59c0ca41cbe2..ae77d08c5ecc 100644
--- a/clang/test/Driver/aarch64-target-as-march.s
+++ b/clang/test/Driver/aarch64-target-as-march.s
@@ -2,42 +2,42 @@
/// via -Wa or -Xassembler are applied correctly to assembler inputs.
/// Does not apply to non assembly files
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.1-a \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.1-a \
// RUN: %S/Inputs/wildcard1.c 2>&1 | FileCheck --check-prefix=TARGET-FEATURE-1 %s
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Xassembler -march=armv8.1-a \
+// RUN: %clang --target=aarch64 -### -c -Xassembler -march=armv8.1-a \
// RUN: %S/Inputs/wildcard1.c 2>&1 | FileCheck --check-prefix=TARGET-FEATURE-1 %s
// TARGET-FEATURE-1-NOT: "-target-feature" "+v8.1a"
/// Does apply to assembler input
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.2-a %s 2>&1 | \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.2-a %s 2>&1 | \
// RUN: FileCheck --check-prefix=TARGET-FEATURE-2 %s
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Xassembler -march=armv8.2-a %s 2>&1 | \
+// RUN: %clang --target=aarch64 -### -c -Xassembler -march=armv8.2-a %s 2>&1 | \
// RUN: FileCheck --check-prefix=TARGET-FEATURE-2 %s
// TARGET-FEATURE-2: "-target-feature" "+v8.2a"
/// No unused argument warnings when there are multiple values
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.1-a -Wa,-march=armv8.2-a %s 2>&1 | \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.1-a -Wa,-march=armv8.2-a %s 2>&1 | \
// RUN: FileCheck --check-prefix=UNUSED-WARNING %s
// UNUSED-WARNING-NOT: warning: argument unused during compilation
/// Last march to assembler wins
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.2-a -Wa,-march=armv8.1-a %s 2>&1 | \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.2-a -Wa,-march=armv8.1-a %s 2>&1 | \
// RUN: FileCheck --check-prefix=MULTIPLE-VALUES %s
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.2-a,-march=armv8.1-a %s 2>&1 | \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.2-a,-march=armv8.1-a %s 2>&1 | \
// RUN: FileCheck --check-prefix=MULTIPLE-VALUES %s
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Xassembler -march=armv8.2-a -Xassembler \
+// RUN: %clang --target=aarch64 -### -c -Xassembler -march=armv8.2-a -Xassembler \
// RUN: -march=armv8.1-a %s 2>&1 | FileCheck --check-prefix=MULTIPLE-VALUES %s
// MULTIPLE-VALUES: "-target-feature" "+v8.1a
// MULTIPLE-VALUES-NOT: "-target-feature" "+v8.2a
/// march to compiler and assembler, we choose the one suited to the input file type
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.3-a -march=armv8.4-a %s 2>&1 | \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.3-a -march=armv8.4-a %s 2>&1 | \
// RUN: FileCheck --check-prefix=TARGET-FEATURE-3 %s
-// RUN: %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=armv8.3-a -march=armv8.4-a \
+// RUN: %clang --target=aarch64 -### -c -Wa,-march=armv8.3-a -march=armv8.4-a \
// RUN: %S/Inputs/wildcard1.c 2>&1 | FileCheck --check-prefix=TARGET-FEATURE-4 %s
// TARGET-FEATURE-3: "-target-feature" "+v8.3a"
@@ -46,9 +46,9 @@
// TARGET-FEATURE-4-NOT: "-target-feature" "+v8.3a"
// Invalid -march settings
-// RUN: not %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=all %s 2>&1 | \
+// RUN: not %clang --target=aarch64 -### -c -Wa,-march=all %s 2>&1 | \
// RUN: FileCheck --check-prefix=INVALID-ARCH-1 %s
-// RUN: not %clang --target=aarch64-linux-gnueabi -### -c -Wa,-march=foobar %s 2>&1 | \
+// RUN: not %clang --target=aarch64 -### -c -Wa,-march=foobar %s 2>&1 | \
// RUN: FileCheck --check-prefix=INVALID-ARCH-2 %s
// INVALID-ARCH-1: error: unsupported argument 'all' to option '-march='
diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index 81c22af460d1..3b10444ef71d 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -131,6 +131,11 @@
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1200 -DFAMILY=GFX12
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1201 -DFAMILY=GFX12
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_generic -DFAMILY=GFX9
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx10.1-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_1_generic -DFAMILY=GFX10
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx10.3-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_3_generic -DFAMILY=GFX10
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx11_generic -DFAMILY=GFX11
+
// ARCH-GCN-DAG: #define FP_FAST_FMA 1
// FAST_FMAF-DAG: #define FP_FAST_FMAF 1
diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl
index eeb16ae98eba..6f18ea0615cb 100644
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -115,6 +115,11 @@
// RUN: %clang -### -target amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX1200 %s
// RUN: %clang -### -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefix=GFX1201 %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefix=GFX9_GENERIC %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx10.1-generic %s 2>&1 | FileCheck --check-prefix=GFX10_1_GENERIC %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx10.3-generic %s 2>&1 | FileCheck --check-prefix=GFX10_3_GENERIC %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefix=GFX11_GENERIC %s
+
// GCNDEFAULT-NOT: -target-cpu
// GFX600: "-target-cpu" "gfx600"
// GFX601: "-target-cpu" "gfx601"
@@ -160,3 +165,8 @@
// GFX1151: "-target-cpu" "gfx1151"
// GFX1200: "-target-cpu" "gfx1200"
// GFX1201: "-target-cpu" "gfx1201"
+
+// GFX9_GENERIC: "-target-cpu" "gfx9-generic"
+// GFX10_1_GENERIC: "-target-cpu" "gfx10.1-generic"
+// GFX10_3_GENERIC: "-target-cpu" "gfx10.3-generic"
+// GFX11_GENERIC: "-target-cpu" "gfx11-generic"
diff --git a/clang/test/Driver/arm-alignment.c b/clang/test/Driver/arm-alignment.c
index ba1be29bb7d1..9177b625729b 100644
--- a/clang/test/Driver/arm-alignment.c
+++ b/clang/test/Driver/arm-alignment.c
@@ -22,13 +22,13 @@
// RUN: %clang -target armv7-windows -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -munaligned-access -### %s 2> %t
+// RUN: %clang --target=aarch64 -munaligned-access -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-AARCH64 < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -mstrict-align -munaligned-access -### %s 2> %t
+// RUN: %clang --target=aarch64 -mstrict-align -munaligned-access -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-AARCH64 < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -mno-unaligned-access -munaligned-access -### %s 2> %t
+// RUN: %clang --target=aarch64 -mno-unaligned-access -munaligned-access -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-AARCH64 < %t %s
// CHECK-UNALIGNED-ARM-NOT: "-target-feature" "+strict-align"
@@ -68,19 +68,19 @@
// RUN: %clang -target armv6m-netbsd-eabi -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -mno-unaligned-access -### %s 2> %t
+// RUN: %clang --target=aarch64 -mno-unaligned-access -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -mstrict-align -### %s 2> %t
+// RUN: %clang --target=aarch64 -mstrict-align -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -munaligned-access -mno-unaligned-access -### %s 2> %t
+// RUN: %clang --target=aarch64 -munaligned-access -mno-unaligned-access -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -munaligned-access -mstrict-align -### %s 2> %t
+// RUN: %clang --target=aarch64 -munaligned-access -mstrict-align -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s
-// RUN: %clang -target aarch64-none-gnueabi -mkernel -mno-unaligned-access -### %s 2> %t
+// RUN: %clang --target=aarch64 -mkernel -mno-unaligned-access -### %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s
// RUN: %clang -target aarch64-unknown-openbsd -### %s 2> %t
diff --git a/clang/test/Driver/arm-no-neg-immediates.c b/clang/test/Driver/arm-no-neg-immediates.c
index f1e4d5f7906b..26c70c87a74c 100644
--- a/clang/test/Driver/arm-no-neg-immediates.c
+++ b/clang/test/Driver/arm-no-neg-immediates.c
@@ -1,8 +1,8 @@
// RUN: %clang -target arm-none-gnueabi -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT
// RUN: %clang -target arm-none-gnueabi -mno-neg-immediates -### %s 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64-none-gnueabi -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT
-// RUN: %clang -target aarch64-none-gnueabi -mno-neg-immediates -### %s 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: %clang --target=aarch64 -mno-neg-immediates -### %s 2>&1 | FileCheck %s
// CHECK: "-target-feature" "+no-neg-immediates"
// CHECK-DEFAULT-NOT: "+no-neg-immediates"
diff --git a/clang/test/Driver/hlsl-lang-targets.hlsl b/clang/test/Driver/hlsl-lang-targets.hlsl
index f2f4bba8196b..7ce490a66df5 100644
--- a/clang/test/Driver/hlsl-lang-targets.hlsl
+++ b/clang/test/Driver/hlsl-lang-targets.hlsl
@@ -32,7 +32,7 @@
// Invalid shader stages
//
// RUN: not %clang -target dxil--shadermodel6.2-unknown %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV %s
-// RUN: not %clang -target dxil--shadermodel6.2-invalidenvironment %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV %s
+// RUN: not %clang --target=dxil--shadermodel6.2-invalidenvironment %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV-DRV %s
// RUN: not %clang -target dxil--shadermodel6.2-eabi %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV %s
// RUN: not %clang -target dxil--shadermodel6.2-msvc %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV %s
@@ -47,6 +47,7 @@
// CHECK-BAD-OS: error: shader model '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation
// CHECK-NO-ENV: error: shader stage is required as environment in target '{{.*}}' for HLSL code generation
// CHECK-BAD-ENV: error: shader stage '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation
+// CHECK-BAD-ENV-DRV: error: version '{{.*}}' in target triple '{{.*}}' is invalid
// CHECK-BAD-TARGET: error: HLSL code generation is unsupported for target '{{.*}}'
[shader("pixel")]
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 010001b83d7c..7fd46778ac91 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -172,6 +172,22 @@ __attribute__((visibility("protected"), used)) int x;
// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// RUN: clang-offload-packager -o %t-lib.out \
+// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=generic
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
+// RUN: llvm-ar rcs %t.a %t.o
+// RUN: clang-offload-packager -o %t1.out \
+// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t1.o -fembed-offload-object=%t1.out
+// RUN: clang-offload-packager -o %t2.out \
+// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t2.o -fembed-offload-object=%t2.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
+// RUN: --linker-path=/usr/bin/ld -- %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL
+
+// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+
// RUN: clang-offload-packager -o %t.out \
// RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
// RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index b8efd64cd91f..b3ce5ca307a6 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -1750,10 +1750,10 @@
// RUN: --target=armv7eb-pc-linux-musleabi -mhard-float \
// RUN: | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
// RUN: %clang -### %s -no-pie 2>&1 \
-// RUN: --target=aarch64-pc-linux-musleabi \
+// RUN: --target=aarch64-pc-linux-musl \
// RUN: | FileCheck --check-prefix=CHECK-MUSL-AARCH64 %s
// RUN: %clang -### %s -no-pie 2>&1 \
-// RUN: --target=aarch64_be-pc-linux-musleabi \
+// RUN: --target=aarch64_be-pc-linux-musl \
// RUN: | FileCheck --check-prefix=CHECK-MUSL-AARCH64_BE %s
// CHECK-MUSL-X86: "-dynamic-linker" "/lib/ld-musl-i386.so.1"
// CHECK-MUSL-X86_64: "-dynamic-linker" "/lib/ld-musl-x86_64.so.1"
diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c
index d3700f71aa7e..a108383e29fb 100644
--- a/clang/test/Driver/riscv-features.c
+++ b/clang/test/Driver/riscv-features.c
@@ -27,6 +27,12 @@
// DEFAULT-NOT: "-target-feature" "-save-restore"
// DEFAULT-NOT: "-target-feature" "+save-restore"
+// RUN: %clang --target=riscv32-unknown-elf -### %s -mforced-sw-shadow-stack 2>&1 | FileCheck %s -check-prefix=FORCE-SW-SCS
+// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-forced-sw-shadow-stack 2>&1 | FileCheck %s -check-prefix=NO-FORCE-SW-SCS
+// FORCE-SW-SCS: "-target-feature" "+forced-sw-shadow-stack"
+// NO-FORCE-SW-SCS: "-target-feature" "-forced-sw-shadow-stack"
+// DEFAULT-NOT: "-target-feature" "+forced-sw-shadow-stack"
+
// RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-unaligned-access 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS
// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
diff --git a/clang/test/Driver/sparc-fixed-register.c b/clang/test/Driver/sparc-fixed-register.c
new file mode 100644
index 000000000000..24880b9c9d86
--- /dev/null
+++ b/clang/test/Driver/sparc-fixed-register.c
@@ -0,0 +1,181 @@
+// RUN: %clang --target=sparc-none-gnu -ffixed-g1 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G1 < %t %s
+// CHECK-FIXED-G1: "-target-feature" "+reserve-g1"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-g2 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G2 < %t %s
+// CHECK-FIXED-G2: "-target-feature" "+reserve-g2"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-g3 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G3 < %t %s
+// CHECK-FIXED-G3: "-target-feature" "+reserve-g3"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-g4 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G4 < %t %s
+// CHECK-FIXED-G4: "-target-feature" "+reserve-g4"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-g5 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G5 < %t %s
+// CHECK-FIXED-G5: "-target-feature" "+reserve-g5"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-g6 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G6 < %t %s
+// CHECK-FIXED-G6: "-target-feature" "+reserve-g6"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-g7 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-G7 < %t %s
+// CHECK-FIXED-G7: "-target-feature" "+reserve-g7"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-o0 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-O0 < %t %s
+// CHECK-FIXED-O0: "-target-feature" "+reserve-o0"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-o1 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-O1 < %t %s
+// CHECK-FIXED-O1: "-target-feature" "+reserve-o1"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-o2 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-O2 < %t %s
+// CHECK-FIXED-O2: "-target-feature" "+reserve-o2"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-o3 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-O3 < %t %s
+// CHECK-FIXED-O3: "-target-feature" "+reserve-o3"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-o4 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-O4 < %t %s
+// CHECK-FIXED-O4: "-target-feature" "+reserve-o4"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-o5 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-O5 < %t %s
+// CHECK-FIXED-O5: "-target-feature" "+reserve-o5"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l0 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L0 < %t %s
+// CHECK-FIXED-L0: "-target-feature" "+reserve-l0"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l1 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L1 < %t %s
+// CHECK-FIXED-L1: "-target-feature" "+reserve-l1"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l2 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L2 < %t %s
+// CHECK-FIXED-L2: "-target-feature" "+reserve-l2"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l3 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L3 < %t %s
+// CHECK-FIXED-L3: "-target-feature" "+reserve-l3"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l4 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L4 < %t %s
+// CHECK-FIXED-L4: "-target-feature" "+reserve-l4"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l5 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L5 < %t %s
+// CHECK-FIXED-L5: "-target-feature" "+reserve-l5"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l6 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L6 < %t %s
+// CHECK-FIXED-L6: "-target-feature" "+reserve-l6"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-l7 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-L7 < %t %s
+// CHECK-FIXED-L7: "-target-feature" "+reserve-l7"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-i0 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-I0 < %t %s
+// CHECK-FIXED-I0: "-target-feature" "+reserve-i0"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-i1 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-I1 < %t %s
+// CHECK-FIXED-I1: "-target-feature" "+reserve-i1"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-i2 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-I2 < %t %s
+// CHECK-FIXED-I2: "-target-feature" "+reserve-i2"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-i3 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-I3 < %t %s
+// CHECK-FIXED-I3: "-target-feature" "+reserve-i3"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-i4 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-I4 < %t %s
+// CHECK-FIXED-I4: "-target-feature" "+reserve-i4"
+
+// RUN: %clang --target=sparc-none-gnu -ffixed-i5 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-FIXED-I5 < %t %s
+// CHECK-FIXED-I5: "-target-feature" "+reserve-i5"
+
+// Test multiple of reserve-* options together.
+// RUN: %clang --target=sparc-none-gnu \
+// RUN: -ffixed-g1 \
+// RUN: -ffixed-o2 \
+// RUN: -ffixed-l3 \
+// RUN: -ffixed-i4 \
+// RUN: -### %s 2> %t
+// RUN: FileCheck \
+// RUN: --check-prefix=CHECK-FIXED-G1 \
+// RUN: --check-prefix=CHECK-FIXED-O2 \
+// RUN: --check-prefix=CHECK-FIXED-L3 \
+// RUN: --check-prefix=CHECK-FIXED-I4 \
+// RUN: < %t %s
+
+// Test all reserve-* options together.
+// RUN: %clang --target=sparc-none-gnu \
+// RUN: -ffixed-g1 \
+// RUN: -ffixed-g2 \
+// RUN: -ffixed-g3 \
+// RUN: -ffixed-g4 \
+// RUN: -ffixed-g5 \
+// RUN: -ffixed-g6 \
+// RUN: -ffixed-g7 \
+// RUN: -ffixed-o0 \
+// RUN: -ffixed-o1 \
+// RUN: -ffixed-o2 \
+// RUN: -ffixed-o3 \
+// RUN: -ffixed-o4 \
+// RUN: -ffixed-o5 \
+// RUN: -ffixed-l0 \
+// RUN: -ffixed-l1 \
+// RUN: -ffixed-l2 \
+// RUN: -ffixed-l3 \
+// RUN: -ffixed-l4 \
+// RUN: -ffixed-l5 \
+// RUN: -ffixed-l6 \
+// RUN: -ffixed-l7 \
+// RUN: -ffixed-i0 \
+// RUN: -ffixed-i1 \
+// RUN: -ffixed-i2 \
+// RUN: -ffixed-i3 \
+// RUN: -ffixed-i4 \
+// RUN: -ffixed-i5 \
+// RUN: -### %s 2> %t
+// RUN: FileCheck \
+// RUN: --check-prefix=CHECK-FIXED-G1 \
+// RUN: --check-prefix=CHECK-FIXED-G2 \
+// RUN: --check-prefix=CHECK-FIXED-G3 \
+// RUN: --check-prefix=CHECK-FIXED-G4 \
+// RUN: --check-prefix=CHECK-FIXED-G5 \
+// RUN: --check-prefix=CHECK-FIXED-G6 \
+// RUN: --check-prefix=CHECK-FIXED-G7 \
+// RUN: --check-prefix=CHECK-FIXED-O0 \
+// RUN: --check-prefix=CHECK-FIXED-O1 \
+// RUN: --check-prefix=CHECK-FIXED-O2 \
+// RUN: --check-prefix=CHECK-FIXED-O3 \
+// RUN: --check-prefix=CHECK-FIXED-O4 \
+// RUN: --check-prefix=CHECK-FIXED-O5 \
+// RUN: --check-prefix=CHECK-FIXED-L0 \
+// RUN: --check-prefix=CHECK-FIXED-L1 \
+// RUN: --check-prefix=CHECK-FIXED-L2 \
+// RUN: --check-prefix=CHECK-FIXED-L3 \
+// RUN: --check-prefix=CHECK-FIXED-L4 \
+// RUN: --check-prefix=CHECK-FIXED-L5 \
+// RUN: --check-prefix=CHECK-FIXED-L6 \
+// RUN: --check-prefix=CHECK-FIXED-L7 \
+// RUN: --check-prefix=CHECK-FIXED-I0 \
+// RUN: --check-prefix=CHECK-FIXED-I1 \
+// RUN: --check-prefix=CHECK-FIXED-I2 \
+// RUN: --check-prefix=CHECK-FIXED-I3 \
+// RUN: --check-prefix=CHECK-FIXED-I4 \
+// RUN: --check-prefix=CHECK-FIXED-I5 \
+// RUN: < %t %s
diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c
index 4e105ce3cea5..f73915b28ec2 100644
--- a/clang/test/Driver/tls-dialect.c
+++ b/clang/test/Driver/tls-dialect.c
@@ -3,6 +3,10 @@
// RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s
// RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s
+/// Android supports TLSDESC by default on RISC-V
+/// TLSDESC is not on by default in Linux, even on RISC-V, and is covered above
+// RUN: %clang -### --target=riscv64-android %s 2>&1 | FileCheck --check-prefix=DESC %s
+
/// LTO
// RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s
// RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s
diff --git a/clang/test/Driver/xros-driver-requires-darwin-host.c b/clang/test/Driver/xros-driver-requires-darwin-host.c
new file mode 100644
index 000000000000..e5bfccae2c20
--- /dev/null
+++ b/clang/test/Driver/xros-driver-requires-darwin-host.c
@@ -0,0 +1,13 @@
+// REQUIRES: system-darwin
+
+// RUN: env XROS_DEPLOYMENT_TARGET=1.0 %clang -arch arm64 -c -### %s 2>&1 | FileCheck %s
+
+// RUN: rm -rf %t.dir
+// RUN: mkdir -p %t.dir/XROS1.0.sdk
+// RUN: %clang -arch arm64 -isysroot %t.dir/XROS1.0.sdk -c -### %s 2>&1 | FileCheck %s
+// RUN: mkdir -p %t.dir/XRSimulator1.0.sdk
+// RUN: %clang -arch arm64 -isysroot %t.dir/XRSimulator1.0.sdk -c -### %s 2>&1 | FileCheck --check-prefix=CHECK_SIM %s
+
+
+// CHECK: "-cc1"{{.*}} "-triple" "arm64-apple-xros1.0.0"
+// CHECK_SIM: "-cc1"{{.*}} "-triple" "arm64-apple-xros1.0.0-simulator"
diff --git a/clang/test/Format/dump-config-objc-stdin.m b/clang/test/Format/dump-config-objc-stdin.m
index b22ff7b3328c..d81711a84d79 100644
--- a/clang/test/Format/dump-config-objc-stdin.m
+++ b/clang/test/Format/dump-config-objc-stdin.m
@@ -1,5 +1,8 @@
+// RUN: clang-format -assume-filename=foo.m -dump-config | FileCheck %s
+
// RUN: clang-format -dump-config - < %s | FileCheck %s
// CHECK: Language: ObjC
+
@interface Foo
@end
diff --git a/clang/test/Format/verbose.cpp b/clang/test/Format/verbose.cpp
index dd625e3f67e5..4ab03d8f62ae 100644
--- a/clang/test/Format/verbose.cpp
+++ b/clang/test/Format/verbose.cpp
@@ -1,12 +1,6 @@
-// RUN: clang-format %s 2> %t.stderr
+// RUN: clang-format -verbose 2> %t.stderr
// RUN: not grep "Formatting" %t.stderr
-// RUN: clang-format %s -verbose 2> %t.stderr
-// RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr
-// RUN: clang-format %s -verbose=false 2> %t.stderr
-// RUN: not grep "Formatting" %t.stderr
-
-int a;
-// RUN: clang-format %s 2> %t.stderr
+// RUN: clang-format %s 2> %t.stderr
// RUN: not grep "Formatting" %t.stderr
// RUN: clang-format %s -verbose 2> %t.stderr
// RUN: grep -E "Formatting (.*)verbose.cpp(.*)" %t.stderr
diff --git a/clang/test/Frontend/embed-bitcode.ll b/clang/test/Frontend/embed-bitcode.ll
index defb2d12f2f0..9b8632d04dd9 100644
--- a/clang/test/Frontend/embed-bitcode.ll
+++ b/clang/test/Frontend/embed-bitcode.ll
@@ -7,7 +7,7 @@
; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
; RUN: -fembed-bitcode=marker -x ir %s -o - \
; RUN: | FileCheck %s -check-prefix=CHECK-MARKER
-; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnueabi -emit-llvm \
+; RUN: %clang_cc1 -triple aarch64 -emit-llvm \
; RUN: -fembed-bitcode=all -x ir %s -o - \
; RUN: | FileCheck %s -check-prefix=CHECK-ELF
diff --git a/clang/test/Frontend/gnu-mcount.c b/clang/test/Frontend/gnu-mcount.c
index e54983ca356a..a6ee4b274593 100644
--- a/clang/test/Frontend/gnu-mcount.c
+++ b/clang/test/Frontend/gnu-mcount.c
@@ -2,36 +2,29 @@
// RUN: %clang -target armv7-unknown-none-eabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,UNSUPPORTED
// RUN: %clang -target armv7-unknown-none-eabi -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNSUPPORTED
-// RUN: %clang -target aarch64-unknown-none-gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,MCOUNT
-// RUN: %clang -target aarch64-unknown-none-gnu -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER
+// RUN: %clang --target=aarch64-unknown-none-gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,MCOUNT
// RUN: %clang -target armv7-unknown-linux-gnueabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
// RUN: %clang -target armv7-unknown-linux-gnueabi -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
-// RUN: %clang -target aarch64-unknown-linux-gnueabi -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER
-// RUN: %clang -target aarch64-unknown-linux-gnueabi -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER
+// RUN: %clang --target=aarch64-unknown-linux -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER
// RUN: %clang -target armv7-unknown-linux-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
// RUN: %clang -target armv7-unknown-linux-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
-// RUN: %clang -target aarch64-unknown-linux-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER
-// RUN: %clang -target aarch64-unknown-linux-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER
// RUN: %clang -target armv7-unknown-freebsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
// RUN: %clang -target armv7-unknown-freebsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
-// RUN: %clang -target aarch64-unknown-freebsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
-// RUN: %clang -target aarch64-unknown-freebsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
+// RUN: %clang --target=aarch64-unknown-freebsd -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
// RUN: %clang -target armv7-unknown-openbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix UNDER_UNDER
// RUN: %clang -target armv7-unknown-openbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix UNDER_UNDER
-// RUN: %clang -target aarch64-unknown-openbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix UNDER_UNDER
-// RUN: %clang -target aarch64-unknown-openbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
+// RUN: %clang --target=aarch64-unknown-openbsd -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix UNDER_UNDER
// RUN: %clang -target armv7-unknown-netbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
// RUN: %clang -target armv7-unknown-netbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
-// RUN: %clang -target aarch64-unknown-netbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
-// RUN: %clang -target aarch64-unknown-netbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
+// RUN: %clang --target=aarch64-unknown-netbsd -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNDER_UNDER
// RUN: %clang -target armv7-apple-ios -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNSUPPORTED
// RUN: %clang -target armv7-apple-ios -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNSUPPORTED
// RUN: %clang -target arm64-apple-ios -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNSUPPORTED
// RUN: %clang -target arm64-apple-ios -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,UNSUPPORTED
// RUN: %clang -target armv7-unknown-rtems-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,MCOUNT
// RUN: %clang -target armv7-unknown-rtems-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,MCOUNT
-// RUN: %clang -target aarch64-unknown-rtems-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,MCOUNT
-// RUN: %clang -target aarch64-unknown-rtems-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,MCOUNT
+// RUN: %clang --target=aarch64-unknown-rtems -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,MCOUNT
+// RUN: %clang --target=aarch64-unknown-rtems -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,MCOUNT
int f() {
return 0;
diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip
index c194f4437890..cd085fdb5039 100644
--- a/clang/test/Headers/__clang_hip_cmath.hip
+++ b/clang/test/Headers/__clang_hip_cmath.hip
@@ -61,13 +61,13 @@ extern "C" __device__ float test_fabs_f32(float x) {
// DEFAULT-LABEL: @test_sin_f32(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]]
-// DEFAULT-NEXT: ret float [[CALL_I_I]]
+// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]]
+// DEFAULT-NEXT: ret float [[CALL_I1]]
//
// FINITEONLY-LABEL: @test_sin_f32(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]]
-// FINITEONLY-NEXT: ret float [[CALL_I_I]]
+// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]]
+// FINITEONLY-NEXT: ret float [[CALL_I1]]
//
extern "C" __device__ float test_sin_f32(float x) {
return sin(x);
@@ -75,13 +75,13 @@ extern "C" __device__ float test_sin_f32(float x) {
// DEFAULT-LABEL: @test_cos_f32(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]]
-// DEFAULT-NEXT: ret float [[CALL_I_I]]
+// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]]
+// DEFAULT-NEXT: ret float [[CALL_I1]]
//
// FINITEONLY-LABEL: @test_cos_f32(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]]
-// FINITEONLY-NEXT: ret float [[CALL_I_I]]
+// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]]
+// FINITEONLY-NEXT: ret float [[CALL_I1]]
//
extern "C" __device__ float test_cos_f32(float x) {
return cos(x);
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 5230c360d0be..37099de74fb8 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -258,17 +258,17 @@ extern "C" __device__ long long test_llabs(long x) {
// DEFAULT-LABEL: @test_acosf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_acosf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12:[0-9]+]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_acosf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_acosf(float x) {
@@ -277,17 +277,17 @@ extern "C" __device__ float test_acosf(float x) {
// DEFAULT-LABEL: @test_acos(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_acos(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_acos(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_acos(double x) {
@@ -296,17 +296,17 @@ extern "C" __device__ double test_acos(double x) {
// DEFAULT-LABEL: @test_acoshf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_acoshf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15:[0-9]+]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13:[0-9]+]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_acoshf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_acoshf(float x) {
@@ -315,17 +315,17 @@ extern "C" __device__ float test_acoshf(float x) {
// DEFAULT-LABEL: @test_acosh(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_acosh(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_acosh(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_acosh(double x) {
@@ -334,17 +334,17 @@ extern "C" __device__ double test_acosh(double x) {
// DEFAULT-LABEL: @test_asinf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_asinf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_asinf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_asinf(float x) {
@@ -353,17 +353,17 @@ extern "C" __device__ float test_asinf(float x) {
// DEFAULT-LABEL: @test_asin(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_asin(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_asin(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_asin(double x) {
@@ -373,17 +373,17 @@ extern "C" __device__ double test_asin(double x) {
// DEFAULT-LABEL: @test_asinhf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_asinhf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_asinhf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_asinhf(float x) {
@@ -392,17 +392,17 @@ extern "C" __device__ float test_asinhf(float x) {
// DEFAULT-LABEL: @test_asinh(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_asinh(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_asinh(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_asinh(double x) {
@@ -411,17 +411,17 @@ extern "C" __device__ double test_asinh(double x) {
// DEFAULT-LABEL: @test_atan2f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_atan2f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_atan2f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_atan2f(float x, float y) {
@@ -430,17 +430,17 @@ extern "C" __device__ float test_atan2f(float x, float y) {
// DEFAULT-LABEL: @test_atan2(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_atan2(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_atan2(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_atan2(double x, double y) {
@@ -449,17 +449,17 @@ extern "C" __device__ double test_atan2(double x, double y) {
// DEFAULT-LABEL: @test_atanf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_atanf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_atanf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_atanf(float x) {
@@ -468,17 +468,17 @@ extern "C" __device__ float test_atanf(float x) {
// DEFAULT-LABEL: @test_atan(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_atan(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_atan(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_atan(double x) {
@@ -487,17 +487,17 @@ extern "C" __device__ double test_atan(double x) {
// DEFAULT-LABEL: @test_atanhf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_atanhf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_atanhf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_atanhf(float x) {
@@ -506,17 +506,17 @@ extern "C" __device__ float test_atanhf(float x) {
// DEFAULT-LABEL: @test_atanh(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_atanh(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_atanh(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_atanh(double x) {
@@ -525,17 +525,17 @@ extern "C" __device__ double test_atanh(double x) {
// DEFAULT-LABEL: @test_cbrtf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cbrtf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_cbrtf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_cbrtf(float x) {
@@ -544,17 +544,17 @@ extern "C" __device__ float test_cbrtf(float x) {
// DEFAULT-LABEL: @test_cbrt(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cbrt(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_cbrt(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_cbrt(double x) {
@@ -639,17 +639,17 @@ extern "C" __device__ double test_copysign(double x, double y) {
// DEFAULT-LABEL: @test_cosf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cosf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16:[0-9]+]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_cosf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]]
+// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
// APPROX-NEXT: ret float [[CALL_I1]]
//
extern "C" __device__ float test_cosf(float x) {
@@ -658,17 +658,17 @@ extern "C" __device__ float test_cosf(float x) {
// DEFAULT-LABEL: @test_cos(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cos(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_cos(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_cos(double x) {
@@ -677,17 +677,17 @@ extern "C" __device__ double test_cos(double x) {
// DEFAULT-LABEL: @test_coshf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_coshf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_coshf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_coshf(float x) {
@@ -696,17 +696,17 @@ extern "C" __device__ float test_coshf(float x) {
// DEFAULT-LABEL: @test_cosh(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cosh(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_cosh(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_cosh(double x) {
@@ -715,17 +715,17 @@ extern "C" __device__ double test_cosh(double x) {
// DEFAULT-LABEL: @test_cospif(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cospif(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_cospif(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_cospif(float x) {
@@ -734,17 +734,17 @@ extern "C" __device__ float test_cospif(float x) {
// DEFAULT-LABEL: @test_cospi(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cospi(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_cospi(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_cospi(double x) {
@@ -753,17 +753,17 @@ extern "C" __device__ double test_cospi(double x) {
// DEFAULT-LABEL: @test_cyl_bessel_i0f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cyl_bessel_i0f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_cyl_bessel_i0f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_cyl_bessel_i0f(float x) {
@@ -772,17 +772,17 @@ extern "C" __device__ float test_cyl_bessel_i0f(float x) {
// DEFAULT-LABEL: @test_cyl_bessel_i0(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cyl_bessel_i0(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_cyl_bessel_i0(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_cyl_bessel_i0(double x) {
@@ -791,17 +791,17 @@ extern "C" __device__ double test_cyl_bessel_i0(double x) {
// DEFAULT-LABEL: @test_cyl_bessel_i1f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cyl_bessel_i1f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_cyl_bessel_i1f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_cyl_bessel_i1f(float x) {
@@ -810,17 +810,17 @@ extern "C" __device__ float test_cyl_bessel_i1f(float x) {
// DEFAULT-LABEL: @test_cyl_bessel_i1(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_cyl_bessel_i1(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_cyl_bessel_i1(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_cyl_bessel_i1(double x) {
@@ -829,17 +829,17 @@ extern "C" __device__ double test_cyl_bessel_i1(double x) {
// DEFAULT-LABEL: @test_erfcf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_erfcf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_erfcf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_erfcf(float x) {
@@ -848,17 +848,17 @@ extern "C" __device__ float test_erfcf(float x) {
// DEFAULT-LABEL: @test_erfc(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_erfc(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_erfc(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_erfc(double x) {
@@ -867,17 +867,17 @@ extern "C" __device__ double test_erfc(double x) {
// DEFAULT-LABEL: @test_erfinvf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_erfinvf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_erfinvf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_erfinvf(float x) {
@@ -886,17 +886,17 @@ extern "C" __device__ float test_erfinvf(float x) {
// DEFAULT-LABEL: @test_erfinv(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_erfinv(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_erfinv(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_erfinv(double x) {
@@ -905,17 +905,17 @@ extern "C" __device__ double test_erfinv(double x) {
// DEFAULT-LABEL: @test_exp10f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_exp10f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_exp10f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_exp10f(float x) {
@@ -924,17 +924,17 @@ extern "C" __device__ float test_exp10f(float x) {
// DEFAULT-LABEL: @test_exp10(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_exp10(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_exp10(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_exp10(double x) {
@@ -962,17 +962,17 @@ extern "C" __device__ float test_exp2f(float x) {
// DEFAULT-LABEL: @test_exp2(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_exp2(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_exp2(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_exp2(double x) {
@@ -1000,17 +1000,17 @@ extern "C" __device__ float test_expf(float x) {
// DEFAULT-LABEL: @test_exp(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_exp(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_exp(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_exp(double x) {
@@ -1019,17 +1019,17 @@ extern "C" __device__ double test_exp(double x) {
// DEFAULT-LABEL: @test_expm1f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_expm1f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_expm1f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_expm1f(float x) {
@@ -1038,17 +1038,17 @@ extern "C" __device__ float test_expm1f(float x) {
// DEFAULT-LABEL: @test_expm1(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_expm1(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_expm1(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_expm1(double x) {
@@ -1095,17 +1095,17 @@ extern "C" __device__ double test_fabs(double x) {
// DEFAULT-LABEL: @test_fdimf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_fdimf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_fdimf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_fdimf(float x, float y) {
@@ -1114,17 +1114,17 @@ extern "C" __device__ float test_fdimf(float x, float y) {
// DEFAULT-LABEL: @test_fdim(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_fdim(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_fdim(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_fdim(double x, double y) {
@@ -1323,17 +1323,17 @@ extern "C" __device__ double test_fmin(double x, double y) {
// DEFAULT-LABEL: @test_fmodf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_fmodf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_fmodf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_fmodf(float x, float y) {
@@ -1342,17 +1342,17 @@ extern "C" __device__ float test_fmodf(float x, float y) {
// DEFAULT-LABEL: @test_fmod(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_fmod(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_fmod(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_fmod(double x, double y) {
@@ -1385,17 +1385,17 @@ extern "C" __device__ double test_frexp(double x, int* y) {
// DEFAULT-LABEL: @test_hypotf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_hypotf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_hypotf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_hypotf(float x, float y) {
@@ -1404,17 +1404,17 @@ extern "C" __device__ float test_hypotf(float x, float y) {
// DEFAULT-LABEL: @test_hypot(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_hypot(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_hypot(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_hypot(double x, double y) {
@@ -1423,17 +1423,17 @@ extern "C" __device__ double test_hypot(double x, double y) {
// DEFAULT-LABEL: @test_ilogbf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret i32 [[CALL_I]]
//
// FINITEONLY-LABEL: @test_ilogbf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret i32 [[CALL_I]]
//
// APPROX-LABEL: @test_ilogbf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret i32 [[CALL_I]]
//
extern "C" __device__ int test_ilogbf(float x) {
@@ -1442,17 +1442,17 @@ extern "C" __device__ int test_ilogbf(float x) {
// DEFAULT-LABEL: @test_ilogb(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret i32 [[CALL_I]]
//
// FINITEONLY-LABEL: @test_ilogb(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret i32 [[CALL_I]]
//
// APPROX-LABEL: @test_ilogb(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret i32 [[CALL_I]]
//
extern "C" __device__ int test_ilogb(double x) {
@@ -1589,17 +1589,17 @@ extern "C" __device__ BOOL_TYPE test___isnan(double x) {
// DEFAULT-LABEL: @test_j0f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_j0f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_j0f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_j0f(float x) {
@@ -1608,17 +1608,17 @@ extern "C" __device__ float test_j0f(float x) {
// DEFAULT-LABEL: @test_j0(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_j0(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_j0(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_j0(double x) {
@@ -1627,17 +1627,17 @@ extern "C" __device__ double test_j0(double x) {
// DEFAULT-LABEL: @test_j1f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_j1f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_j1f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_j1f(float x) {
@@ -1646,17 +1646,17 @@ extern "C" __device__ float test_j1f(float x) {
// DEFAULT-LABEL: @test_j1(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_j1(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_j1(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_j1(double x) {
@@ -1670,14 +1670,14 @@ extern "C" __device__ double test_j1(double x) {
// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// DEFAULT-NEXT: ]
// DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
// DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL3JNFIF_EXIT]]
// DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
// DEFAULT: for.body.i:
@@ -1703,14 +1703,14 @@ extern "C" __device__ double test_j1(double x) {
// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// FINITEONLY-NEXT: ]
// FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
// FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL3JNFIF_EXIT]]
// FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
// FINITEONLY: for.body.i:
@@ -1736,14 +1736,14 @@ extern "C" __device__ double test_j1(double x) {
// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// APPROX-NEXT: ]
// APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]]
// APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL3JNFIF_EXIT]]
// APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
// APPROX: for.body.i:
@@ -1773,14 +1773,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// DEFAULT-NEXT: ]
// DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
// DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL2JNID_EXIT]]
// DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
// DEFAULT: for.body.i:
@@ -1806,14 +1806,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// FINITEONLY-NEXT: ]
// FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
// FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL2JNID_EXIT]]
// FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
// FINITEONLY: for.body.i:
@@ -1839,14 +1839,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// APPROX-NEXT: ]
// APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL2JNID_EXIT:%.*]]
// APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL2JNID_EXIT]]
// APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
// APPROX: for.body.i:
@@ -1909,17 +1909,17 @@ extern "C" __device__ double test_ldexp(double x, int y) {
// DEFAULT-LABEL: @test_lgammaf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_lgammaf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_lgammaf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_lgammaf(float x) {
@@ -1928,17 +1928,17 @@ extern "C" __device__ float test_lgammaf(float x) {
// DEFAULT-LABEL: @test_lgamma(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_lgamma(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_lgamma(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_lgamma(double x) {
@@ -2054,17 +2054,17 @@ extern "C" __device__ float test_log10f(float x) {
// DEFAULT-LABEL: @test_log10(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_log10(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_log10(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_log10(double x) {
@@ -2073,17 +2073,17 @@ extern "C" __device__ double test_log10(double x) {
// DEFAULT-LABEL: @test_log1pf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_log1pf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_log1pf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_log1pf(float x) {
@@ -2092,17 +2092,17 @@ extern "C" __device__ float test_log1pf(float x) {
// DEFAULT-LABEL: @test_log1p(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_log1p(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_log1p(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_log1p(double x) {
@@ -2111,12 +2111,12 @@ extern "C" __device__ double test_log1p(double x) {
// DEFAULT-LABEL: @test_log2f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_log2f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_log2f(
@@ -2130,17 +2130,17 @@ extern "C" __device__ float test_log2f(float x) {
// DEFAULT-LABEL: @test_log2(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_log2(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_log2(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_log2(double x) {
@@ -2149,17 +2149,17 @@ extern "C" __device__ double test_log2(double x) {
// DEFAULT-LABEL: @test_logbf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_logbf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_logbf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_logbf(float x) {
@@ -2168,17 +2168,17 @@ extern "C" __device__ float test_logbf(float x) {
// DEFAULT-LABEL: @test_logb(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_logb(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_logb(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_logb(double x) {
@@ -2187,12 +2187,12 @@ extern "C" __device__ double test_logb(double x) {
// DEFAULT-LABEL: @test_logf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_logf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_logf(
@@ -2295,31 +2295,31 @@ extern "C" __device__ long int test_lround(double x) {
// DEFAULT-LABEL: @test_modff(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
// DEFAULT-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_modff(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_modff(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15:[0-9]+]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
// APPROX-NEXT: store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_modff(float x, float* y) {
@@ -2329,31 +2329,31 @@ extern "C" __device__ float test_modff(float x, float* y) {
// DEFAULT-LABEL: @test_modf(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
// DEFAULT-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_modf(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_modf(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
// APPROX-NEXT: store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_modf(double x, double* y) {
@@ -2557,33 +2557,65 @@ extern "C" __device__ double test_nan(const char *tag) {
return nan(tag);
}
-// CHECK-LABEL: @test_nanf_emptystr(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret float 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nanf_emptystr(
+// DEFAULT-NEXT: entry:
+// DEFAULT-NEXT: ret float 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nanf_emptystr(
+// FINITEONLY-NEXT: entry:
+// FINITEONLY-NEXT: ret float poison
+//
+// APPROX-LABEL: @test_nanf_emptystr(
+// APPROX-NEXT: entry:
+// APPROX-NEXT: ret float 0x7FF8000000000000
//
extern "C" __device__ float test_nanf_emptystr() {
return nanf("");
}
-// CHECK-LABEL: @test_nan_emptystr(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret double 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nan_emptystr(
+// DEFAULT-NEXT: entry:
+// DEFAULT-NEXT: ret double 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nan_emptystr(
+// FINITEONLY-NEXT: entry:
+// FINITEONLY-NEXT: ret double poison
+//
+// APPROX-LABEL: @test_nan_emptystr(
+// APPROX-NEXT: entry:
+// APPROX-NEXT: ret double 0x7FF8000000000000
//
extern "C" __device__ double test_nan_emptystr() {
return nan("");
}
-// CHECK-LABEL: @test_nanf_fill(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret float 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nanf_fill(
+// DEFAULT-NEXT: entry:
+// DEFAULT-NEXT: ret float 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nanf_fill(
+// FINITEONLY-NEXT: entry:
+// FINITEONLY-NEXT: ret float poison
+//
+// APPROX-LABEL: @test_nanf_fill(
+// APPROX-NEXT: entry:
+// APPROX-NEXT: ret float 0x7FF8000000000000
//
extern "C" __device__ float test_nanf_fill() {
return nanf("0x456");
}
-// CHECK-LABEL: @test_nan_fill(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret double 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nan_fill(
+// DEFAULT-NEXT: entry:
+// DEFAULT-NEXT: ret double 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nan_fill(
+// FINITEONLY-NEXT: entry:
+// FINITEONLY-NEXT: ret double poison
+//
+// APPROX-LABEL: @test_nan_fill(
+// APPROX-NEXT: entry:
+// APPROX-NEXT: ret double 0x7FF8000000000000
//
extern "C" __device__ double test_nan_fill() {
return nan("0x123");
@@ -2629,17 +2661,17 @@ extern "C" __device__ double test_nearbyint(double x) {
// DEFAULT-LABEL: @test_nextafterf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_nextafterf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_nextafterf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_nextafterf(float x, float y) {
@@ -2648,17 +2680,17 @@ extern "C" __device__ float test_nextafterf(float x, float y) {
// DEFAULT-LABEL: @test_nextafter(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_nextafter(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_nextafter(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_nextafter(double x, double y) {
@@ -2667,17 +2699,17 @@ extern "C" __device__ double test_nextafter(double x, double y) {
// DEFAULT-LABEL: @test_norm3df(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_norm3df(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_norm3df(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_norm3df(float x, float y, float z) {
@@ -2686,17 +2718,17 @@ extern "C" __device__ float test_norm3df(float x, float y, float z) {
// DEFAULT-LABEL: @test_norm3d(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_norm3d(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_norm3d(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_norm3d(double x, double y, double z) {
@@ -2705,17 +2737,17 @@ extern "C" __device__ double test_norm3d(double x, double y, double z) {
// DEFAULT-LABEL: @test_norm4df(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_norm4df(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_norm4df(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
@@ -2724,17 +2756,17 @@ extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
// DEFAULT-LABEL: @test_norm4d(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_norm4d(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_norm4d(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_norm4d(double x, double y, double z, double w) {
@@ -2743,17 +2775,17 @@ extern "C" __device__ double test_norm4d(double x, double y, double z, double w)
// DEFAULT-LABEL: @test_normcdff(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_normcdff(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_normcdff(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_normcdff(float x) {
@@ -2762,17 +2794,17 @@ extern "C" __device__ float test_normcdff(float x) {
// DEFAULT-LABEL: @test_normcdf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_normcdf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_normcdf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_normcdf(double x) {
@@ -2781,17 +2813,17 @@ extern "C" __device__ double test_normcdf(double x) {
// DEFAULT-LABEL: @test_normcdfinvf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_normcdfinvf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_normcdfinvf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_normcdfinvf(float x) {
@@ -2800,17 +2832,17 @@ extern "C" __device__ float test_normcdfinvf(float x) {
// DEFAULT-LABEL: @test_normcdfinv(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_normcdfinv(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_normcdfinv(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_normcdfinv(double x) {
@@ -2947,17 +2979,17 @@ extern "C" __device__ double test_norm(int x, const double *y) {
// DEFAULT-LABEL: @test_powf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_powf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_powf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_powf(float x, float y) {
@@ -2966,17 +2998,17 @@ extern "C" __device__ float test_powf(float x, float y) {
// DEFAULT-LABEL: @test_pow(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_pow(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_pow(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_pow(double x, double y) {
@@ -2985,17 +3017,17 @@ extern "C" __device__ double test_pow(double x, double y) {
// DEFAULT-LABEL: @test_powif(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_powif(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_powif(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_powif(float x, int y) {
@@ -3004,17 +3036,17 @@ extern "C" __device__ float test_powif(float x, int y) {
// DEFAULT-LABEL: @test_powi(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_powi(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_powi(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_powi(double x, int y) {
@@ -3023,17 +3055,17 @@ extern "C" __device__ double test_powi(double x, int y) {
// DEFAULT-LABEL: @test_rcbrtf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rcbrtf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_rcbrtf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_rcbrtf(float x) {
@@ -3042,17 +3074,17 @@ extern "C" __device__ float test_rcbrtf(float x) {
// DEFAULT-LABEL: @test_rcbrt(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rcbrt(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_rcbrt(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_rcbrt(double x) {
@@ -3061,17 +3093,17 @@ extern "C" __device__ double test_rcbrt(double x) {
// DEFAULT-LABEL: @test_remainderf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_remainderf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_remainderf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_remainderf(float x, float y) {
@@ -3080,17 +3112,17 @@ extern "C" __device__ float test_remainderf(float x, float y) {
// DEFAULT-LABEL: @test_remainder(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_remainder(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_remainder(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_remainder(double x, double y) {
@@ -3100,31 +3132,31 @@ extern "C" __device__ double test_remainder(double x, double y) {
// DEFAULT-LABEL: @test_remquof(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_remquof(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_remquof(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_remquof(float x, float y, int* z) {
@@ -3134,31 +3166,31 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) {
// DEFAULT-LABEL: @test_remquo(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_remquo(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_remquo(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_remquo(double x, double y, int* z) {
@@ -3167,17 +3199,17 @@ extern "C" __device__ double test_remquo(double x, double y, int* z) {
// DEFAULT-LABEL: @test_rhypotf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rhypotf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_rhypotf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_rhypotf(float x, float y) {
@@ -3186,17 +3218,17 @@ extern "C" __device__ float test_rhypotf(float x, float y) {
// DEFAULT-LABEL: @test_rhypot(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rhypot(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_rhypot(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_rhypot(double x, double y) {
@@ -3258,7 +3290,7 @@ extern "C" __device__ double test_rint(double x) {
// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]]
// DEFAULT: _ZL6rnormfiPKf.exit:
// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rnormf(
@@ -3278,7 +3310,7 @@ extern "C" __device__ double test_rint(double x) {
// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]]
// FINITEONLY: _ZL6rnormfiPKf.exit:
// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_rnormf(
@@ -3298,7 +3330,7 @@ extern "C" __device__ double test_rint(double x) {
// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]]
// APPROX: _ZL6rnormfiPKf.exit:
// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_I_LCSSA]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_rnormf(int x, const float* y) {
@@ -3322,7 +3354,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) {
// DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]]
// DEFAULT: _ZL5rnormiPKd.exit:
// DEFAULT-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rnorm(
@@ -3342,7 +3374,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) {
// FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]]
// FINITEONLY: _ZL5rnormiPKd.exit:
// FINITEONLY-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_I_LCSSA]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_rnorm(
@@ -3362,7 +3394,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) {
// APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]]
// APPROX: _ZL5rnormiPKd.exit:
// APPROX-NEXT: [[__R_0_I_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_I_LCSSA]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_rnorm(int x, const double* y) {
@@ -3371,17 +3403,17 @@ extern "C" __device__ double test_rnorm(int x, const double* y) {
// DEFAULT-LABEL: @test_rnorm3df(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rnorm3df(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_rnorm3df(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_rnorm3df(float x, float y, float z) {
@@ -3390,17 +3422,17 @@ extern "C" __device__ float test_rnorm3df(float x, float y, float z) {
// DEFAULT-LABEL: @test_rnorm3d(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rnorm3d(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_rnorm3d(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_rnorm3d(double x, double y, double z) {
@@ -3409,17 +3441,17 @@ extern "C" __device__ double test_rnorm3d(double x, double y, double z) {
// DEFAULT-LABEL: @test_rnorm4df(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rnorm4df(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_rnorm4df(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) {
@@ -3428,17 +3460,17 @@ extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) {
// DEFAULT-LABEL: @test_rnorm4d(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rnorm4d(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_rnorm4d(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w) {
@@ -3485,17 +3517,17 @@ extern "C" __device__ double test_round(double x) {
// DEFAULT-LABEL: @test_rsqrtf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rsqrtf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_rsqrtf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_rsqrtf(float x) {
@@ -3504,17 +3536,17 @@ extern "C" __device__ float test_rsqrtf(float x) {
// DEFAULT-LABEL: @test_rsqrt(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_rsqrt(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_rsqrt(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_rsqrt(double x) {
@@ -3530,7 +3562,7 @@ extern "C" __device__ double test_rsqrt(double x) {
// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]])
// DEFAULT-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]]
// DEFAULT: cond.false.i:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]]
// DEFAULT-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]]
// DEFAULT: _ZL8scalblnffl.exit:
// DEFAULT-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ]
@@ -3545,7 +3577,7 @@ extern "C" __device__ double test_rsqrt(double x) {
// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]])
// FINITEONLY-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]]
// FINITEONLY: cond.false.i:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_scalb_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_scalb_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR12]]
// FINITEONLY-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]]
// FINITEONLY: _ZL8scalblnffl.exit:
// FINITEONLY-NEXT: [[COND_I:%.*]] = phi nnan ninf contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ]
@@ -3560,7 +3592,7 @@ extern "C" __device__ double test_rsqrt(double x) {
// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]])
// APPROX-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]]
// APPROX: cond.false.i:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]]
// APPROX-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]]
// APPROX: _ZL8scalblnffl.exit:
// APPROX-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ]
@@ -3579,7 +3611,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) {
// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]])
// DEFAULT-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]]
// DEFAULT: cond.false.i:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]]
// DEFAULT-NEXT: br label [[_ZL7SCALBLNDL_EXIT]]
// DEFAULT: _ZL7scalblndl.exit:
// DEFAULT-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ]
@@ -3594,7 +3626,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) {
// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]])
// FINITEONLY-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]]
// FINITEONLY: cond.false.i:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_scalb_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_scalb_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) 0x43E0000000000000) #[[ATTR12]]
// FINITEONLY-NEXT: br label [[_ZL7SCALBLNDL_EXIT]]
// FINITEONLY: _ZL7scalblndl.exit:
// FINITEONLY-NEXT: [[COND_I:%.*]] = phi nnan ninf contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ]
@@ -3609,7 +3641,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) {
// APPROX-NEXT: [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]])
// APPROX-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]]
// APPROX: cond.false.i:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]]
// APPROX-NEXT: br label [[_ZL7SCALBLNDL_EXIT]]
// APPROX: _ZL7scalblndl.exit:
// APPROX-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ]
@@ -3681,34 +3713,34 @@ extern "C" __device__ BOOL_TYPE test___signbit(double x) {
// DEFAULT-LABEL: @test_sincosf(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
// DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret void
//
// FINITEONLY-LABEL: @test_sincosf(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincos_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret void
//
// APPROX-LABEL: @test_sincosf(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
// APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret void
//
extern "C" __device__ void test_sincosf(float x, float *y, float *z) {
@@ -3718,34 +3750,34 @@ extern "C" __device__ void test_sincosf(float x, float *y, float *z) {
// DEFAULT-LABEL: @test_sincos(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]]
// DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret void
//
// FINITEONLY-LABEL: @test_sincos(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincos_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]]
// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret void
//
// APPROX-LABEL: @test_sincos(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]]
// APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret void
//
extern "C" __device__ void test_sincos(double x, double *y, double *z) {
@@ -3755,34 +3787,34 @@ extern "C" __device__ void test_sincos(double x, double *y, double *z) {
// DEFAULT-LABEL: @test_sincospif(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
// DEFAULT-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret void
//
// FINITEONLY-LABEL: @test_sincospif(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_sincospi_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
// FINITEONLY-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret void
//
// APPROX-LABEL: @test_sincospif(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
// APPROX-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16]]
// APPROX-NEXT: store float [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret void
//
extern "C" __device__ void test_sincospif(float x, float *y, float *z) {
@@ -3792,34 +3824,34 @@ extern "C" __device__ void test_sincospif(float x, float *y, float *z) {
// DEFAULT-LABEL: @test_sincospi(
// DEFAULT-NEXT: entry:
// DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// DEFAULT-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]]
// DEFAULT-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]]
-// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// DEFAULT-NEXT: ret void
//
// FINITEONLY-LABEL: @test_sincospi(
// FINITEONLY-NEXT: entry:
// FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_sincospi_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// FINITEONLY-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]]
// FINITEONLY-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]]
-// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// FINITEONLY-NEXT: ret void
//
// APPROX-LABEL: @test_sincospi(
// APPROX-NEXT: entry:
// APPROX-NEXT: [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
-// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = call contract double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]]
// APPROX-NEXT: store double [[CALL_I]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
// APPROX-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18]]
// APPROX-NEXT: store double [[TMP0]], ptr [[Z:%.*]], align 8, !tbaa [[TBAA18]]
-// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
+// APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]]
// APPROX-NEXT: ret void
//
extern "C" __device__ void test_sincospi(double x, double *y, double *z) {
@@ -3828,17 +3860,17 @@ extern "C" __device__ void test_sincospi(double x, double *y, double *z) {
// DEFAULT-LABEL: @test_sinf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_sinf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_sinf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I1]]
//
extern "C" __device__ float test_sinf(float x) {
@@ -3847,17 +3879,17 @@ extern "C" __device__ float test_sinf(float x) {
// DEFAULT-LABEL: @test_sin(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_sin(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_sin(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_sin(double x) {
@@ -3866,17 +3898,17 @@ extern "C" __device__ double test_sin(double x) {
// DEFAULT-LABEL: @test_sinpif(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_sinpif(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_sinpif(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_sinpif(float x) {
@@ -3885,17 +3917,17 @@ extern "C" __device__ float test_sinpif(float x) {
// DEFAULT-LABEL: @test_sinpi(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_sinpi(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_sinpi(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_sinpi(double x) {
@@ -3942,17 +3974,17 @@ extern "C" __device__ double test_sqrt(double x) {
// DEFAULT-LABEL: @test_tanf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_tanf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_tanf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_tanf(float x) {
@@ -3961,17 +3993,17 @@ extern "C" __device__ float test_tanf(float x) {
// DEFAULT-LABEL: @test_tan(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_tan(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_tan(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_tan(double x) {
@@ -3980,17 +4012,17 @@ extern "C" __device__ double test_tan(double x) {
// DEFAULT-LABEL: @test_tanhf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_tanhf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_tanhf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_tanhf(float x) {
@@ -3999,17 +4031,17 @@ extern "C" __device__ float test_tanhf(float x) {
// DEFAULT-LABEL: @test_tanh(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_tanh(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_tanh(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_tanh(double x) {
@@ -4018,17 +4050,17 @@ extern "C" __device__ double test_tanh(double x) {
// DEFAULT-LABEL: @test_tgammaf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_tgammaf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_tgammaf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_tgammaf(float x) {
@@ -4037,17 +4069,17 @@ extern "C" __device__ float test_tgammaf(float x) {
// DEFAULT-LABEL: @test_tgamma(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_tgamma(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_tgamma(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_tgamma(double x) {
@@ -4094,17 +4126,17 @@ extern "C" __device__ double test_trunc(double x) {
// DEFAULT-LABEL: @test_y0f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_y0f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_y0f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_y0f(float x) {
@@ -4113,17 +4145,17 @@ extern "C" __device__ float test_y0f(float x) {
// DEFAULT-LABEL: @test_y0(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_y0(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_y0(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_y0(double x) {
@@ -4132,17 +4164,17 @@ extern "C" __device__ double test_y0(double x) {
// DEFAULT-LABEL: @test_y1f(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test_y1f(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test_y1f(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test_y1f(float x) {
@@ -4151,17 +4183,17 @@ extern "C" __device__ float test_y1f(float x) {
// DEFAULT-LABEL: @test_y1(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret double [[CALL_I]]
//
// FINITEONLY-LABEL: @test_y1(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret double [[CALL_I]]
//
// APPROX-LABEL: @test_y1(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret double [[CALL_I]]
//
extern "C" __device__ double test_y1(double x) {
@@ -4175,14 +4207,14 @@ extern "C" __device__ double test_y1(double x) {
// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// DEFAULT-NEXT: ]
// DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]]
// DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL3YNFIF_EXIT]]
// DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]]
// DEFAULT: for.body.i:
@@ -4208,14 +4240,14 @@ extern "C" __device__ double test_y1(double x) {
// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// FINITEONLY-NEXT: ]
// FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]]
// FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL3YNFIF_EXIT]]
// FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]]
// FINITEONLY: for.body.i:
@@ -4241,14 +4273,14 @@ extern "C" __device__ double test_y1(double x) {
// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// APPROX-NEXT: ]
// APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]]
// APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL3YNFIF_EXIT]]
// APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]]
// APPROX: for.body.i:
@@ -4278,14 +4310,14 @@ extern "C" __device__ float test_ynf(int x, float y) {
// DEFAULT-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// DEFAULT-NEXT: ]
// DEFAULT: if.then.i:
-// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL2YNID_EXIT:%.*]]
// DEFAULT: if.then2.i:
-// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: br label [[_ZL2YNID_EXIT]]
// DEFAULT: if.end4.i:
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]]
// DEFAULT-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// DEFAULT-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]]
// DEFAULT: for.body.i:
@@ -4311,14 +4343,14 @@ extern "C" __device__ float test_ynf(int x, float y) {
// FINITEONLY-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// FINITEONLY-NEXT: ]
// FINITEONLY: if.then.i:
-// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT:%.*]]
// FINITEONLY: if.then2.i:
-// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: br label [[_ZL2YNID_EXIT]]
// FINITEONLY: if.end4.i:
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// FINITEONLY-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]]
// FINITEONLY: for.body.i:
@@ -4344,14 +4376,14 @@ extern "C" __device__ float test_ynf(int x, float y) {
// APPROX-NEXT: i32 1, label [[IF_THEN2_I:%.*]]
// APPROX-NEXT: ]
// APPROX: if.then.i:
-// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL2YNID_EXIT:%.*]]
// APPROX: if.then2.i:
-// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: br label [[_ZL2YNID_EXIT]]
// APPROX: if.end4.i:
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]]
// APPROX-NEXT: [[CMP7_I1:%.*]] = icmp sgt i32 [[X]], 1
// APPROX-NEXT: br i1 [[CMP7_I1]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]]
// APPROX: for.body.i:
@@ -4376,17 +4408,17 @@ extern "C" __device__ double test_yn(int x, double y) {
// DEFAULT-LABEL: @test___cosf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test___cosf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test___cosf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test___cosf(float x) {
@@ -4553,17 +4585,17 @@ extern "C" __device__ float test___frsqrt_rn(float x) {
// DEFAULT-LABEL: @test___fsqrt_rn(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test___fsqrt_rn(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR12]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test___fsqrt_rn(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test___fsqrt_rn(float x) {
@@ -4648,17 +4680,17 @@ extern "C" __device__ float test___logf(float x) {
// DEFAULT-LABEL: @test___powf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test___powf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR13]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test___powf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test___powf(float x, float y) {
@@ -4695,25 +4727,25 @@ extern "C" __device__ float test___saturatef(float x) {
// DEFAULT-LABEL: @test___sincosf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]]
// DEFAULT-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
// DEFAULT-NEXT: ret void
//
// FINITEONLY-LABEL: @test___sincosf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL1_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]]
// FINITEONLY-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
// FINITEONLY-NEXT: ret void
//
// APPROX-LABEL: @test___sincosf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: store float [[CALL_I]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
-// APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL1_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]]
// APPROX-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]]
// APPROX-NEXT: ret void
//
@@ -4723,17 +4755,17 @@ extern "C" __device__ void test___sincosf(float x, float *y, float *z) {
// DEFAULT-LABEL: @test___sinf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// DEFAULT-NEXT: ret float [[CALL_I]]
//
// FINITEONLY-LABEL: @test___sinf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
// FINITEONLY-NEXT: ret float [[CALL_I]]
//
// APPROX-LABEL: @test___sinf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
// APPROX-NEXT: ret float [[CALL_I]]
//
extern "C" __device__ float test___sinf(float x) {
@@ -4742,24 +4774,24 @@ extern "C" __device__ float test___sinf(float x) {
// DEFAULT-LABEL: @test___tanf(
// DEFAULT-NEXT: entry:
-// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
-// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
+// DEFAULT-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]]
// DEFAULT-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]])
// DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]]
// DEFAULT-NEXT: ret float [[MUL_I]]
//
// FINITEONLY-LABEL: @test___tanf(
// FINITEONLY-NEXT: entry:
-// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
-// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]]
+// FINITEONLY-NEXT: [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT: [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR14]]
// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]])
// FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[CALL_I3_I]], [[TMP0]]
// FINITEONLY-NEXT: ret float [[MUL_I]]
//
// APPROX-LABEL: @test___tanf(
// APPROX-NEXT: entry:
-// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
-// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
+// APPROX-NEXT: [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]]
// APPROX-NEXT: [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I_I]])
// APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I3_I]], [[TMP0]]
// APPROX-NEXT: ret float [[MUL_I]]
diff --git a/clang/test/Headers/arm-acle-header.c b/clang/test/Headers/arm-acle-header.c
index 1084a8cf87e1..f04c7e1f0f35 100644
--- a/clang/test/Headers/arm-acle-header.c
+++ b/clang/test/Headers/arm-acle-header.c
@@ -1,8 +1,8 @@
// RUN: %clang_cc1 -triple armv7-eabi -target-cpu cortex-a15 -fsyntax-only -ffreestanding %s
-// RUN: %clang_cc1 -triple aarch64-eabi -target-cpu cortex-a53 -fsyntax-only -ffreestanding %s
+// RUN: %clang_cc1 -triple aarch64 -target-cpu cortex-a53 -fsyntax-only -ffreestanding %s
// RUN: %clang_cc1 -triple thumbv7-windows -target-cpu cortex-a53 -fsyntax-only -ffreestanding %s
// RUN: %clang_cc1 -x c++ -triple armv7-eabi -target-cpu cortex-a15 -fsyntax-only -ffreestanding %s
-// RUN: %clang_cc1 -x c++ -triple aarch64-eabi -target-cpu cortex-a57 -fsyntax-only -ffreestanding %s
+// RUN: %clang_cc1 -x c++ -triple aarch64 -target-cpu cortex-a57 -fsyntax-only -ffreestanding %s
// RUN: %clang_cc1 -x c++ -triple thumbv7-windows -target-cpu cortex-a15 -fsyntax-only -ffreestanding %s
// RUN: %clang_cc1 -x c++ -triple thumbv7-windows -target-cpu cortex-a15 -fsyntax-only -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=19.11 %s
// RUN: %clang_cc1 -x c++ -triple aarch64-windows -target-cpu cortex-a53 -fsyntax-only -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=19.11 %s
diff --git a/clang/test/Headers/xmmintrin-unsupported.c b/clang/test/Headers/xmmintrin-unsupported.c
index 1991a132f451..870be81ff29a 100644
--- a/clang/test/Headers/xmmintrin-unsupported.c
+++ b/clang/test/Headers/xmmintrin-unsupported.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 %s -triple aarch64-eabi -fsyntax-only 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 %s -triple aarch64 -fsyntax-only 2>&1 | FileCheck %s
//
// REQUIRES: x86-registered-target
// CHECK: This header is only meant to be used on x86 and x64 architecture
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 2f10bfb1fd82..123b203af3e9 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -5,11 +5,11 @@
// RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64
// AARCH64: error: unknown target CPU 'not-a-cpu'
-// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, cobalt-100, grace{{$}}
+// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
// RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64
// TUNE_AARCH64: error: unknown target CPU 'not-a-cpu'
-// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, cobalt-100, grace{{$}}
+// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
// RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86
// X86: error: unknown target CPU 'not-a-cpu'
@@ -37,7 +37,7 @@
// RUN: not %clang_cc1 -triple amdgcn--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AMDGCN
// AMDGCN: error: unknown target CPU 'not-a-cpu'
-// AMDGCN-NEXT: note: valid target CPU values are: gfx600, tahiti, gfx601, pitcairn, verde, gfx602, hainan, oland, gfx700, kaveri, gfx701, hawaii, gfx702, gfx703, kabini, mullins, gfx704, bonaire, gfx705, gfx801, carrizo, gfx802, iceland, tonga, gfx803, fiji, polaris10, polaris11, gfx805, tongapro, gfx810, stoney, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
+// AMDGCN-NEXT: note: valid target CPU values are: gfx600, tahiti, gfx601, pitcairn, verde, gfx602, hainan, oland, gfx700, kaveri, gfx701, hawaii, gfx702, gfx703, kabini, mullins, gfx704, bonaire, gfx705, gfx801, carrizo, gfx802, iceland, tonga, gfx803, fiji, polaris10, polaris11, gfx805, tongapro, gfx810, stoney, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201, gfx9-generic, gfx10.1-generic, gfx10.3-generic, gfx11-generic{{$}}
// RUN: not %clang_cc1 -triple wasm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix WEBASM
// WEBASM: error: unknown target CPU 'not-a-cpu'
diff --git a/clang/test/OpenMP/for_loop_auto.cpp b/clang/test/OpenMP/for_loop_auto.cpp
index b2c5540a7785..4467de6bba18 100644
--- a/clang/test/OpenMP/for_loop_auto.cpp
+++ b/clang/test/OpenMP/for_loop_auto.cpp
@@ -10,7 +10,7 @@
#ifndef HEADER
#define HEADER
-// CHECK: template <> void do_loop(const auto &v) {
+// CHECK: void do_loop(const auto &v) {
// CHECK-NEXT: #pragma omp parallel for
// CHECK-NEXT: for (const auto &i : v)
// CHECK-NEXT: ;
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 41fb26e04ef6..6ec4dcd60cf6 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -244,16 +244,16 @@
// On ARMv8.2-A and above, +fp16fml implies +fp16.
// On ARMv8.4-A and above, +fp16 implies +fp16fml.
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
// CHECK-FULLFP16-FML: #define __ARM_FEATURE_FP16_FML 1
// CHECK-FULLFP16-NOFML-NOT: #define __ARM_FEATURE_FP16_FML 1
// CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
@@ -263,24 +263,24 @@
// +fp16fml+nosimd doesn't make sense as the fp16fml instructions all require SIMD.
// However, as +fp16fml implies +fp16 there is a set of defines that we would expect.
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
// CHECK-FULLFP16-SCALAR-NOT: #define __ARM_FEATURE_FP16_FML 1
// CHECK-FULLFP16-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
// CHECK-FULLFP16-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
// CHECK-FULLFP16-SCALAR: #define __ARM_FP 0xE
// CHECK-FULLFP16-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.2-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// RUN: %clang --target=aarch64 -march=armv8.4-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_FML 1
// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
@@ -600,14 +600,14 @@
// CHECK-NOSYS128-NOT: __ARM_FEATURE_SYSREG128 1
// ================== Check Armv8.9-A/Armv9.4-A Guarded Control Stack (FEAT_GCS)
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.9-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-NOGCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-NOGCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.9-a+gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-NOGCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.4-a+gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-NOGCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.9-a -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-GCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.4-a -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-GCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.9-a+gcs -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-GCS-DEFAULT %s
-// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.4-a+gcs -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-GCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv8.9-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-NOGCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-NOGCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv8.9-a+gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-NOGCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv9.4-a+gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-NOGCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv8.9-a -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-GCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv9.4-a -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-NOGCS,CHECK-GCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv8.9-a+gcs -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-GCS-DEFAULT %s
+// RUN: %clang --target=aarch64 -march=armv9.4-a+gcs -mbranch-protection=gcs -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-GCS,CHECK-GCS-DEFAULT %s
// CHECK-GCS: __ARM_FEATURE_GCS 1
// CHECK-NOGCS-NOT: __ARM_FEATURE_GCS 1
// CHECK-GCS-DEFAULT: __ARM_FEATURE_GCS_DEFAULT 1
diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c
index 236c9f2479b7..733d068b09b1 100644
--- a/clang/test/Preprocessor/arm-target-features.c
+++ b/clang/test/Preprocessor/arm-target-features.c
@@ -737,7 +737,7 @@
// Test whether predefines are as expected when targeting cortex-m55 (softfp FP ABI as default).
// RUN: %clang -target arm-eabi -mcpu=cortex-m55 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M55 %s
-// M55: #define __ARM_ARCH 8
+// M55: #define __ARM_ARCH 801
// M55: #define __ARM_ARCH_8_1M_MAIN__ 1
// M55: #define __ARM_ARCH_EXT_IDIV__ 1
// M55-NOT: __ARM_ARCH_ISA_ARM
@@ -764,7 +764,7 @@
// KRAIT-ALLOW-FP-INSTR:#define __ARM_VFPV4__ 1
// RUN: %clang -target arm-arm-none-eabi -march=armv8.1-m.main -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V81M %s
-// CHECK-V81M: #define __ARM_ARCH 8
+// CHECK-V81M: #define __ARM_ARCH 801
// CHECK-V81M: #define __ARM_ARCH_8_1M_MAIN__ 1
// CHECK-V81M: #define __ARM_ARCH_ISA_THUMB 2
// CHECK-V81M: #define __ARM_ARCH_PROFILE 'M'
@@ -821,14 +821,14 @@
// CHECK-V8M-CDE-MASK2: #define __ARM_FEATURE_CDE_COPROC 0xff
// RUN: %clang -target armv8.1a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V81A %s
-// CHECK-V81A: #define __ARM_ARCH 8
+// CHECK-V81A: #define __ARM_ARCH 801
// CHECK-V81A: #define __ARM_ARCH_8_1A__ 1
// CHECK-V81A: #define __ARM_ARCH_PROFILE 'A'
// CHECK-V81A: #define __ARM_FEATURE_QRDMX 1
// CHECK-V81A: #define __ARM_FP 0xe
// RUN: %clang -target armv8.2a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V82A %s
-// CHECK-V82A: #define __ARM_ARCH 8
+// CHECK-V82A: #define __ARM_ARCH 802
// CHECK-V82A: #define __ARM_ARCH_8_2A__ 1
// CHECK-V82A: #define __ARM_ARCH_PROFILE 'A'
// CHECK-V82A: #define __ARM_FEATURE_QRDMX 1
@@ -838,67 +838,67 @@
// CHECK-DRIVERKIT-NOT: #define __ARM_PCS_VFP 1
// RUN: %clang -target armv8.3a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V83A %s
-// CHECK-V83A: #define __ARM_ARCH 8
+// CHECK-V83A: #define __ARM_ARCH 803
// CHECK-V83A: #define __ARM_ARCH_8_3A__ 1
// CHECK-V83A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv8.4a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V84A %s
-// CHECK-V84A: #define __ARM_ARCH 8
+// CHECK-V84A: #define __ARM_ARCH 804
// CHECK-V84A: #define __ARM_ARCH_8_4A__ 1
// CHECK-V84A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv8.5a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V85A %s
-// CHECK-V85A: #define __ARM_ARCH 8
+// CHECK-V85A: #define __ARM_ARCH 805
// CHECK-V85A: #define __ARM_ARCH_8_5A__ 1
// CHECK-V85A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv8.6a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V86A %s
-// CHECK-V86A: #define __ARM_ARCH 8
+// CHECK-V86A: #define __ARM_ARCH 806
// CHECK-V86A: #define __ARM_ARCH_8_6A__ 1
// CHECK-V86A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv8.7a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V87A %s
-// CHECK-V87A: #define __ARM_ARCH 8
+// CHECK-V87A: #define __ARM_ARCH 807
// CHECK-V87A: #define __ARM_ARCH_8_7A__ 1
// CHECK-V87A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv8.8a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V88A %s
-// CHECK-V88A: #define __ARM_ARCH 8
+// CHECK-V88A: #define __ARM_ARCH 808
// CHECK-V88A: #define __ARM_ARCH_8_8A__ 1
// CHECK-V88A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv8.9a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V89A %s
-// CHECK-V89A: #define __ARM_ARCH 8
+// CHECK-V89A: #define __ARM_ARCH 809
// CHECK-V89A: #define __ARM_ARCH_8_9A__ 1
// CHECK-V89A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv9a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V9A %s
-// CHECK-V9A: #define __ARM_ARCH 9
+// CHECK-V9A: #define __ARM_ARCH 900
// CHECK-V9A: #define __ARM_ARCH_9A__ 1
// CHECK-V9A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv9.1a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V91A %s
-// CHECK-V91A: #define __ARM_ARCH 9
+// CHECK-V91A: #define __ARM_ARCH 901
// CHECK-V91A: #define __ARM_ARCH_9_1A__ 1
// CHECK-V91A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv9.2a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V92A %s
-// CHECK-V92A: #define __ARM_ARCH 9
+// CHECK-V92A: #define __ARM_ARCH 902
// CHECK-V92A: #define __ARM_ARCH_9_2A__ 1
// CHECK-V92A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv9.3a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V93A %s
-// CHECK-V93A: #define __ARM_ARCH 9
+// CHECK-V93A: #define __ARM_ARCH 903
// CHECK-V93A: #define __ARM_ARCH_9_3A__ 1
// CHECK-V93A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv9.4a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V94A %s
-// CHECK-V94A: #define __ARM_ARCH 9
+// CHECK-V94A: #define __ARM_ARCH 904
// CHECK-V94A: #define __ARM_ARCH_9_4A__ 1
// CHECK-V94A: #define __ARM_ARCH_PROFILE 'A'
// RUN: %clang -target armv9.5a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V95A %s
-// CHECK-V95A: #define __ARM_ARCH 9
+// CHECK-V95A: #define __ARM_ARCH 905
// CHECK-V95A: #define __ARM_ARCH_9_5A__ 1
// CHECK-V95A: #define __ARM_ARCH_PROFILE 'A'
diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c
index 2bf1886951f1..47dbeca206a9 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs.c
+++ b/clang/test/Sema/aarch64-sme-func-attrs.c
@@ -454,3 +454,43 @@ void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")
// expected-note@+1 {{add '__arm_preserves("za")' to the callee if it preserves ZA}}
share_zt0_only();
}
+
+// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
+// expected-error@+1 {{streaming function cannot be multi-versioned}}
+__attribute__((target_version("sme2")))
+void cannot_work_version(void) __arm_streaming {}
+// expected-cpp-error@+5 {{function declared 'void ()' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
+// expected-cpp-note@-2 {{previous declaration is here}}
+// expected-error@+3 {{function declared 'void (void)' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
+// expected-note@-4 {{previous declaration is here}}
+__attribute__((target_version("default")))
+void cannot_work_version(void) {}
+
+
+// expected-cpp-error@+2 {{streaming function cannot be multi-versioned}}
+// expected-error@+1 {{streaming function cannot be multi-versioned}}
+__attribute__((target_clones("sme2")))
+void cannot_work_clones(void) __arm_streaming {}
+
+
+__attribute__((target("sme2")))
+void just_fine_streaming(void) __arm_streaming {}
+__attribute__((target_version("sme2")))
+void just_fine(void) { just_fine_streaming(); }
+__attribute__((target_version("default")))
+void just_fine(void) {}
+
+
+__arm_locally_streaming
+__attribute__((target_version("sme2")))
+void just_fine_locally_streaming(void) {}
+__attribute__((target_version("default")))
+void just_fine_locally_streaming(void) {}
+
+
+void fmv_caller() {
+ cannot_work_version();
+ cannot_work_clones();
+ just_fine();
+ just_fine_locally_streaming();
+}
diff --git a/clang/test/Sema/aarch64-tme-errors.c b/clang/test/Sema/aarch64-tme-errors.c
index 26e931b62bcb..1cb6f6903514 100644
--- a/clang/test/Sema/aarch64-tme-errors.c
+++ b/clang/test/Sema/aarch64-tme-errors.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-eabi -verify %s
+// RUN: %clang_cc1 -triple aarch64 -verify %s
#include "arm_acle.h"
diff --git a/clang/test/Sema/aarch64-tme-tcancel-errors.c b/clang/test/Sema/aarch64-tme-tcancel-errors.c
index 4fcf6817f090..365bf81cab2d 100644
--- a/clang/test/Sema/aarch64-tme-tcancel-errors.c
+++ b/clang/test/Sema/aarch64-tme-tcancel-errors.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +tme -verify %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +tme -verify %s
void t_cancel_const(unsigned short u) {
__builtin_arm_tcancel(u); // expected-error{{argument to '__builtin_arm_tcancel' must be a constant integer}}
}
-// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +tme -verify %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +tme -verify %s
void t_cancel_range(void) {
__builtin_arm_tcancel(0x12345u); // expected-error{{argument value 74565 is outside the valid range [0, 65535]}}
}
diff --git a/clang/test/Sema/conversion-64-32.c b/clang/test/Sema/conversion-64-32.c
index dc417edcbc21..c172dd109f3b 100644
--- a/clang/test/Sema/conversion-64-32.c
+++ b/clang/test/Sema/conversion-64-32.c
@@ -9,9 +9,13 @@ typedef long long long2 __attribute__((__vector_size__(16)));
int4 test1(long2 a) {
int4 v127 = a; // no warning.
- return v127;
+ return v127;
}
int test2(long v) {
return v / 2; // expected-warning {{implicit conversion loses integer precision: 'long' to 'int'}}
}
+
+char test3(short s) {
+ return s * 2; // no warning.
+}
diff --git a/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c b/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c
new file mode 100644
index 000000000000..e22ccbe821f6
--- /dev/null
+++ b/clang/test/Sema/conversion-implicit-int-includes-64-to-32.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -Wimplicit-int-conversion -triple x86_64-apple-darwin %s
+
+int test0(long v) {
+ return v; // expected-warning {{implicit conversion loses integer precision}}
+}
+
+typedef int int4 __attribute__ ((vector_size(16)));
+typedef long long long2 __attribute__((__vector_size__(16)));
+
+int4 test1(long2 a) {
+ int4 v127 = a; // no warning.
+ return v127;
+}
+
+int test2(long v) {
+ return v / 2; // expected-warning {{implicit conversion loses integer precision: 'long' to 'int'}}
+}
+
+char test3(short s) {
+ return s * 2; // expected-warning {{implicit conversion loses integer precision: 'int' to 'char'}}
+}
diff --git a/clang/test/Sema/ms_predefined_expr.cpp b/clang/test/Sema/ms_predefined_expr.cpp
index 9f4eb2763430..b42a494beef9 100644
--- a/clang/test/Sema/ms_predefined_expr.cpp
+++ b/clang/test/Sema/ms_predefined_expr.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions
+// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions -fexperimental-new-constant-interpreter
using size_t = __SIZE_TYPE__;
diff --git a/clang/test/SemaCXX/PR40395.cpp b/clang/test/SemaCXX/PR40395.cpp
index 469c86d56209..ea0fad201877 100644
--- a/clang/test/SemaCXX/PR40395.cpp
+++ b/clang/test/SemaCXX/PR40395.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -std=c++17 -fms-extensions -triple=x86_64-pc-win32 -verify %s
+// RUN: %clang_cc1 -std=c++17 -fms-extensions -triple=x86_64-pc-win32 -verify %s -fexperimental-new-constant-interpreter
// expected-no-diagnostics
// PR40395 - ConstantExpr shouldn't cause the template object to infinitely
diff --git a/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp b/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp
new file mode 100644
index 000000000000..1901589ef732
--- /dev/null
+++ b/clang/test/SemaCXX/attr-target-mv-warn-unused.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -Wunused %s
+
+__attribute__((target("sse3")))
+static int not_used_fmv() { return 1; }
+__attribute__((target("avx2")))
+static int not_used_fmv() { return 2; }
+__attribute__((target("default")))
+static int not_used_fmv() { return 0; } // expected-warning {{unused function 'not_used_fmv'}}
+
+__attribute__((target("sse3")))
+static int definitely_used_fmv() { return 1; }
+__attribute__((target("avx2")))
+static int definitely_used_fmv() { return 2; }
+__attribute__((target("default")))
+static int definitely_used_fmv() { return 0; }
+int definite_user() { return definitely_used_fmv(); }
diff --git a/clang/test/SemaCXX/compound-literal.cpp b/clang/test/SemaCXX/compound-literal.cpp
index 5957099de53a..a3d3b9faa9fe 100644
--- a/clang/test/SemaCXX/compound-literal.cpp
+++ b/clang/test/SemaCXX/compound-literal.cpp
@@ -3,6 +3,7 @@
// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -ast-dump %s > %t-11
// RUN: FileCheck --input-file=%t-11 %s
// RUN: FileCheck --input-file=%t-11 %s --check-prefix=CHECK-CXX11
+// RUN: %clang_cc1 -verify -std=c++17 %s
// http://llvm.org/PR7905
namespace PR7905 {
@@ -108,3 +109,23 @@ int computed_with_lambda = [] {
return result;
}();
#endif
+
+namespace DynamicFileScopeLiteral {
+// This covers the case where we have a file-scope compound literal with a
+// non-constant initializer in C++. Previously, we had a bug where Clang forgot
+// to consider initializer list elements for bases.
+struct Empty {};
+struct Foo : Empty { // expected-note 0+ {{candidate constructor}}
+ int x;
+ int y;
+};
+int f();
+#if __cplusplus < 201103L
+// expected-error@+6 {{non-aggregate type 'Foo' cannot be initialized with an initializer list}}
+#elif __cplusplus < 201703L
+// expected-error@+4 {{no matching constructor}}
+#else
+// expected-error@+2 {{initializer element is not a compile-time constant}}
+#endif
+Foo o = (Foo){ {}, 1, f() };
+}
diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
index 00a39f9f03b7..71e55c8290ee 100644
--- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
+++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s -fexperimental-new-constant-interpreter
template <typename Iterator> class normal_iterator {};
diff --git a/clang/test/SemaCXX/conditional-expr.cpp b/clang/test/SemaCXX/conditional-expr.cpp
index 9a5e2bac4341..01effaa18932 100644
--- a/clang/test/SemaCXX/conditional-expr.cpp
+++ b/clang/test/SemaCXX/conditional-expr.cpp
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx11 -std=c++11 -Wsign-conversion %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx11 -std=c++11 -Wsign-conversion %s -fexperimental-new-constant-interpreter
// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx17 -std=c++17 -Wsign-conversion %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify=expected,expected-cxx17 -std=c++17 -Wsign-conversion %s -fexperimental-new-constant-interpreter
// C++ rules for ?: are a lot stricter than C rules, and have to take into
// account more conversion options.
diff --git a/clang/test/SemaCXX/crash-GH10518.cpp b/clang/test/SemaCXX/crash-GH10518.cpp
new file mode 100644
index 000000000000..6c5f80afd3cf
--- /dev/null
+++ b/clang/test/SemaCXX/crash-GH10518.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -verify -std=c++98 %s
+// RUN: %clang_cc1 -verify -std=c++11 %s
+// RUN: %clang_cc1 -verify -std=c++14 %s
+// RUN: %clang_cc1 -verify -std=c++17 %s
+// RUN: %clang_cc1 -verify -std=c++20 %s
+// RUN: %clang_cc1 -verify -std=c++23 %s
+// RUN: %clang_cc1 -verify -std=c++2c %s
+
+// https://github.com/llvm/llvm-project/issues/10518
+
+template <class T>
+class A : public T {
+};
+
+template <class T>
+class B : public A<T> {
+};
+
+template <class T>
+class B<int> : public A<T> { // expected-error 0-1 {{}}
+ B(T *t) {}
+};
diff --git a/clang/test/SemaCXX/crash-GH49103-2.cpp b/clang/test/SemaCXX/crash-GH49103-2.cpp
new file mode 100644
index 000000000000..4c17a054c73a
--- /dev/null
+++ b/clang/test/SemaCXX/crash-GH49103-2.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -verify -std=c++98 %s
+// RUN: %clang_cc1 -verify -std=c++11 %s
+// RUN: %clang_cc1 -verify -std=c++14 %s
+// RUN: %clang_cc1 -verify -std=c++17 %s
+// RUN: %clang_cc1 -verify -std=c++20 %s
+// RUN: %clang_cc1 -verify -std=c++23 %s
+// RUN: %clang_cc1 -verify -std=c++2c %s
+
+// https://github.com/llvm/llvm-project/issues/49103
+
+template<class> struct A; // expected-note 0+ {{}}
+struct S : __make_integer_seq<A, int, 42> { }; // expected-error 0+ {{}}
+S s;
diff --git a/clang/test/SemaCXX/crash-GH67914.cpp b/clang/test/SemaCXX/crash-GH67914.cpp
new file mode 100644
index 000000000000..fbaeac636c0d
--- /dev/null
+++ b/clang/test/SemaCXX/crash-GH67914.cpp
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 -verify -std=c++98 %s
+// RUN: %clang_cc1 -verify -std=c++11 %s
+// RUN: %clang_cc1 -verify -std=c++14 %s
+// RUN: %clang_cc1 -verify -std=c++17 %s
+// RUN: %clang_cc1 -verify -std=c++20 %s
+// RUN: %clang_cc1 -verify -std=c++23 %s
+// RUN: %clang_cc1 -verify -std=c++2c %s
+
+// https://github.com/llvm/llvm-project/issues/67914
+
+template < typename, int >
+struct Mask;
+
+template < int, class >
+struct conditional {
+ using type = Mask< int, 16 >; // expected-warning 0+ {{}}
+};
+
+template < class _Then >
+struct conditional< 0, _Then > {
+ using type = _Then; // expected-warning 0+ {{}}
+};
+
+template < int _Bp, class, class _Then >
+using conditional_t = typename conditional< _Bp, _Then >::type; // expected-warning 0+ {{}}
+
+template < typename, int >
+struct Array;
+
+template < typename, int, bool, typename >
+struct StaticArrayImpl;
+
+template < typename Value_, int Size_ >
+struct Mask : StaticArrayImpl< Value_, Size_, 1, Mask< Value_, Size_ > > { // expected-note 0+ {{}}
+ template < typename T1 >
+ Mask(T1) {} // expected-note 0+ {{}}
+};
+
+template < typename T >
+void load(typename T::MaskType mask) {
+ T::load_(mask); // expected-note 0+ {{}}
+}
+
+template < typename Value_, int IsMask_, typename Derived_ >
+struct StaticArrayImpl< Value_, 32, IsMask_, Derived_ > {
+ using Array1 = conditional_t< IsMask_, void, Array< Value_, 16 > >; // expected-warning 0+ {{}}
+
+ template < typename Mask >
+ static Derived_ load_(Mask mask) {
+ return Derived_{load< Array1 >(mask.a1), Mask{}}; // expected-error 0+ {{}}
+ }
+
+ Array1 a1;
+};
+
+template < typename Derived_ >
+struct KMaskBase;
+
+template < typename Derived_ >
+struct StaticArrayImpl< float, 16, 0, Derived_ > {
+ template < typename Mask >
+ static Derived_ load_(Mask mask);
+};
+
+template < typename Derived_ >
+struct StaticArrayImpl< float, 16, 1, Mask< float, 16 > > : KMaskBase< Derived_ > {}; // expected-error 0+ {{}}
+
+template < typename Derived_ >
+struct StaticArrayImpl< int, 16, 1, Derived_ > {};
+
+template < typename Value_, int Size_ >
+struct Array : StaticArrayImpl< Value_, Size_, 0, Array< Value_, Size_ > > {
+ using MaskType = Mask< Value_, Size_ >; // expected-warning 0+ {{}}
+};
+
+void test11_load_masked() {
+ load< Array< float, 32 > >{} == 0; // expected-error 0+ {{}} expected-warning 0+ {{}} expected-note 0+ {{}}
+}
diff --git a/clang/test/SemaCXX/crash-GH78388.cpp b/clang/test/SemaCXX/crash-GH78388.cpp
new file mode 100644
index 000000000000..cdec4d5bedef
--- /dev/null
+++ b/clang/test/SemaCXX/crash-GH78388.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify -std=c++98 %s
+// RUN: %clang_cc1 -verify -std=c++11 %s
+// RUN: %clang_cc1 -verify -std=c++14 %s
+// RUN: %clang_cc1 -verify -std=c++17 %s
+// RUN: %clang_cc1 -verify -std=c++20 %s
+// RUN: %clang_cc1 -verify -std=c++23 %s
+// RUN: %clang_cc1 -verify -std=c++2c %s
+
+// https://github.com/llvm/llvm-project/issues/78388
+
+typedef mbstate_t; // expected-error 0+ {{}} expected-note 0+ {{}}
+ template < typename , typename , typename >
+ class a // expected-error 0+ {{}}
+ class b { // expected-error 0+ {{}}
+ namespace { // expected-note 0+ {{}} expected-note 0+ {{}}
+ template < typename c > b::operator=() { // expected-error 0+ {{}} expected-note 0+ {{}}
+ struct :a< c, char, stdmbstate_t > d // expected-error 0+ {{}} expected-warning 0+ {{}}
diff --git a/clang/test/SemaCXX/expression-traits.cpp b/clang/test/SemaCXX/expression-traits.cpp
index a76f0c4a6175..64ddca091e94 100644
--- a/clang/test/SemaCXX/expression-traits.cpp
+++ b/clang/test/SemaCXX/expression-traits.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -fcxx-exceptions %s
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -fcxx-exceptions %s -fexperimental-new-constant-interpreter
//
// Tests for "expression traits" intrinsics such as __is_lvalue_expr.
diff --git a/clang/test/SemaCXX/ms-uuid.cpp b/clang/test/SemaCXX/ms-uuid.cpp
index 21f93ecc3fa2..172e036e15f3 100644
--- a/clang/test/SemaCXX/ms-uuid.cpp
+++ b/clang/test/SemaCXX/ms-uuid.cpp
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 -fsyntax-only -verify -fms-extensions %s -Wno-deprecated-declarations
+// RUN: %clang_cc1 -fsyntax-only -verify -fms-extensions %s -Wno-deprecated-declarations -fexperimental-new-constant-interpreter
// RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify -fms-extensions %s -Wno-deprecated-declarations
+// RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify -fms-extensions %s -Wno-deprecated-declarations -fexperimental-new-constant-interpreter
typedef struct _GUID {
__UINT32_TYPE__ Data1;
diff --git a/clang/test/SemaCXX/self-comparison.cpp b/clang/test/SemaCXX/self-comparison.cpp
index 72127f110241..c3c875565ff1 100644
--- a/clang/test/SemaCXX/self-comparison.cpp
+++ b/clang/test/SemaCXX/self-comparison.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a -fexperimental-new-constant-interpreter
int foo(int x) {
return x == x; // expected-warning {{self-comparison always evaluates to true}}
diff --git a/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp b/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp
index 9f2386ddc3c6..800783f6d92b 100644
--- a/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp
+++ b/clang/test/SemaCXX/warn-overaligned-type-thrown.cpp
@@ -9,7 +9,7 @@
// RUN: %clang_cc1 -triple arm64-apple-tvos12 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
// RUN: %clang_cc1 -triple arm64-apple-watchos5 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
// RUN: %clang_cc1 -triple arm-linux-androideabi -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
-// RUN: %clang_cc1 -triple aarch64-linux-gnueabi -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
+// RUN: %clang_cc1 -triple aarch64 -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
// RUN: %clang_cc1 -triple mipsel-linux-gnu -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
// RUN: %clang_cc1 -triple mips64el-linux-gnu -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
// RUN: %clang_cc1 -triple wasm32-unknown-unknown -verify -fsyntax-only -std=c++11 -fcxx-exceptions -fexceptions %s
diff --git a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp
index bda6a65c0216..d54b394df4eb 100644
--- a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp
+++ b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow -D AVOID %s
-// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow -Wshadow-uncaptured-local %s
-// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -Wshadow-all %s
+// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow -D AVOID %s
+// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow -Wshadow-uncaptured-local %s
+// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx14 -fsyntax-only -Wshadow-all %s
// RUN: %clang_cc1 -std=c++17 -verify -fsyntax-only -Wshadow-all %s
// RUN: %clang_cc1 -std=c++20 -verify -fsyntax-only -Wshadow-all %s
@@ -179,3 +179,89 @@ void f() {
#endif
}
}
+
+namespace GH71976 {
+#ifdef AVOID
+struct A {
+ int b = 5;
+ int foo() {
+ return [b = b]() { return b; }(); // no -Wshadow diagnostic, init-capture does not shadow b due to not capturing this
+ }
+};
+
+struct B {
+ int a;
+ void foo() {
+ auto b = [a = this->a] {}; // no -Wshadow diagnostic, init-capture does not shadow a due to not capturing his
+ }
+};
+
+struct C {
+ int b = 5;
+ int foo() {
+ return [a = b]() {
+ return [=, b = a]() { // no -Wshadow diagnostic, init-capture does not shadow b due to outer lambda
+ return b;
+ }();
+ }();
+ }
+};
+
+#else
+struct A {
+ int b = 5; // expected-note {{previous}}
+ int foo() {
+ return [b = b]() { return b; }(); // expected-warning {{declaration shadows a field}}
+ }
+};
+
+struct B {
+ int a; // expected-note {{previous}}
+ void foo() {
+ auto b = [a = this->a] {}; // expected-warning {{declaration shadows a field}}
+ }
+};
+
+struct C {
+ int b = 5; // expected-note {{previous}}
+ int foo() {
+ return [a = b]() {
+ return [=, b = a]() { // expected-warning {{declaration shadows a field}}
+ return b;
+ }();
+ }();
+ }
+};
+
+struct D {
+ int b = 5; // expected-note {{previous}}
+ int foo() {
+ return [b = b, this]() { return b; }(); // expected-warning {{declaration shadows a field}}
+ }
+};
+
+struct E {
+ int b = 5;
+ int foo() {
+ return [a = b]() { // expected-note {{previous}}
+ return [=, a = a]() { // expected-warning {{shadows a local}}
+ return a;
+ }();
+ }();
+ }
+};
+
+#endif
+
+struct S {
+ int a ;
+};
+
+int foo() {
+ auto [a] = S{0}; // expected-note {{previous}} \
+ // cxx14-warning {{decomposition declarations are a C++17 extension}}
+ [a = a] () { // expected-warning {{declaration shadows a structured binding}}
+ }();
+}
+
+}
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp
new file mode 100644
index 000000000000..90c11b1be95c
--- /dev/null
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -std=c++20 -Wno-all -Wunsafe-buffer-usage \
+// RUN: -fsafe-buffer-usage-suggestions \
+// RUN: -verify %s
+
+// CHECK-NOT: [-Wunsafe-buffer-usage]
+
+
+void foo(unsigned idx) {
+ int buffer[10]; // expected-warning{{'buffer' is an unsafe buffer that does not perform bounds checks}}
+ // expected-note@-1{{change type of 'buffer' to 'std::array' to label it for hardening}}
+ buffer[idx] = 0; // expected-note{{used in buffer access here}}
+}
+
+int global_buffer[10]; // expected-warning{{'global_buffer' is an unsafe buffer that does not perform bounds checks}}
+void foo2(unsigned idx) {
+ global_buffer[idx] = 0; // expected-note{{used in buffer access here}}
+}
+
+struct Foo {
+ int member_buffer[10];
+};
+void foo2(Foo& f, unsigned idx) {
+ f.member_buffer[idx] = 0; // expected-warning{{unsafe buffer access}}
+}
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug.cpp
index 5fff0854d454..a5b578b98d4e 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-debug.cpp
@@ -32,15 +32,6 @@ void foo() {
// debug-note{{safe buffers debug: gadget 'ULCArraySubscript' refused to produce a fix}}
}
-void failed_decl() {
- int a[10]; // expected-warning{{'a' is an unsafe buffer that does not perform bounds checks}} \
- // debug-note{{safe buffers debug: failed to produce fixit for declaration 'a' : not a pointer}}
-
- for (int i = 0; i < 10; i++) {
- a[i] = i; // expected-note{{used in buffer access here}}
- }
-}
-
void failed_multiple_decl() {
int *a = new int[4], b; // expected-warning{{'a' is an unsafe pointer used for buffer access}} \
// debug-note{{safe buffers debug: failed to produce fixit for declaration 'a' : multiple VarDecls}}
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-array.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-array.cpp
new file mode 100644
index 000000000000..3adfc324dfbe
--- /dev/null
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fixits-local-var-array.cpp
@@ -0,0 +1,228 @@
+// RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage \
+// RUN: -fsafe-buffer-usage-suggestions \
+// RUN: -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+typedef int * Int_ptr_t;
+typedef int Int_t;
+
+void simple(unsigned idx) {
+ int buffer[10];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:17}:"std::array<int, 10> buffer"
+ buffer[idx] = 0;
+}
+
+void array2d(unsigned idx) {
+ int buffer[10][10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ buffer[idx][idx] = 0;
+}
+
+void array2d_vla(unsigned sz, unsigned idx) {
+ int buffer1[10][sz];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ int buffer2[sz][10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ buffer1[idx][idx] = 0;
+ buffer2[idx][idx] = 0;
+}
+
+void array2d_assign_from_elem(unsigned idx) {
+ int buffer[10][10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ int a = buffer[idx][idx];
+}
+
+void array2d_use(int *);
+void array2d_call(unsigned idx) {
+ int buffer[10][10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ array2d_use(buffer[idx]);
+}
+void array2d_call_vla(unsigned sz, unsigned idx) {
+ int buffer[10][sz];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ array2d_use(buffer[idx]);
+}
+
+void array2d_typedef(unsigned idx) {
+ typedef int ten_ints_t[10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ ten_ints_t buffer[10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+ buffer[idx][idx] = 0;
+}
+
+void whitespace_in_declaration(unsigned idx) {
+ int buffer_w [ 10 ];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:35}:"std::array<int, 10> buffer_w"
+ buffer_w[idx] = 0;
+}
+
+void comments_in_declaration(unsigned idx) {
+ int /* [A] */ buffer_w /* [B] */ [ /* [C] */ 10 /* [D] */ ] ;
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:69}:"std::array<int /* [A] */, /* [C] */ 10 /* [D] */> buffer_w"
+ buffer_w[idx] = 0;
+}
+
+void initializer(unsigned idx) {
+ int buffer[3] = {0};
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:16}:"std::array<int, 3> buffer"
+
+ int buffer2[3] = {0, 1, 2};
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:17}:"std::array<int, 3> buffer2"
+
+ buffer[idx] = 0;
+ buffer2[idx] = 0;
+}
+
+void auto_size(unsigned idx) {
+ int buffer[] = {0, 1, 2};
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+// FIXME: implement support
+
+ buffer[idx] = 0;
+}
+
+void universal_initialization(unsigned idx) {
+ int buffer[] {0, 1, 2};
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+// FIXME: implement support
+
+ buffer[idx] = 0;
+}
+
+void multi_decl1(unsigned idx) {
+ int a, buffer[10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+// FIXME: implement support
+
+ buffer[idx] = 0;
+}
+
+void multi_decl2(unsigned idx) {
+ int buffer[10], b;
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]
+// FIXME: implement support
+
+ buffer[idx] = 0;
+}
+
+void local_array_ptr_to_const(unsigned idx, const int*& a) {
+ const int * buffer[10] = {a};
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:25}:"std::array<const int *, 10> buffer"
+ a = buffer[idx];
+}
+
+void local_array_const_ptr(unsigned idx, int*& a) {
+ int * const buffer[10] = {a};
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:25}:"std::array<int * const, 10> buffer"
+
+ a = buffer[idx];
+}
+
+void local_array_const_ptr_via_typedef(unsigned idx, int*& a) {
+ typedef int * const my_const_ptr;
+ my_const_ptr buffer[10] = {a};
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:26}:"std::array<my_const_ptr, 10> buffer"
+
+ a = buffer[idx];
+}
+
+void local_array_const_ptr_to_const(unsigned idx, const int*& a) {
+ const int * const buffer[10] = {a};
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:31}:"std::array<const int * const, 10> buffer"
+
+ a = buffer[idx];
+
+}
+
+template<typename T>
+void unsupported_local_array_in_template(unsigned idx) {
+ T buffer[10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:.*-[[@LINE-1]]:.*}
+ buffer[idx] = 0;
+}
+// Instantiate the template function to force its analysis.
+template void unsupported_local_array_in_template<int>(unsigned);
+
+typedef unsigned int my_uint;
+void typedef_as_elem_type(unsigned idx) {
+ my_uint buffer[10];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:21}:"std::array<my_uint, 10> buffer"
+ buffer[idx] = 0;
+}
+
+void decltype_as_elem_type(unsigned idx) {
+ int a;
+ decltype(a) buffer[10];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:25}:"std::array<decltype(a), 10> buffer"
+ buffer[idx] = 0;
+}
+
+void macro_as_elem_type(unsigned idx) {
+#define MY_INT int
+ MY_INT buffer[10];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:.*-[[@LINE-1]]:.*}
+// FIXME: implement support
+
+ buffer[idx] = 0;
+#undef MY_INT
+}
+
+void macro_as_identifier(unsigned idx) {
+#define MY_BUFFER buffer
+ int MY_BUFFER[10];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:20}:"std::array<int, 10> MY_BUFFER"
+ MY_BUFFER[idx] = 0;
+#undef MY_BUFFER
+}
+
+void macro_as_size(unsigned idx) {
+#define MY_TEN 10
+ int buffer[MY_TEN];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:21}:"std::array<int, MY_TEN> buffer"
+ buffer[idx] = 0;
+#undef MY_TEN
+}
+
+typedef unsigned int my_array[42];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:.*-[[@LINE-1]]:.*}
+void typedef_as_array_type(unsigned idx) {
+ my_array buffer;
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:.*-[[@LINE-1]]:.*}
+ buffer[idx] = 0;
+}
+
+void decltype_as_array_type(unsigned idx) {
+ int buffer[42];
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:.*-[[@LINE-1]]:.*}
+ decltype(buffer) buffer2;
+// CHECK-NOT: fix-it:"{{.*}}":{[[@LINE-1]]:.*-[[@LINE-1]]:.*}
+ buffer2[idx] = 0;
+}
+
+void constant_as_size(unsigned idx) {
+ const unsigned my_const = 10;
+ int buffer[my_const];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:23}:"std::array<int, my_const> buffer"
+ buffer[idx] = 0;
+}
+
+void subscript_negative() {
+ int buffer[10];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:17}:"std::array<int, 10> buffer"
+
+ // For constant-size arrays any negative index will lead to buffer underflow.
+ // std::array::operator[] has unsigned parameter so the value will be casted to unsigned.
+ // This will very likely be buffer overflow but hardened std::array catch these at runtime.
+ buffer[-5] = 0;
+}
+
+void subscript_signed(int signed_idx) {
+ int buffer[10];
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:17}:"std::array<int, 10> buffer"
+
+ // For constant-size arrays any negative index will lead to buffer underflow.
+ // std::array::operator[] has unsigned parameter so the value will be casted to unsigned.
+ // This will very likely be buffer overflow but hardened std::array catches these at runtime.
+ buffer[signed_idx] = 0;
+}
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp
index c5f0a9ef9293..67cdf252d6a8 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage.cpp
@@ -61,6 +61,7 @@ void testArraySubscripts(int *p, int **pp) {
);
int a[10]; // expected-warning{{'a' is an unsafe buffer that does not perform bounds checks}}
+ // expected-note@-1{{change type of 'a' to 'std::array' to label it for hardening}}
int b[10][10]; // expected-warning{{'b' is an unsafe buffer that does not perform bounds checks}}
foo(a[1], 1[a], // expected-note2{{used in buffer access here}}
@@ -174,6 +175,7 @@ auto file_scope_lambda = [](int *ptr) {
void testLambdaCapture() {
int a[10]; // expected-warning{{'a' is an unsafe buffer that does not perform bounds checks}}
int b[10]; // expected-warning{{'b' is an unsafe buffer that does not perform bounds checks}}
+ // expected-note@-1{{change type of 'b' to 'std::array' to label it for hardening}}
int c[10];
auto Lam1 = [a]() {
@@ -191,7 +193,9 @@ void testLambdaCapture() {
void testLambdaImplicitCapture() {
int a[10]; // expected-warning{{'a' is an unsafe buffer that does not perform bounds checks}}
+ // expected-note@-1{{change type of 'a' to 'std::array' to label it for hardening}}
int b[10]; // expected-warning{{'b' is an unsafe buffer that does not perform bounds checks}}
+ // expected-note@-1{{change type of 'b' to 'std::array' to label it for hardening}}
auto Lam1 = [=]() {
return a[1]; // expected-note{{used in buffer access here}}
@@ -344,6 +348,7 @@ template<typename T> void fArr(T t[]) {
// expected-warning@-1{{'t' is an unsafe pointer used for buffer access}}
foo(t[1]); // expected-note{{used in buffer access here}}
T ar[8]; // expected-warning{{'ar' is an unsafe buffer that does not perform bounds checks}}
+ // expected-note@-1{{change type of 'ar' to 'std::array' to label it for hardening}}
foo(ar[5]); // expected-note{{used in buffer access here}}
}
diff --git a/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp b/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp
new file mode 100644
index 000000000000..8c21da5a139f
--- /dev/null
+++ b/clang/test/SemaCXX/warn-unused-filescoped-fmv.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify -Wunused -std=c++98 %s
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify -Wunused -std=c++14 %s
+
+__attribute__((target_version("fp16")))
+static int not_used_fmv(void) { return 1; }
+__attribute__((target_version("fp16fml")))
+static int not_used_fmv(void) { return 2; }
+__attribute__((target_version("default")))
+static int not_used_fmv(void) { return 0; } // expected-warning {{unused function 'not_used_fmv'}}
+
+
+__attribute__((target_version("fp16")))
+static int definitely_used_fmv(void) { return 1; }
+__attribute__((target_version("fp16fml")))
+static int definitely_used_fmv(void) { return 2; }
+__attribute__((target_version("default")))
+static int definitely_used_fmv(void) { return 0; }
+int definite_user(void) { return definitely_used_fmv(); }
diff --git a/clang/test/SemaOpenCL/operators.cl b/clang/test/SemaOpenCL/operators.cl
index cf359acd5acb..76a7692a7105 100644
--- a/clang/test/SemaOpenCL/operators.cl
+++ b/clang/test/SemaOpenCL/operators.cl
@@ -118,6 +118,6 @@ kernel void pointer_ops(){
bool b = !p;
b = p==0;
int i;
- b = !&i;
+ b = !&i; // expected-warning {{address of 'i' will always evaluate to 'true'}}
b = &i==(int *)1;
}
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index 5ee6092bb9bb..e122cea50f72 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -399,7 +399,8 @@ class ClangFormatDiagConsumer : public DiagnosticConsumer {
};
// Returns true on error.
-static bool format(StringRef FileName, bool IsSTDIN) {
+static bool format(StringRef FileName) {
+ const bool IsSTDIN = FileName == "-";
if (!OutputXML && Inplace && IsSTDIN) {
errs() << "error: cannot use -i when reading from stdin.\n";
return false;
@@ -545,24 +546,25 @@ static void PrintVersion(raw_ostream &OS) {
}
// Dump the configuration.
-static int dumpConfig(bool IsSTDIN) {
+static int dumpConfig() {
std::unique_ptr<llvm::MemoryBuffer> Code;
-
- // `FileNames` must have at least "-" in it even if no file was specified.
- assert(!FileNames.empty());
-
- // Read in the code in case the filename alone isn't enough to detect the
- // language.
- ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
- MemoryBuffer::getFileOrSTDIN(FileNames[0]);
- if (std::error_code EC = CodeOrErr.getError()) {
- llvm::errs() << EC.message() << "\n";
- return 1;
+ // We can't read the code to detect the language if there's no file name.
+ if (!FileNames.empty()) {
+ // Read in the code in case the filename alone isn't enough to detect the
+ // language.
+ ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
+ MemoryBuffer::getFileOrSTDIN(FileNames[0]);
+ if (std::error_code EC = CodeOrErr.getError()) {
+ llvm::errs() << EC.message() << "\n";
+ return 1;
+ }
+ Code = std::move(CodeOrErr.get());
}
- Code = std::move(CodeOrErr.get());
-
llvm::Expected<clang::format::FormatStyle> FormatStyle =
- clang::format::getStyle(Style, IsSTDIN ? AssumeFileName : FileNames[0],
+ clang::format::getStyle(Style,
+ FileNames.empty() || FileNames[0] == "-"
+ ? AssumeFileName
+ : FileNames[0],
FallbackStyle, Code ? Code->getBuffer() : "");
if (!FormatStyle) {
llvm::errs() << llvm::toString(FormatStyle.takeError()) << "\n";
@@ -682,11 +684,8 @@ int main(int argc, const char **argv) {
return 0;
}
- if (FileNames.empty())
- FileNames.push_back("-");
-
if (DumpConfig)
- return dumpConfig(FileNames[0] == "-");
+ return dumpConfig();
if (!Files.empty()) {
std::ifstream ExternalFileOfFiles{std::string(Files)};
@@ -699,7 +698,10 @@ int main(int argc, const char **argv) {
errs() << "Clang-formating " << LineNo << " files\n";
}
- if (FileNames.size() != 1 &&
+ if (FileNames.empty())
+ return clang::format::format("-");
+
+ if (FileNames.size() > 1 &&
(!Offsets.empty() || !Lengths.empty() || !LineRanges.empty())) {
errs() << "error: -offset, -length and -lines can only be used for "
"single file.\n";
@@ -709,14 +711,13 @@ int main(int argc, const char **argv) {
unsigned FileNo = 1;
bool Error = false;
for (const auto &FileName : FileNames) {
- const bool IsSTDIN = FileName == "-";
- if (!IsSTDIN && isIgnored(FileName))
+ if (isIgnored(FileName))
continue;
if (Verbose) {
errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] "
<< FileName << "\n";
}
- Error |= clang::format::format(FileName, IsSTDIN);
+ Error |= clang::format::format(FileName);
}
return Error ? 1 : 0;
}
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index e9d2c6aad371..b5c6be3c557b 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -78,64 +78,6 @@ static void LLVMErrorHandler(void *UserData, const char *Message,
}
#ifdef CLANG_HAVE_RLIMITS
-#if defined(__linux__) && defined(__PIE__)
-static size_t getCurrentStackAllocation() {
- // If we can't compute the current stack usage, allow for 512K of command
- // line arguments and environment.
- size_t Usage = 512 * 1024;
- if (FILE *StatFile = fopen("/proc/self/stat", "r")) {
- // We assume that the stack extends from its current address to the end of
- // the environment space. In reality, there is another string literal (the
- // program name) after the environment, but this is close enough (we only
- // need to be within 100K or so).
- unsigned long StackPtr, EnvEnd;
- // Disable silly GCC -Wformat warning that complains about length
- // modifiers on ignored format specifiers. We want to retain these
- // for documentation purposes even though they have no effect.
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wformat"
-#endif
- if (fscanf(StatFile,
- "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*lu %*lu %*lu %*lu %*lu "
- "%*lu %*ld %*ld %*ld %*ld %*ld %*ld %*llu %*lu %*ld %*lu %*lu "
- "%*lu %*lu %lu %*lu %*lu %*lu %*lu %*lu %*llu %*lu %*lu %*d %*d "
- "%*u %*u %*llu %*lu %*ld %*lu %*lu %*lu %*lu %*lu %*lu %lu %*d",
- &StackPtr, &EnvEnd) == 2) {
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
- Usage = StackPtr < EnvEnd ? EnvEnd - StackPtr : StackPtr - EnvEnd;
- }
- fclose(StatFile);
- }
- return Usage;
-}
-
-#include <alloca.h>
-
-LLVM_ATTRIBUTE_NOINLINE
-static void ensureStackAddressSpace() {
- // Linux kernels prior to 4.1 will sometimes locate the heap of a PIE binary
- // relatively close to the stack (they are only guaranteed to be 128MiB
- // apart). This results in crashes if we happen to heap-allocate more than
- // 128MiB before we reach our stack high-water mark.
- //
- // To avoid these crashes, ensure that we have sufficient virtual memory
- // pages allocated before we start running.
- size_t Curr = getCurrentStackAllocation();
- const int kTargetStack = DesiredStackSize - 256 * 1024;
- if (Curr < kTargetStack) {
- volatile char *volatile Alloc =
- static_cast<volatile char *>(alloca(kTargetStack - Curr));
- Alloc[0] = 0;
- Alloc[kTargetStack - Curr - 1] = 0;
- }
-}
-#else
-static void ensureStackAddressSpace() {}
-#endif
-
/// Attempt to ensure that we have at least 8MiB of usable stack space.
static void ensureSufficientStack() {
struct rlimit rlim;
@@ -159,10 +101,6 @@ static void ensureSufficientStack() {
rlim.rlim_cur != DesiredStackSize)
return;
}
-
- // We should now have a stack of size at least DesiredStackSize. Ensure
- // that we can actually use that much, if necessary.
- ensureStackAddressSpace();
}
#else
static void ensureSufficientStack() {}
diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index bc398fa0731f..a55e06500d9d 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -89,10 +89,15 @@ struct AssemblerInvocation {
/// @{
std::vector<std::string> IncludePaths;
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoInitialTextSection : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned SaveTemporaryLabels : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned GenDwarfForAssembly : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned RelaxELFRelocations : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned Dwarf64 : 1;
unsigned DwarfVersion;
std::string DwarfDebugFlags;
@@ -117,7 +122,9 @@ struct AssemblerInvocation {
FT_Obj ///< Object file output.
};
FileType OutputType;
+ LLVM_PREFERRED_TYPE(bool)
unsigned ShowHelp : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned ShowVersion : 1;
/// @}
@@ -125,19 +132,28 @@ struct AssemblerInvocation {
/// @{
unsigned OutputAsmVariant;
+ LLVM_PREFERRED_TYPE(bool)
unsigned ShowEncoding : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned ShowInst : 1;
/// @}
/// @name Assembler Options
/// @{
+ LLVM_PREFERRED_TYPE(bool)
unsigned RelaxAll : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoExecStack : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned FatalWarnings : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoWarn : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned NoTypeCheck : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned IncrementalLinkerCompatible : 1;
+ LLVM_PREFERRED_TYPE(bool)
unsigned EmbedBitcode : 1;
/// Whether to emit DWARF unwind info.
@@ -145,6 +161,7 @@ struct AssemblerInvocation {
// Whether to emit compact-unwind for non-canonical entries.
// Note: maybe overriden by other constraints.
+ LLVM_PREFERRED_TYPE(bool)
unsigned EmitCompactUnwindNonCanonical : 1;
/// The name of the relocation model to use.
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index e5c0971996e0..4ded92cbe9ae 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -6114,6 +6114,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
return cxstring::createRef("attribute(aligned)");
case CXCursor_ConceptDecl:
return cxstring::createRef("ConceptDecl");
+ case CXCursor_OpenACCComputeConstruct:
+ return cxstring::createRef("OpenACCComputeConstruct");
}
llvm_unreachable("Unhandled CXCursorKind");
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 01b8a23f6eac..454bf7549861 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -870,6 +870,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
case Stmt::OMPParallelGenericLoopDirectiveClass:
K = CXCursor_OMPParallelGenericLoopDirective;
break;
+ case Stmt::OpenACCComputeConstructClass:
+ K = CXCursor_OpenACCComputeConstruct;
+ break;
case Stmt::OMPTargetParallelGenericLoopDirectiveClass:
K = CXCursor_OMPTargetParallelGenericLoopDirective;
break;
diff --git a/clang/tools/libclang/Indexing.cpp b/clang/tools/libclang/Indexing.cpp
index 17d393ef8084..05d88452209f 100644
--- a/clang/tools/libclang/Indexing.cpp
+++ b/clang/tools/libclang/Indexing.cpp
@@ -261,12 +261,13 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
bool isImport = (IncludeTok.is(tok::identifier) &&
IncludeTok.getIdentifierInfo()->getPPKeywordID() == tok::pp_import);
DataConsumer.ppIncludedFile(HashLoc, FileName, File, isImport, IsAngled,
- Imported);
+ ModuleImported);
}
/// MacroDefined - This hook is called whenever a macro definition is seen.
diff --git a/clang/tools/scan-build/man/scan-build.1 b/clang/tools/scan-build/man/scan-build.1
index 29edbca1fc63..e2b37f6062db 100644
--- a/clang/tools/scan-build/man/scan-build.1
+++ b/clang/tools/scan-build/man/scan-build.1
@@ -2,9 +2,9 @@
.\" See https://llvm.org/LICENSE.txt for license information.
.\" SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
.\" $Id$
-.Dd Sep 21, 2023
+.Dd Feb 10, 2024
.Dt SCAN-BUILD 1
-.Os "clang" "18"
+.Os "clang" "19"
.Sh NAME
.Nm scan-build
.Nd Clang static analyzer
diff --git a/clang/unittests/AST/ASTExprTest.cpp b/clang/unittests/AST/ASTExprTest.cpp
index ec75492ccff8..5ec6aea8edba 100644
--- a/clang/unittests/AST/ASTExprTest.cpp
+++ b/clang/unittests/AST/ASTExprTest.cpp
@@ -20,17 +20,37 @@
using namespace clang;
+using clang::ast_matchers::cxxRecordDecl;
+using clang::ast_matchers::hasName;
+using clang::ast_matchers::match;
+using clang::ast_matchers::varDecl;
+using clang::tooling::buildASTFromCode;
+
+static IntegerLiteral *createIntLiteral(ASTContext &Ctx, uint32_t Value) {
+ const int numBits = 32;
+ return IntegerLiteral::Create(Ctx, llvm::APInt(numBits, Value), Ctx.IntTy,
+ {});
+}
+
+const CXXRecordDecl *getCXXRecordDeclNode(ASTUnit *AST,
+ const std::string &Name) {
+ auto Result =
+ match(cxxRecordDecl(hasName(Name)).bind("record"), AST->getASTContext());
+ EXPECT_FALSE(Result.empty());
+ return Result[0].getNodeAs<CXXRecordDecl>("record");
+}
+
+const VarDecl *getVariableNode(ASTUnit *AST, const std::string &Name) {
+ auto Result = match(varDecl(hasName(Name)).bind("var"), AST->getASTContext());
+ EXPECT_EQ(Result.size(), 1u);
+ return Result[0].getNodeAs<VarDecl>("var");
+}
+
TEST(ASTExpr, IgnoreExprCallbackForwarded) {
constexpr char Code[] = "";
auto AST = tooling::buildASTFromCodeWithArgs(Code, /*Args=*/{"-std=c++20"});
ASTContext &Ctx = AST->getASTContext();
- auto createIntLiteral = [&](uint32_t Value) -> IntegerLiteral * {
- const int numBits = 32;
- return IntegerLiteral::Create(Ctx, llvm::APInt(numBits, Value),
- Ctx.UnsignedIntTy, {});
- };
-
struct IgnoreParens {
Expr *operator()(Expr *E) & { return nullptr; }
Expr *operator()(Expr *E) && {
@@ -42,7 +62,7 @@ TEST(ASTExpr, IgnoreExprCallbackForwarded) {
};
{
- auto *IntExpr = createIntLiteral(10);
+ auto *IntExpr = createIntLiteral(Ctx, 10);
ParenExpr *PE =
new (Ctx) ParenExpr(SourceLocation{}, SourceLocation{}, IntExpr);
EXPECT_EQ(IntExpr, IgnoreExprNodes(PE, IgnoreParens{}));
@@ -50,9 +70,41 @@ TEST(ASTExpr, IgnoreExprCallbackForwarded) {
{
IgnoreParens CB{};
- auto *IntExpr = createIntLiteral(10);
+ auto *IntExpr = createIntLiteral(Ctx, 10);
ParenExpr *PE =
new (Ctx) ParenExpr(SourceLocation{}, SourceLocation{}, IntExpr);
EXPECT_EQ(nullptr, IgnoreExprNodes(PE, CB));
}
}
+
+TEST(ASTExpr, InitListIsConstantInitialized) {
+ auto AST = buildASTFromCode(R"cpp(
+ struct Empty {};
+ struct Foo : Empty { int x, y; };
+ int gv;
+ )cpp");
+ ASTContext &Ctx = AST->getASTContext();
+ const CXXRecordDecl *Empty = getCXXRecordDeclNode(AST.get(), "Empty");
+ const CXXRecordDecl *Foo = getCXXRecordDeclNode(AST.get(), "Foo");
+
+ SourceLocation Loc{};
+ InitListExpr *BaseInit = new (Ctx) InitListExpr(Ctx, Loc, {}, Loc);
+ BaseInit->setType(Ctx.getRecordType(Empty));
+ Expr *Exprs[3] = {
+ BaseInit,
+ createIntLiteral(Ctx, 13),
+ createIntLiteral(Ctx, 42),
+ };
+ InitListExpr *FooInit = new (Ctx) InitListExpr(Ctx, Loc, Exprs, Loc);
+ FooInit->setType(Ctx.getRecordType(Foo));
+ EXPECT_TRUE(FooInit->isConstantInitializer(Ctx, false));
+
+ // Replace the last initializer with something non-constant and make sure
+ // this returns false. Previously we had a bug where we didn't count base
+ // initializers, and only iterated over fields.
+ const VarDecl *GV = getVariableNode(AST.get(), "gv");
+ auto *Ref = new (Ctx) DeclRefExpr(Ctx, const_cast<VarDecl *>(GV), false,
+ Ctx.IntTy, VK_LValue, Loc);
+ (void)FooInit->updateInit(Ctx, 2, Ref);
+ EXPECT_FALSE(FooInit->isConstantInitializer(Ctx, false));
+}
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 8bbb04024dcc..4b3b3511f848 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2093,7 +2093,7 @@ TEST(TransferTest, TemporaryObject) {
TEST(TransferTest, ElidableConstructor) {
// This test is effectively the same as TransferTest.TemporaryObject, but
- // the code is compiled as C++ 14.
+ // the code is compiled as C++14.
std::string Code = R"(
struct A {
int Bar;
@@ -2313,6 +2313,42 @@ TEST(TransferTest, AssignmentOperatorWithInitAndInheritance) {
ASTContext &ASTCtx) {});
}
+TEST(TransferTest, AssignmentOperatorReturnsVoid) {
+ // This is a crash repro.
+ std::string Code = R"(
+ struct S {
+ void operator=(S&& other);
+ };
+ void target() {
+ S s;
+ s = S();
+ // [[p]]
+ }
+ )";
+ runDataflow(
+ Code,
+ [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+ ASTContext &ASTCtx) {});
+}
+
+TEST(TransferTest, AssignmentOperatorReturnsByValue) {
+ // This is a crash repro.
+ std::string Code = R"(
+ struct S {
+ S operator=(S&& other);
+ };
+ void target() {
+ S s;
+ s = S();
+ // [[p]]
+ }
+ )";
+ runDataflow(
+ Code,
+ [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+ ASTContext &ASTCtx) {});
+}
+
TEST(TransferTest, CopyConstructor) {
std::string Code = R"(
struct A {
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index 73fb4063d92b..b6e4973fd7cb 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -770,12 +770,17 @@ constexpr bool operator!=(const optional<T> &lhs, const optional<U> &rhs);
template <typename T>
constexpr bool operator==(const optional<T> &opt, nullopt_t);
+
+// C++20 and later do not define the following overloads because they are
+// provided by rewritten candidates instead.
+#if __cplusplus < 202002L
template <typename T>
constexpr bool operator==(nullopt_t, const optional<T> &opt);
template <typename T>
constexpr bool operator!=(const optional<T> &opt, nullopt_t);
template <typename T>
constexpr bool operator!=(nullopt_t, const optional<T> &opt);
+#endif // __cplusplus < 202002L
template <typename T, typename U>
constexpr bool operator==(const optional<T> &opt, const U &value);
@@ -1289,6 +1294,15 @@ protected:
template <typename FuncDeclMatcher>
void ExpectDiagnosticsFor(std::string SourceCode,
FuncDeclMatcher FuncMatcher) {
+ // Run in C++17 and C++20 mode to cover differences in the AST between modes
+ // (e.g. C++20 can contain `CXXRewrittenBinaryOperator`).
+ for (const char *CxxMode : {"-std=c++17", "-std=c++20"})
+ ExpectDiagnosticsFor(SourceCode, FuncMatcher, CxxMode);
+ }
+
+ template <typename FuncDeclMatcher>
+ void ExpectDiagnosticsFor(std::string SourceCode, FuncDeclMatcher FuncMatcher,
+ const char *CxxMode) {
ReplaceAllOccurrences(SourceCode, "$ns", GetParam().NamespaceName);
ReplaceAllOccurrences(SourceCode, "$optional", GetParam().TypeName);
@@ -1332,7 +1346,7 @@ protected:
llvm::move(EltDiagnostics, std::back_inserter(Diagnostics));
})
.withASTBuildArgs(
- {"-fsyntax-only", "-std=c++17", "-Wno-undefined-inline"})
+ {"-fsyntax-only", CxxMode, "-Wno-undefined-inline"})
.withASTBuildVirtualMappedFiles(
tooling::FileContentMappings(Headers.begin(), Headers.end())),
/*VerifyResults=*/[&Diagnostics](
diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp
index 557281499998..45840f5188cd 100644
--- a/clang/unittests/Basic/SourceManagerTest.cpp
+++ b/clang/unittests/Basic/SourceManagerTest.cpp
@@ -530,6 +530,7 @@ struct MacroAction {
SourceLocation Loc;
std::string Name;
+ LLVM_PREFERRED_TYPE(Kind)
unsigned MAKind : 3;
MacroAction(SourceLocation Loc, StringRef Name, unsigned K)
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 7493b0a4450f..ee8a55680753 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -678,6 +678,22 @@ TEST(ConfigParseTest, ParsesConfiguration) {
BraceWrapping.AfterControlStatement, FormatStyle::BWACS_Never);
Style.AlwaysBreakAfterReturnType = FormatStyle::RTBS_All;
+ CHECK_PARSE("BreakAfterReturnType: None", AlwaysBreakAfterReturnType,
+ FormatStyle::RTBS_None);
+ CHECK_PARSE("BreakAfterReturnType: Automatic", AlwaysBreakAfterReturnType,
+ FormatStyle::RTBS_Automatic);
+ CHECK_PARSE("BreakAfterReturnType: ExceptShortType",
+ AlwaysBreakAfterReturnType, FormatStyle::RTBS_ExceptShortType);
+ CHECK_PARSE("BreakAfterReturnType: All", AlwaysBreakAfterReturnType,
+ FormatStyle::RTBS_All);
+ CHECK_PARSE("BreakAfterReturnType: TopLevel", AlwaysBreakAfterReturnType,
+ FormatStyle::RTBS_TopLevel);
+ CHECK_PARSE("BreakAfterReturnType: AllDefinitions",
+ AlwaysBreakAfterReturnType, FormatStyle::RTBS_AllDefinitions);
+ CHECK_PARSE("BreakAfterReturnType: TopLevelDefinitions",
+ AlwaysBreakAfterReturnType,
+ FormatStyle::RTBS_TopLevelDefinitions);
+ // For backward compatibility:
CHECK_PARSE("AlwaysBreakAfterReturnType: None", AlwaysBreakAfterReturnType,
FormatStyle::RTBS_None);
CHECK_PARSE("AlwaysBreakAfterReturnType: Automatic",
@@ -694,19 +710,32 @@ TEST(ConfigParseTest, ParsesConfiguration) {
AlwaysBreakAfterReturnType,
FormatStyle::RTBS_TopLevelDefinitions);
- Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes;
+ Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
+ CHECK_PARSE("BreakTemplateDeclarations: Leave", BreakTemplateDeclarations,
+ FormatStyle::BTDS_Leave);
+ CHECK_PARSE("BreakTemplateDeclarations: No", BreakTemplateDeclarations,
+ FormatStyle::BTDS_No);
+ CHECK_PARSE("BreakTemplateDeclarations: MultiLine", BreakTemplateDeclarations,
+ FormatStyle::BTDS_MultiLine);
+ CHECK_PARSE("BreakTemplateDeclarations: Yes", BreakTemplateDeclarations,
+ FormatStyle::BTDS_Yes);
+ CHECK_PARSE("BreakTemplateDeclarations: false", BreakTemplateDeclarations,
+ FormatStyle::BTDS_MultiLine);
+ CHECK_PARSE("BreakTemplateDeclarations: true", BreakTemplateDeclarations,
+ FormatStyle::BTDS_Yes);
+ // For backward compatibility:
CHECK_PARSE("AlwaysBreakTemplateDeclarations: Leave",
- AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Leave);
- CHECK_PARSE("AlwaysBreakTemplateDeclarations: No",
- AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_No);
+ BreakTemplateDeclarations, FormatStyle::BTDS_Leave);
+ CHECK_PARSE("AlwaysBreakTemplateDeclarations: No", BreakTemplateDeclarations,
+ FormatStyle::BTDS_No);
CHECK_PARSE("AlwaysBreakTemplateDeclarations: MultiLine",
- AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine);
- CHECK_PARSE("AlwaysBreakTemplateDeclarations: Yes",
- AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes);
+ BreakTemplateDeclarations, FormatStyle::BTDS_MultiLine);
+ CHECK_PARSE("AlwaysBreakTemplateDeclarations: Yes", BreakTemplateDeclarations,
+ FormatStyle::BTDS_Yes);
CHECK_PARSE("AlwaysBreakTemplateDeclarations: false",
- AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_MultiLine);
+ BreakTemplateDeclarations, FormatStyle::BTDS_MultiLine);
CHECK_PARSE("AlwaysBreakTemplateDeclarations: true",
- AlwaysBreakTemplateDeclarations, FormatStyle::BTDS_Yes);
+ BreakTemplateDeclarations, FormatStyle::BTDS_Yes);
Style.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_All;
CHECK_PARSE("AlwaysBreakAfterDefinitionReturnType: None",
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index b1a2247bb85d..13937a15fdae 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -10638,7 +10638,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) {
" const typename aaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaa);");
FormatStyle AlwaysBreak = getLLVMStyle();
- AlwaysBreak.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes;
+ AlwaysBreak.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
verifyFormat("template <typename T>\nclass C {};", AlwaysBreak);
verifyFormat("template <typename T>\nvoid f();", AlwaysBreak);
verifyFormat("template <typename T>\nvoid f() {}", AlwaysBreak);
@@ -10667,7 +10667,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) {
"};");
FormatStyle NeverBreak = getLLVMStyle();
- NeverBreak.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_No;
+ NeverBreak.BreakTemplateDeclarations = FormatStyle::BTDS_No;
verifyFormat("template <typename T> class C {};", NeverBreak);
verifyFormat("template <typename T> void f();", NeverBreak);
verifyFormat("template <typename T> void f() {}", NeverBreak);
@@ -10699,7 +10699,7 @@ TEST_F(FormatTest, WrapsTemplateDeclarations) {
NeverBreak);
auto Style = getLLVMStyle();
- Style.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Leave;
+ Style.BreakTemplateDeclarations = FormatStyle::BTDS_Leave;
verifyNoChange("template <typename T>\n"
"class C {};",
@@ -11297,7 +11297,7 @@ TEST_F(FormatTest, UnderstandsFunctionRefQualification) {
verifyFormat("SomeType MemberFunction( const Deleted & ) &;", Spaces);
FormatStyle BreakTemplate = getLLVMStyle();
- BreakTemplate.AlwaysBreakTemplateDeclarations = FormatStyle::BTDS_Yes;
+ BreakTemplate.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
verifyFormat("struct f {\n"
" template <class T>\n"
@@ -11330,8 +11330,7 @@ TEST_F(FormatTest, UnderstandsFunctionRefQualification) {
BreakTemplate);
FormatStyle AlignLeftBreakTemplate = getLLVMStyle();
- AlignLeftBreakTemplate.AlwaysBreakTemplateDeclarations =
- FormatStyle::BTDS_Yes;
+ AlignLeftBreakTemplate.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
AlignLeftBreakTemplate.PointerAlignment = FormatStyle::PAS_Left;
verifyFormat("struct f {\n"
@@ -26973,6 +26972,7 @@ TEST_F(FormatTest, RemoveParentheses) {
EXPECT_EQ(Style.RemoveParentheses, FormatStyle::RPS_Leave);
Style.RemoveParentheses = FormatStyle::RPS_MultipleParentheses;
+ verifyFormat("#define Foo(...) foo((__VA_ARGS__))", Style);
verifyFormat("int x __attribute__((aligned(16))) = 0;", Style);
verifyFormat("decltype((foo->bar)) baz;", Style);
verifyFormat("class __declspec(dllimport) X {};",
@@ -27007,6 +27007,7 @@ TEST_F(FormatTest, RemoveParentheses) {
verifyFormat("return (({ 0; }));", "return ((({ 0; })));", Style);
Style.RemoveParentheses = FormatStyle::RPS_ReturnStatement;
+ verifyFormat("#define Return0 return (0);", Style);
verifyFormat("return 0;", "return (0);", Style);
verifyFormat("co_return 0;", "co_return ((0));", Style);
verifyFormat("return 0;", "return (((0)));", Style);
diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp
index 324366ca7f5e..0aa755acfc82 100644
--- a/clang/unittests/Format/QualifierFixerTest.cpp
+++ b/clang/unittests/Format/QualifierFixerTest.cpp
@@ -1055,6 +1055,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) {
auto Tokens = annotate(
"const static inline auto restrict int double long constexpr friend");
+ ASSERT_EQ(Tokens.size(), 11u) << Tokens;
EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType(
Tokens[0], ConfiguredTokens));
@@ -1089,6 +1090,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) {
EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[9]));
auto NotTokens = annotate("for while do Foo Bar ");
+ ASSERT_EQ(NotTokens.size(), 6u) << Tokens;
EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType(
NotTokens[0], ConfiguredTokens));
@@ -1120,6 +1122,7 @@ TEST_F(QualifierFixerTest, IsQualifierType) {
TEST_F(QualifierFixerTest, IsMacro) {
auto Tokens = annotate("INT INTPR Foo int");
+ ASSERT_EQ(Tokens.size(), 5u) << Tokens;
EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isPossibleMacro(Tokens[0]));
EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isPossibleMacro(Tokens[1]));
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 52a00c8a1a35..3b36e4072281 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2287,6 +2287,51 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_TableGenBangOperator);
Tokens = Annotate("!cond");
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_TableGenCondOperator);
+
+ auto AnnotateValue = [this, &Style](llvm::StringRef Code) {
+ // Values are annotated only in specific context.
+ auto Result = annotate(("def X { let V = " + Code + "; }").str(), Style);
+ return decltype(Result){Result.begin() + 6, Result.end() - 3};
+ };
+ // Both of bang/cond operators.
+ Tokens = AnnotateValue("!cond(!eq(x, 0): 1, true: x)");
+ ASSERT_EQ(Tokens.size(), 15u) << Tokens;
+ EXPECT_TOKEN(Tokens[0], tok::identifier, TT_TableGenCondOperator);
+ EXPECT_TOKEN(Tokens[2], tok::identifier, TT_TableGenBangOperator);
+ EXPECT_TOKEN(Tokens[8], tok::colon, TT_TableGenCondOperatorColon);
+ EXPECT_TOKEN(Tokens[10], tok::comma, TT_TableGenCondOperatorComma);
+ EXPECT_TOKEN(Tokens[12], tok::colon, TT_TableGenCondOperatorColon);
+ // DAGArg values with operator identifier
+ Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+ ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+ EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpener);
+ EXPECT_TOKEN(Tokens[3], tok::colon, TT_TableGenDAGArgListColon);
+ EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown); // $src1
+ EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListComma);
+ EXPECT_TOKEN(Tokens[7], tok::colon, TT_TableGenDAGArgListColon);
+ EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+ // List literal
+ Tokens = AnnotateValue("[1, 2, 3]");
+ ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+ EXPECT_TOKEN(Tokens[0], tok::l_square, TT_TableGenListOpener);
+ EXPECT_TOKEN(Tokens[6], tok::r_square, TT_TableGenListCloser);
+ // Suffixes of values
+ Tokens = AnnotateValue("valid.field");
+ ASSERT_EQ(Tokens.size(), 3u) << Tokens;
+ EXPECT_TOKEN(Tokens[1], tok::period, TT_TableGenValueSuffix);
+ // Code
+ Tokens = AnnotateValue("[{ code is multiline string }]");
+ ASSERT_EQ(Tokens.size(), 1u) << Tokens;
+ EXPECT_TOKEN(Tokens[0], tok::string_literal, TT_TableGenMultiLineString);
+
+ // The definition
+ Tokens = annotate("def Def : Parent<Child> {}", Style);
+ ASSERT_EQ(Tokens.size(), 10u) << Tokens; // This contains eof.
+ // We use inheritance colon and function brace. They are enough.
+ EXPECT_TOKEN(Tokens[2], tok::colon, TT_InheritanceColon);
+ EXPECT_TOKEN(Tokens[4], tok::less, TT_TemplateOpener);
+ EXPECT_TOKEN(Tokens[6], tok::greater, TT_TemplateCloser);
+ EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_FunctionLBrace);
}
TEST_F(TokenAnnotatorTest, UnderstandConstructors) {
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index bc4eee73c1c2..59fef9ecbb9c 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -583,7 +583,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, UnderscorePragma) {
R"(_Pragma(u"clang module import"))", Out));
EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
- // FIXME: R"()" strings depend on using C++ 11 language mode
+ // FIXME: R"()" strings depend on using C++11 language mode
ASSERT_FALSE(minimizeSourceToDependencyDirectives(
R"(_Pragma(R"abc(clang module import)abc"))", Out));
EXPECT_STREQ("<TokBeforeEOF>\n", Out.data());
diff --git a/clang/unittests/Lex/PPCallbacksTest.cpp b/clang/unittests/Lex/PPCallbacksTest.cpp
index e0a27b511182..f3cdb1dfb287 100644
--- a/clang/unittests/Lex/PPCallbacksTest.cpp
+++ b/clang/unittests/Lex/PPCallbacksTest.cpp
@@ -37,7 +37,8 @@ public:
StringRef FileName, bool IsAngled,
CharSourceRange FilenameRange,
OptionalFileEntryRef File, StringRef SearchPath,
- StringRef RelativePath, const Module *Imported,
+ StringRef RelativePath, const Module *SuggestedModule,
+ bool ModuleImported,
SrcMgr::CharacteristicKind FileType) override {
this->HashLoc = HashLoc;
this->IncludeTok = IncludeTok;
@@ -47,7 +48,8 @@ public:
this->File = File;
this->SearchPath = SearchPath.str();
this->RelativePath = RelativePath.str();
- this->Imported = Imported;
+ this->SuggestedModule = SuggestedModule;
+ this->ModuleImported = ModuleImported;
this->FileType = FileType;
}
@@ -59,7 +61,8 @@ public:
OptionalFileEntryRef File;
SmallString<16> SearchPath;
SmallString<16> RelativePath;
- const Module* Imported;
+ const Module *SuggestedModule;
+ bool ModuleImported;
SrcMgr::CharacteristicKind FileType;
};
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index dc10fa14c595..48f55b8af97e 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -219,7 +219,7 @@ void EmitBuiltinDef(llvm::raw_ostream &OS, StringRef Substitution,
break;
}
case BuiltinType::TargetBuiltin:
- OS << ", \"\"";
+ OS << ", \"" << Builtin->getValueAsString("Features") << "\"";
break;
case BuiltinType::AtomicBuiltin:
case BuiltinType::Builtin:
diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp
index 9f6ed39f0130..8513174c88bf 100644
--- a/clang/utils/TableGen/RISCVVEmitter.cpp
+++ b/clang/utils/TableGen/RISCVVEmitter.cpp
@@ -67,7 +67,9 @@ struct SemaRecord {
bool HasMaskPolicy : 1;
bool HasFRMRoundModeOp : 1;
bool IsTuple : 1;
+ LLVM_PREFERRED_TYPE(PolicyScheme)
uint8_t UnMaskedPolicyScheme : 2;
+ LLVM_PREFERRED_TYPE(PolicyScheme)
uint8_t MaskedPolicyScheme : 2;
};
diff --git a/clang/www/analyzer/alpha_checks.html b/clang/www/analyzer/alpha_checks.html
index 11ef7d405dd4..7bbe4a20288f 100644
--- a/clang/www/analyzer/alpha_checks.html
+++ b/clang/www/analyzer/alpha_checks.html
@@ -87,29 +87,6 @@ void test() {
</pre></div></div></td></tr>
-<tr><td><a id="alpha.core.CallAndMessageUnInitRefArg"><div class="namedescr expandable"><span class="name">
-alpha.core.CallAndMessageUnInitRefArg</span><span class="lang">
-(C, C++)</span><div class="descr">
-Check for uninitialized arguments in function calls and Objective-C
-message expressions.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(void) {
- int t;
- int &p = t;
- int &s = p;
- int &q = s;
- foo(q); // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test(void) {
- int x;
- foo(&x); // warn
-}
-</pre></div></div></td></tr>
-
-
<tr><td><a id="alpha.core.CastSize"><div class="namedescr expandable"><span class="name">
alpha.core.CastSize</span><span class="lang">
(C)</span><div class="descr">
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 3e13a4d89ef9..1e527e1d1736 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -782,7 +782,7 @@
<td><a href="https://cplusplus.github.io/CWG/issues/124.html">124</a></td>
<td>CD1</td>
<td>Lifetime of temporaries in default initialization of class arrays</td>
- <td class="unknown" align="center">Duplicate of <a href="#201">201</a></td>
+ <td class="full" align="center">Clang 2.7</td>
</tr>
<tr id="125">
<td><a href="https://cplusplus.github.io/CWG/issues/125.html">125</a></td>
@@ -1148,7 +1148,7 @@
<td><a href="https://cplusplus.github.io/CWG/issues/185.html">185</a></td>
<td>TC1</td>
<td>"Named" temporaries and copy elision</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 2.7</td>
</tr>
<tr class="open" id="186">
<td><a href="https://cplusplus.github.io/CWG/issues/186.html">186</a></td>
@@ -1196,7 +1196,7 @@
<td><a href="https://cplusplus.github.io/CWG/issues/193.html">193</a></td>
<td>TC1</td>
<td>Order of destruction of local automatics of destructor</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 2.7</td>
</tr>
<tr id="194">
<td><a href="https://cplusplus.github.io/CWG/issues/194.html">194</a></td>
@@ -1232,7 +1232,7 @@
<td><a href="https://cplusplus.github.io/CWG/issues/199.html">199</a></td>
<td>CD1</td>
<td>Order of destruction of temporaries</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 2.8</td>
</tr>
<tr id="200">
<td><a href="https://cplusplus.github.io/CWG/issues/200.html">200</a></td>
@@ -1244,7 +1244,7 @@
<td><a href="https://cplusplus.github.io/CWG/issues/201.html">201</a></td>
<td>CD1</td>
<td>Order of destruction of temporaries in initializers</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 2.8</td>
</tr>
<tr id="202">
<td><a href="https://cplusplus.github.io/CWG/issues/202.html">202</a></td>
@@ -1299,7 +1299,7 @@ accessible?</td>
<td><a href="https://cplusplus.github.io/CWG/issues/210.html">210</a></td>
<td>TC1</td>
<td>What is the type matched by an exception handler?</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 2.7</td>
</tr>
<tr id="211">
<td><a href="https://cplusplus.github.io/CWG/issues/211.html">211</a></td>
@@ -1792,7 +1792,7 @@ of class templates</td>
<td><a href="https://cplusplus.github.io/CWG/issues/292.html">292</a></td>
<td>CD3</td>
<td>Deallocation on exception in <TT>new</TT> before arguments evaluated</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 2.9</td>
</tr>
<tr class="open" id="293">
<td><a href="https://cplusplus.github.io/CWG/issues/293.html">293</a></td>
@@ -6708,7 +6708,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/1150.html">1150</a></td>
<td>NAD</td>
<td>Inheriting constructors have not been implemented</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="na" align="center">N/A</td>
</tr>
<tr id="1151">
<td><a href="https://cplusplus.github.io/CWG/issues/1151.html">1151</a></td>
@@ -8730,7 +8730,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/1487.html">1487</a></td>
<td>CD3</td>
<td>When are inheriting constructors declared?</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 3.3</td>
</tr>
<tr class="open" id="1488">
<td><a href="https://cplusplus.github.io/CWG/issues/1488.html">1488</a></td>
@@ -9210,7 +9210,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/1567.html">1567</a></td>
<td>C++14</td>
<td>Inheriting constructors and copy/move constructors</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 3.3</td>
</tr>
<tr id="1568">
<td><a href="https://cplusplus.github.io/CWG/issues/1568.html">1568</a></td>
@@ -10236,7 +10236,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/1738.html">1738</a></td>
<td>C++14</td>
<td>Explicit instantiation/specialization of inheriting constructor templates</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="na" align="center">Superseded by <a href="https://wg21.link/P0136R1">P0136R1</a></td>
</tr>
<tr id="1739">
<td><a href="https://cplusplus.github.io/CWG/issues/1739.html">1739</a></td>
@@ -13446,7 +13446,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/2273.html">2273</a></td>
<td>CD5</td>
<td>Inheriting constructors vs implicit default constructor</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 3.3</td>
</tr>
<tr id="2274">
<td><a href="https://cplusplus.github.io/CWG/issues/2274.html">2274</a></td>
@@ -13470,7 +13470,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/2277.html">2277</a></td>
<td>CD5</td>
<td>Ambiguity inheriting constructors with default arguments</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="partial" align="center">Partial</td>
</tr>
<tr id="2278">
<td><a href="https://cplusplus.github.io/CWG/issues/2278.html">2278</a></td>
@@ -13944,7 +13944,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/2356.html">2356</a></td>
<td>CD5</td>
<td>Base class copy and move constructors should not be inherited</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 4</td>
</tr>
<tr id="2357">
<td><a href="https://cplusplus.github.io/CWG/issues/2357.html">2357</a></td>
@@ -14832,7 +14832,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/2504.html">2504</a></td>
<td>DR</td>
<td>Inheriting constructors from virtual base classes</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="none" align="center">No</td>
</tr>
<tr class="open" id="2505">
<td><a href="https://cplusplus.github.io/CWG/issues/2505.html">2505</a></td>
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index 416777171d2c..423171532c20 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -20,6 +20,7 @@ set(VE ve)
if(APPLE)
set(ARM64 arm64)
set(ARM32 armv7 armv7s armv7k)
+ set(ARM64_32 arm64_32)
set(X86_64 x86_64 x86_64h)
endif()
@@ -29,7 +30,7 @@ set(ALL_SANITIZER_COMMON_SUPPORTED_ARCH ${X86} ${X86_64} ${PPC64} ${RISCV64}
set(ALL_ASAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
${MIPS32} ${MIPS64} ${PPC64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
${LOONGARCH64})
-set(ALL_ASAN_ABI_SUPPORTED_ARCH ${X86_64} ${ARM64})
+set(ALL_ASAN_ABI_SUPPORTED_ARCH ${X86_64} ${ARM64} ${ARM64_32})
set(ALL_DFSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${LOONGARCH64})
if(ANDROID)
diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp
index c3d260d1a61f..d3398ff004e2 100644
--- a/compiler-rt/lib/hwasan/hwasan_report.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -40,7 +40,7 @@ class ScopedReport {
public:
explicit ScopedReport(bool fatal) : fatal(fatal) {
Lock lock(&error_message_lock_);
- error_message_ptr_ = fatal ? &error_message_ : nullptr;
+ error_message_ptr_ = &error_message_;
++hwasan_report_count;
}
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 867ae73f0d3b..f3b457d786e6 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -677,6 +677,7 @@ static void initializeProfileForContinuousMode(void) {
PROF_ERR("Continuous counter sync mode is enabled, but raw profile is not"
"page-aligned. CurrentFileOffset = %" PRIu64 ", pagesz = %u.\n",
(uint64_t)CurrentFileOffset, PageSize);
+ fclose(File);
return;
}
if (writeProfileWithFileObject(Filename, File) != 0) {
@@ -692,6 +693,8 @@ static void initializeProfileForContinuousMode(void) {
if (doMerging()) {
lprofUnlockFileHandle(File);
+ }
+ if (File != NULL) {
fclose(File);
}
}
diff --git a/flang/docs/FortranLLVMTestSuite.md b/flang/docs/FortranLLVMTestSuite.md
index 62459e6a7b70..45485ef40106 100644
--- a/flang/docs/FortranLLVMTestSuite.md
+++ b/flang/docs/FortranLLVMTestSuite.md
@@ -12,12 +12,6 @@ first-time users read through [LLVM Test Suite
Guide](https://llvm.org/docs/TestSuiteGuide.html) which describes the
organizational structure of the test suite and how to run it.
-Although the Flang driver is unable to generate code at this time, we
-are neverthelesss incrementally adding Fortran tests into the LLVM
-Test Suite. We are currently testing against GFortran while we make
-progress towards completing the new Flang driver with full
-code-generation capabilities.
-
## Running the LLVM test-suite with Fortran
Fortran support can be enabled by setting the following CMake variables:
@@ -63,3 +57,12 @@ cmake -G "Ninja" -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ \
-DTEST_SUITE_FORTRAN:STRING=ON \
-DTEST_SUITE_SPEC2017_ROOT=<path to SPEC directory> ..
```
+
+## Running the gfortran tests
+
+Tests from the gfortran test suite have been imported into the LLVM Test Suite.
+The tests will be run automatically if the test suite is built following the
+instructions described [above](#running-the-llvm-test-suite-with-fortran).
+There are additional configure-time options that can be used with the gfortran
+tests. More details about those options and their purpose can be found in
+[`Fortran/gfortran/README.md`](https://github.com/llvm/llvm-test-suite/tree/main/Fortran/gfortran/README.md).
diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md
index 09f10fac7cf1..c98079a0e928 100644
--- a/flang/docs/GettingInvolved.md
+++ b/flang/docs/GettingInvolved.md
@@ -44,17 +44,21 @@ To understand the status of various developments in Flang please join the respec
### Flang Community Biweekly Call
-- General updates on the Flang Project, both LLVM Flang and current Flang.
+- General updates on the Flang project.
- Join [Flang Community Biweekly Call](https://lanl-us.webex.com/lanl-us/j.php?MTID=mdce13c9bd55202e8071d8128fb953614)
-- Time: On Wednesdays 8:30 Pacific Time, on the weeks alternating with regular Flang Community Technical Biweekly Call.
+ - Meeting number: 177 400 7047
+ - Password: 6PCdCYKmN43
+- Time: Wednesdays, 8:30 a.m. Pacific Time, on the weeks alternating with regular Flang Community Technical Biweekly Call.
- Meeting minutes are available in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/edit).
- Minutes from older meetings were posted on the [Flang forum](https://discourse.llvm.org/c/subprojects/flang). Search for `Flang Biweekly Sync - Notes`.
### Flang Community Technical Biweekly Call
- Technical topics call.
-- Join [Flang Community Technical Biweekly Call](https://bluejeans.com/284530575)
-- Time: On Mondays 8:30 Pacific Time, on the weeks alternating with regular Flang Community Biweekly Call.
+- Join [Flang Community Technical Biweekly Call](https://teams.microsoft.com/l/meetup-join/19%3ameeting_YWU1NzU4ZjQtOTljOS00NWU1LTg5NjktYTUzOTU3MGEwMzAx%40thread.v2/0?context=%7b%22Tid%22%3a%22f34e5979-57d9-4aaa-ad4d-b122a662184d%22%2c%22Oid%22%3a%223641875c-ef5b-4767-8105-0787a195852f%22%7d)
+ - Meeting ID: 330 223 778 657
+ - Passcode: 7GjaJ3
+- Time: Mondays, 8:30 a.m. Pacific Time, on the weeks alternating with regular Flang Community Biweekly Call.
- The agenda is in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/).
### LLVM Alias Analysis Technical Call
diff --git a/flang/docs/index.md b/flang/docs/index.md
index ff8f4a22d5eb..d974a3628b01 100644
--- a/flang/docs/index.md
+++ b/flang/docs/index.md
@@ -68,6 +68,7 @@ on how to get in touch with us and to learn more about the current status.
OpenACC
OpenACC-descriptor-management.md
OpenMP-4.5-grammar.md
+ OpenMP-descriptor-management
OpenMP-semantics
OptionComparison
Overview
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index fd4af157f793..04a0d71e1ade 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -229,6 +229,7 @@ struct DummyDataObject {
static std::optional<DummyDataObject> Characterize(
const semantics::Symbol &, FoldingContext &);
bool CanBePassedViaImplicitInterface(std::string *whyNot = nullptr) const;
+ bool IsPassedByDescriptor(bool isBindC) const;
llvm::raw_ostream &Dump(llvm::raw_ostream &) const;
TypeAndShape type;
diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index 0ff3ca9bdeac..b13bb412f0f3 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -137,6 +137,12 @@ translateSymbolAttributes(mlir::MLIRContext *mlirContext,
fir::FortranVariableFlagsEnum extraFlags =
fir::FortranVariableFlagsEnum::None);
+/// Translate the CUDA Fortran attributes of \p sym into the FIR CUDA attribute
+/// representation.
+fir::CUDADataAttributeAttr
+translateSymbolCUDADataAttribute(mlir::MLIRContext *mlirContext,
+ const Fortran::semantics::Symbol &sym);
+
/// Map a symbol to a given fir::ExtendedValue. This will generate an
/// hlfir.declare when lowering to HLFIR and map the hlfir.declare result to the
/// symbol.
diff --git a/flang/include/flang/Lower/OpenMP.h b/flang/include/flang/Lower/OpenMP.h
index d2b7a423d778..e6fe7fb2276d 100644
--- a/flang/include/flang/Lower/OpenMP.h
+++ b/flang/include/flang/Lower/OpenMP.h
@@ -72,6 +72,7 @@ int64_t getCollapseValue(const Fortran::parser::OmpClauseList &clauseList);
void genThreadprivateOp(AbstractConverter &, const pft::Variable &);
void genDeclareTargetIntGlobal(AbstractConverter &, const pft::Variable &);
void genOpenMPReduction(AbstractConverter &,
+ Fortran::semantics::SemanticsContext &,
const Fortran::parser::OmpClauseList &clauseList);
mlir::Operation *findReductionChain(mlir::Value, mlir::Value * = nullptr);
@@ -82,6 +83,7 @@ void removeStoreOp(mlir::Operation *, mlir::Value);
bool isOpenMPTargetConstruct(const parser::OpenMPConstruct &);
bool isOpenMPDeviceDeclareTarget(Fortran::lower::AbstractConverter &,
+ Fortran::semantics::SemanticsContext &,
Fortran::lower::pft::Evaluation &,
const parser::OpenMPDeclarativeConstruct &);
void genOpenMPRequires(mlir::Operation *, const Fortran::semantics::Symbol *);
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 5384f6e8121e..39821f1036c6 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -230,12 +230,14 @@ public:
llvm::StringRef name,
mlir::StringAttr linkage = {},
mlir::Attribute value = {}, bool isConst = false,
- bool isTarget = false);
+ bool isTarget = false,
+ fir::CUDADataAttributeAttr cudaAttr = {});
fir::GlobalOp createGlobal(mlir::Location loc, mlir::Type type,
llvm::StringRef name, bool isConst, bool isTarget,
std::function<void(FirOpBuilder &)> bodyBuilder,
- mlir::StringAttr linkage = {});
+ mlir::StringAttr linkage = {},
+ fir::CUDADataAttributeAttr cudaAttr = {});
/// Create a global constant (read-only) value.
fir::GlobalOp createGlobalConstant(mlir::Location loc, mlir::Type type,
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index efbd57c77de5..170e134baef6 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -233,11 +233,11 @@ translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
fir::FortranVariableOpInterface fortranVariable);
/// Generate declaration for a fir::ExtendedValue in memory.
-fir::FortranVariableOpInterface genDeclare(mlir::Location loc,
- fir::FirOpBuilder &builder,
- const fir::ExtendedValue &exv,
- llvm::StringRef name,
- fir::FortranVariableFlagsAttr flags);
+fir::FortranVariableOpInterface
+genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
+ const fir::ExtendedValue &exv, llvm::StringRef name,
+ fir::FortranVariableFlagsAttr flags,
+ fir::CUDADataAttributeAttr cudaAttr = {});
/// Generate an hlfir.associate to build a variable from an expression value.
/// The type of the variable must be provided so that scalar logicals are
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index 114bf7d1df91..66d6cd471116 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -55,7 +55,7 @@ def fir_FortranVariableFlagsAttr : fir_Attr<"FortranVariableFlags"> {
let returnType = "::fir::FortranVariableFlagsEnum";
let convertFromStorage = "$_self.getFlags()";
let constBuilderCall =
- "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)";
+ "::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)";
}
def fir_BoxFieldAttr : I32EnumAttr<
@@ -67,8 +67,74 @@ def fir_BoxFieldAttr : I32EnumAttr<
let cppNamespace = "fir";
}
-
// mlir::SideEffects::Resource for modelling operations which add debugging information
def DebuggingResource : Resource<"::fir::DebuggingResource">;
+//===----------------------------------------------------------------------===//
+// CUDA Fortran specific attributes
+//===----------------------------------------------------------------------===//
+
+def fir_CUDADataAttribute : I32EnumAttr<
+ "CUDADataAttribute",
+ "CUDA Fortran variable attributes",
+ [
+ I32EnumAttrCase<"Constant", 0, "constant">,
+ I32EnumAttrCase<"Device", 1, "device">,
+ I32EnumAttrCase<"Managed", 2, "managed">,
+ I32EnumAttrCase<"Pinned", 3, "pinned">,
+ I32EnumAttrCase<"Shared", 4, "shared">,
+ I32EnumAttrCase<"Unified", 5, "unified">,
+ // Texture is omitted since it is obsolete and rejected by semantic.
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::fir";
+}
+
+def fir_CUDADataAttributeAttr :
+ EnumAttr<fir_Dialect, fir_CUDADataAttribute, "cuda"> {
+ let assemblyFormat = [{ ```<` $value `>` }];
+}
+
+def fir_CUDAProcAttribute : I32EnumAttr<
+ "CUDAProcAttribute", "CUDA Fortran procedure attributes",
+ [
+ I32EnumAttrCase<"Host", 0, "host">,
+ I32EnumAttrCase<"Device", 1, "device">,
+ I32EnumAttrCase<"HostDevice", 2, "host_device">,
+ I32EnumAttrCase<"Global", 3, "global">,
+ I32EnumAttrCase<"GridGlobal", 4, "grid_global">,
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::fir";
+}
+
+def fir_CUDAProcAttributeAttr :
+ EnumAttr<fir_Dialect, fir_CUDAProcAttribute, "cuda_proc"> {
+ let assemblyFormat = [{ ```<` $value `>` }];
+}
+
+def fir_CUDALaunchBoundsAttr : fir_Attr<"CUDALaunchBounds"> {
+ let mnemonic = "launch_bounds";
+
+ let parameters = (ins
+ "mlir::IntegerAttr":$maxTPB,
+ "mlir::IntegerAttr":$minBPM,
+ OptionalParameter<"mlir::IntegerAttr">:$upperBoundClusterSize
+ );
+
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def fir_CUDAClusterDimsAttr : fir_Attr<"CUDAClusterDims"> {
+ let mnemonic = "cluster_dims";
+
+ let parameters = (ins
+ "mlir::IntegerAttr":$x,
+ "mlir::IntegerAttr":$y,
+ "mlir::IntegerAttr":$z
+ );
+
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
#endif // FIR_DIALECT_FIR_ATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index fcecc605dfa5..29bd100164b7 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2433,7 +2433,6 @@ def fir_DispatchOp : fir_Op<"dispatch", []> {
// operand[0] is the object (of class type)
operand_iterator arg_operand_begin() { return operand_begin() + 1; }
operand_iterator arg_operand_end() { return operand_end(); }
- static constexpr llvm::StringRef getMethodAttrNameStr() { return "method"; }
}];
}
@@ -2737,7 +2736,8 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> {
OptionalAttr<AnyAttr>:$initVal,
OptionalAttr<UnitAttr>:$constant,
OptionalAttr<UnitAttr>:$target,
- OptionalAttr<StrAttr>:$linkName
+ OptionalAttr<StrAttr>:$linkName,
+ OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
);
let regions = (region AtMostRegion<1>:$region);
@@ -2768,17 +2768,6 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> {
];
let extraClassDeclaration = [{
- static constexpr llvm::StringRef getSymbolAttrNameStr() { return "symref"; }
- static constexpr llvm::StringRef getConstantAttrNameStr() {
- return "constant";
- }
- static constexpr llvm::StringRef getLinkageAttrNameStr() {
- return "linkName";
- }
- static constexpr llvm::StringRef getTargetAttrNameStr() {
- return "target";
- }
-
/// The semantic type of the global
mlir::Type resultType();
@@ -2932,7 +2921,6 @@ def fir_DTEntryOp : fir_Op<"dt_entry", [HasParent<"TypeInfoOp">]> {
let hasCustomAssemblyFormat = 1;
let extraClassDeclaration = [{
- static constexpr llvm::StringRef getMethodAttrNameStr() { return "method"; }
static constexpr llvm::StringRef getProcAttrNameStr() { return "proc"; }
}];
}
@@ -3027,7 +3015,8 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments,
Optional<AnyShapeOrShiftType>:$shape,
Variadic<AnyIntegerType>:$typeparams,
Builtin_StringAttr:$uniq_name,
- OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs
+ OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs,
+ OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
);
let results = (outs AnyRefOrBox);
diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
index 977949e399a5..e8226b6df58c 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
@@ -72,6 +72,19 @@ constexpr llvm::StringRef getOptionalAttrName() { return "fir.optional"; }
/// Attribute to mark Fortran entities with the TARGET attribute.
static constexpr llvm::StringRef getTargetAttrName() { return "fir.target"; }
+/// Attribute to mark Fortran entities with the CUDA attribute.
+static constexpr llvm::StringRef getCUDAAttrName() { return "fir.cuda_attr"; }
+
+/// Attribute to carry CUDA launch_bounds values.
+static constexpr llvm::StringRef getCUDALaunchBoundsAttrName() {
+ return "fir.cuda_launch_bounds";
+}
+
+/// Attribute to carry CUDA cluster_dims values.
+static constexpr llvm::StringRef getCUDAClusterDimsAttrName() {
+ return "fir.cuda_cluster_dims";
+}
+
/// Attribute to mark that a function argument is a character dummy procedure.
/// Character dummy procedure have special ABI constraints.
static constexpr llvm::StringRef getCharacterProcedureDummyAttrName() {
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 753ede2112a4..c82eae154d31 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -88,7 +88,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
Optional<AnyShapeOrShiftType>:$shape,
Variadic<AnyIntegerType>:$typeparams,
Builtin_StringAttr:$uniq_name,
- OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs
+ OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs,
+ OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
);
let results = (outs AnyFortranVariable, AnyRefOrBoxLike);
@@ -101,7 +102,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
let builders = [
OpBuilder<(ins "mlir::Value":$memref, "llvm::StringRef":$uniq_name,
CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams,
- CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs)>];
+ CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs,
+ CArg<"fir::CUDADataAttributeAttr", "{}">:$cuda_attr)>];
let extraClassDeclaration = [{
/// Get the variable original base (same as input). It lacks
diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index b50f297a7d31..7a8a34c25ce9 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -18,7 +18,6 @@
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
-#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -135,142 +134,61 @@ inline void intrinsicTypeTODO(fir::FirOpBuilder &builder, mlir::Type type,
" in " + intrinsicName);
}
-using MinlocBodyOpGeneratorTy = llvm::function_ref<mlir::Value(
- fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value,
- mlir::Value, mlir::Value, const llvm::SmallVectorImpl<mlir::Value> &)>;
-using InitValGeneratorTy = llvm::function_ref<mlir::Value(
- fir::FirOpBuilder &, mlir::Location, const mlir::Type &)>;
-using AddrGeneratorTy = llvm::function_ref<mlir::Value(
- fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value,
- mlir::Value)>;
-
-// Produces a loop nest for a Minloc intrinsic.
-inline void genMinMaxlocReductionLoop(
- fir::FirOpBuilder &builder, mlir::Value array,
- fir::InitValGeneratorTy initVal, fir::MinlocBodyOpGeneratorTy genBody,
- fir::AddrGeneratorTy getAddrFn, unsigned rank, mlir::Type elementType,
- mlir::Location loc, mlir::Type maskElemType, mlir::Value resultArr,
- bool maskMayBeLogicalScalar) {
- mlir::IndexType idxTy = builder.getIndexType();
-
- mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);
-
- fir::SequenceType::Shape flatShape(rank,
- fir::SequenceType::getUnknownExtent());
- mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType);
- mlir::Type boxArrTy = fir::BoxType::get(arrTy);
- array = builder.create<fir::ConvertOp>(loc, boxArrTy, array);
-
- mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType());
- mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
- mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0);
- mlir::Value flagRef = builder.createTemporary(loc, resultElemType);
- builder.create<fir::StoreOp>(loc, zero, flagRef);
-
- mlir::Value init = initVal(builder, loc, elementType);
- llvm::SmallVector<mlir::Value, Fortran::common::maxRank> bounds;
-
- assert(rank > 0 && "rank cannot be zero");
- mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
-
- // Compute all the upper bounds before the loop nest.
- // It is not strictly necessary for performance, since the loop nest
- // does not have any store operations and any LICM optimization
- // should be able to optimize the redundancy.
- for (unsigned i = 0; i < rank; ++i) {
- mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
- auto dims =
- builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, array, dimIdx);
- mlir::Value len = dims.getResult(1);
- // We use C indexing here, so len-1 as loopcount
- mlir::Value loopCount = builder.create<mlir::arith::SubIOp>(loc, len, one);
- bounds.push_back(loopCount);
- }
- // Create a loop nest consisting of OP operations.
- // Collect the loops' induction variables into indices array,
- // which will be used in the innermost loop to load the input
- // array's element.
- // The loops are generated such that the innermost loop processes
- // the 0 dimension.
- llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices;
- for (unsigned i = rank; 0 < i; --i) {
- mlir::Value step = one;
- mlir::Value loopCount = bounds[i - 1];
- auto loop =
- builder.create<fir::DoLoopOp>(loc, zeroIdx, loopCount, step, false,
- /*finalCountValue=*/false, init);
- init = loop.getRegionIterArgs()[0];
- indices.push_back(loop.getInductionVar());
- // Set insertion point to the loop body so that the next loop
- // is inserted inside the current one.
- builder.setInsertionPointToStart(loop.getBody());
- }
-
- // Reverse the indices such that they are ordered as:
- // <dim-0-idx, dim-1-idx, ...>
- std::reverse(indices.begin(), indices.end());
- mlir::Value reductionVal =
- genBody(builder, loc, elementType, array, flagRef, init, indices);
-
- // Unwind the loop nest and insert ResultOp on each level
- // to return the updated value of the reduction to the enclosing
- // loops.
- for (unsigned i = 0; i < rank; ++i) {
- auto result = builder.create<fir::ResultOp>(loc, reductionVal);
- // Proceed to the outer loop.
- auto loop = mlir::cast<fir::DoLoopOp>(result->getParentOp());
- reductionVal = loop.getResult(0);
- // Set insertion point after the loop operation that we have
- // just processed.
- builder.setInsertionPointAfter(loop.getOperation());
- }
- // End of loop nest. The insertion point is after the outermost loop.
- if (maskMayBeLogicalScalar) {
- if (fir::IfOp ifOp =
- mlir::dyn_cast<fir::IfOp>(builder.getBlock()->getParentOp())) {
- builder.create<fir::ResultOp>(loc, reductionVal);
- builder.setInsertionPointAfter(ifOp);
- // Redefine flagSet to escape scope of ifOp
- flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
- reductionVal = ifOp.getResult(0);
+inline fir::CUDADataAttributeAttr
+getCUDADataAttribute(mlir::MLIRContext *mlirContext,
+ std::optional<Fortran::common::CUDADataAttr> cudaAttr) {
+ if (cudaAttr) {
+ fir::CUDADataAttribute attr;
+ switch (*cudaAttr) {
+ case Fortran::common::CUDADataAttr::Constant:
+ attr = fir::CUDADataAttribute::Constant;
+ break;
+ case Fortran::common::CUDADataAttr::Device:
+ attr = fir::CUDADataAttribute::Device;
+ break;
+ case Fortran::common::CUDADataAttr::Managed:
+ attr = fir::CUDADataAttribute::Managed;
+ break;
+ case Fortran::common::CUDADataAttr::Pinned:
+ attr = fir::CUDADataAttribute::Pinned;
+ break;
+ case Fortran::common::CUDADataAttr::Shared:
+ attr = fir::CUDADataAttribute::Shared;
+ break;
+ case Fortran::common::CUDADataAttr::Texture:
+ // Obsolete attribute
+ return {};
}
+ return fir::CUDADataAttributeAttr::get(mlirContext, attr);
}
+ return {};
+}
- // Check for case where array was full of max values.
- // flag will be 0 if mask was never true, 1 if mask was true as some point,
- // this is needed to avoid catching cases where we didn't access any elements
- // e.g. mask=.FALSE.
- mlir::Value flagValue =
- builder.create<fir::LoadOp>(loc, resultElemType, flagRef);
- mlir::Value flagCmp = builder.create<mlir::arith::CmpIOp>(
- loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet);
- fir::IfOp ifMaskTrueOp =
- builder.create<fir::IfOp>(loc, flagCmp, /*withElseRegion=*/false);
- builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front());
-
- mlir::Value testInit = initVal(builder, loc, elementType);
- fir::IfOp ifMinSetOp;
- if (elementType.isa<mlir::FloatType>()) {
- mlir::Value cmp = builder.create<mlir::arith::CmpFOp>(
- loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal);
- ifMinSetOp = builder.create<fir::IfOp>(loc, cmp,
- /*withElseRegion*/ false);
- } else {
- mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
- loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal);
- ifMinSetOp = builder.create<fir::IfOp>(loc, cmp,
- /*withElseRegion*/ false);
- }
- builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front());
-
- // Load output array with 1s instead of 0s
- for (unsigned int i = 0; i < rank; ++i) {
- mlir::Value index = builder.createIntegerConstant(loc, idxTy, i);
- mlir::Value resultElemAddr =
- getAddrFn(builder, loc, resultElemType, resultArr, index);
- builder.create<fir::StoreOp>(loc, flagSet, resultElemAddr);
+inline fir::CUDAProcAttributeAttr getCUDAProcAttribute(
+ mlir::MLIRContext *mlirContext,
+ std::optional<Fortran::common::CUDASubprogramAttrs> cudaAttr) {
+ if (cudaAttr) {
+ fir::CUDAProcAttribute attr;
+ switch (*cudaAttr) {
+ case Fortran::common::CUDASubprogramAttrs::Host:
+ attr = fir::CUDAProcAttribute::Host;
+ break;
+ case Fortran::common::CUDASubprogramAttrs::Device:
+ attr = fir::CUDAProcAttribute::Device;
+ break;
+ case Fortran::common::CUDASubprogramAttrs::HostDevice:
+ attr = fir::CUDAProcAttribute::HostDevice;
+ break;
+ case Fortran::common::CUDASubprogramAttrs::Global:
+ attr = fir::CUDAProcAttribute::Global;
+ break;
+ case Fortran::common::CUDASubprogramAttrs::Grid_Global:
+ attr = fir::CUDAProcAttribute::GridGlobal;
+ break;
+ }
+ return fir::CUDAProcAttributeAttr::get(mlirContext, attr);
}
- builder.setInsertionPointAfter(ifMaskTrueOp);
+ return {};
}
} // namespace fir
diff --git a/flang/include/flang/Optimizer/Transforms/Utils.h b/flang/include/flang/Optimizer/Transforms/Utils.h
new file mode 100644
index 000000000000..49a616fb40fd
--- /dev/null
+++ b/flang/include/flang/Optimizer/Transforms/Utils.h
@@ -0,0 +1,38 @@
+//===-- Optimizer/Transforms/Utils.h ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H
+#define FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H
+
+namespace fir {
+
+using MinlocBodyOpGeneratorTy = llvm::function_ref<mlir::Value(
+ fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value,
+ mlir::Value, mlir::Value, const llvm::SmallVectorImpl<mlir::Value> &)>;
+using InitValGeneratorTy = llvm::function_ref<mlir::Value(
+ fir::FirOpBuilder &, mlir::Location, const mlir::Type &)>;
+using AddrGeneratorTy = llvm::function_ref<mlir::Value(
+ fir::FirOpBuilder &, mlir::Location, const mlir::Type &, mlir::Value,
+ mlir::Value)>;
+
+// Produces a loop nest for a Minloc intrinsic.
+void genMinMaxlocReductionLoop(fir::FirOpBuilder &builder, mlir::Value array,
+ fir::InitValGeneratorTy initVal,
+ fir::MinlocBodyOpGeneratorTy genBody,
+ fir::AddrGeneratorTy getAddrFn, unsigned rank,
+ mlir::Type elementType, mlir::Location loc,
+ mlir::Type maskElemType, mlir::Value resultArr,
+ bool maskMayBeLogicalScalar);
+
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index d480050d354f..80b0f346c32d 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -461,6 +461,34 @@ bool DummyDataObject::CanBePassedViaImplicitInterface(
}
}
+bool DummyDataObject::IsPassedByDescriptor(bool isBindC) const {
+ constexpr TypeAndShape::Attrs shapeRequiringBox = {
+ TypeAndShape::Attr::AssumedShape, TypeAndShape::Attr::DeferredShape,
+ TypeAndShape::Attr::AssumedRank, TypeAndShape::Attr::Coarray};
+ if ((attrs & Attrs{Attr::Allocatable, Attr::Pointer}).any()) {
+ return true;
+ } else if ((type.attrs() & shapeRequiringBox).any()) {
+ // Need to pass shape/coshape info in a descriptor.
+ return true;
+ } else if (type.type().IsPolymorphic() && !type.type().IsAssumedType()) {
+ // Need to pass dynamic type info in a descriptor.
+ return true;
+ } else if (const auto *derived{GetDerivedTypeSpec(type.type())}) {
+ if (!derived->parameters().empty()) {
+ for (const auto &param : derived->parameters()) {
+ if (param.second.isLen()) {
+ // Need to pass length type parameters in a descriptor.
+ return true;
+ }
+ }
+ }
+ } else if (isBindC && type.type().IsAssumedLengthCharacter()) {
+ // Fortran 2018 18.3.6 point 2 (5)
+ return true;
+ }
+ return false;
+}
+
llvm::raw_ostream &DummyDataObject::Dump(llvm::raw_ostream &o) const {
attrs.Dump(o, EnumToString);
if (intent != common::Intent::Default) {
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index ffde7f50087e..4707de0e976c 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1326,10 +1326,23 @@ void CompilerInvocation::setDefaultPredefinitions() {
Fortran::common::setOpenMPMacro(getLangOpts().OpenMPVersion,
fortranOptions.predefinitions);
}
+
llvm::Triple targetTriple{llvm::Triple(this->targetOpts.triple)};
- if (targetTriple.getArch() == llvm::Triple::ArchType::x86_64) {
+ switch (targetTriple.getArch()) {
+ default:
+ break;
+ case llvm::Triple::ArchType::x86_64:
fortranOptions.predefinitions.emplace_back("__x86_64__", "1");
fortranOptions.predefinitions.emplace_back("__x86_64", "1");
+ break;
+ case llvm::Triple::ArchType::ppc:
+ case llvm::Triple::ArchType::ppcle:
+ case llvm::Triple::ArchType::ppc64:
+ case llvm::Triple::ArchType::ppc64le:
+ // '__powerpc__' is a generic macro for any PowerPC cases. e.g. Max integer
+ // size.
+ fortranOptions.predefinitions.emplace_back("__powerpc__", "1");
+ break;
}
}
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 579f94ba7568..76e127207d76 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2470,7 +2470,8 @@ private:
// found
ompDeviceCodeFound =
ompDeviceCodeFound ||
- Fortran::lower::isOpenMPDeviceDeclareTarget(*this, getEval(), ompDecl);
+ Fortran::lower::isOpenMPDeviceDeclareTarget(
+ *this, bridge.getSemanticsContext(), getEval(), ompDecl);
genOpenMPDeclarativeConstruct(
*this, localSymbols, bridge.getSemanticsContext(), getEval(), ompDecl);
builder->restoreInsertionPoint(insertPt);
@@ -3115,10 +3116,10 @@ private:
hlfir::Entity nullBoxProc(
fir::factory::createNullBoxProc(*builder, loc, boxTy));
builder->createStoreWithConvert(loc, nullBoxProc, pptr);
- return;
+ } else {
+ fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
+ fir::factory::disassociateMutableBox(*builder, loc, box);
}
- fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
- fir::factory::disassociateMutableBox(*builder, loc, box);
}
}
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index b007c958cb6b..6b71aabf7fdc 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "flang/Lower/CallInterface.h"
+#include "flang/Common/Fortran.h"
#include "flang/Evaluate/fold.h"
#include "flang/Lower/Bridge.h"
#include "flang/Lower/Mangler.h"
@@ -19,6 +20,7 @@
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Support/Utils.h"
#include "flang/Semantics/symbol.h"
#include "flang/Semantics/tools.h"
#include <optional>
@@ -522,6 +524,57 @@ static void addSymbolAttribute(mlir::func::FuncOp func,
mlir::StringAttr::get(&mlirContext, name));
}
+static void
+setCUDAAttributes(mlir::func::FuncOp func,
+ const Fortran::semantics::Symbol *sym,
+ std::optional<Fortran::evaluate::characteristics::Procedure>
+ characteristic) {
+ if (characteristic && characteristic->cudaSubprogramAttrs) {
+ func.getOperation()->setAttr(
+ fir::getCUDAAttrName(),
+ fir::getCUDAProcAttribute(func.getContext(),
+ *characteristic->cudaSubprogramAttrs));
+ }
+
+ if (sym) {
+ if (auto details =
+ sym->GetUltimate()
+ .detailsIf<Fortran::semantics::SubprogramDetails>()) {
+ mlir::Type i64Ty = mlir::IntegerType::get(func.getContext(), 64);
+ if (!details->cudaLaunchBounds().empty()) {
+ assert(details->cudaLaunchBounds().size() >= 2 &&
+ "expect at least 2 values");
+ auto maxTPBAttr =
+ mlir::IntegerAttr::get(i64Ty, details->cudaLaunchBounds()[0]);
+ auto minBPMAttr =
+ mlir::IntegerAttr::get(i64Ty, details->cudaLaunchBounds()[1]);
+ mlir::IntegerAttr ubAttr;
+ if (details->cudaLaunchBounds().size() > 2)
+ ubAttr =
+ mlir::IntegerAttr::get(i64Ty, details->cudaLaunchBounds()[2]);
+ func.getOperation()->setAttr(
+ fir::getCUDALaunchBoundsAttrName(),
+ fir::CUDALaunchBoundsAttr::get(func.getContext(), maxTPBAttr,
+ minBPMAttr, ubAttr));
+ }
+
+ if (!details->cudaClusterDims().empty()) {
+ assert(details->cudaClusterDims().size() == 3 && "expect 3 values");
+ auto xAttr =
+ mlir::IntegerAttr::get(i64Ty, details->cudaClusterDims()[0]);
+ auto yAttr =
+ mlir::IntegerAttr::get(i64Ty, details->cudaClusterDims()[1]);
+ auto zAttr =
+ mlir::IntegerAttr::get(i64Ty, details->cudaClusterDims()[2]);
+ func.getOperation()->setAttr(
+ fir::getCUDAClusterDimsAttrName(),
+ fir::CUDAClusterDimsAttr::get(func.getContext(), xAttr, yAttr,
+ zAttr));
+ }
+ }
+ }
+}
+
/// Declare drives the different actions to be performed while analyzing the
/// signature and building/finding the mlir::func::FuncOp.
template <typename T>
@@ -557,6 +610,8 @@ void Fortran::lower::CallInterface<T>::declare() {
if (!placeHolder.value().attributes.empty())
func.setArgAttrs(placeHolder.index(), placeHolder.value().attributes);
side().setFuncAttrs(func);
+
+ setCUDAAttributes(func, side().getProcedureSymbol(), characteristic);
}
}
}
@@ -915,31 +970,6 @@ private:
}
}
- // Define when an explicit argument must be passed in a fir.box.
- bool dummyRequiresBox(
- const Fortran::evaluate::characteristics::DummyDataObject &obj,
- bool isBindC) {
- using ShapeAttr = Fortran::evaluate::characteristics::TypeAndShape::Attr;
- using ShapeAttrs = Fortran::evaluate::characteristics::TypeAndShape::Attrs;
- constexpr ShapeAttrs shapeRequiringBox = {
- ShapeAttr::AssumedShape, ShapeAttr::DeferredShape,
- ShapeAttr::AssumedRank, ShapeAttr::Coarray};
- if ((obj.type.attrs() & shapeRequiringBox).any())
- // Need to pass shape/coshape info in fir.box.
- return true;
- if (obj.type.type().IsPolymorphic() && !obj.type.type().IsAssumedType())
- // Need to pass dynamic type info in fir.box.
- return true;
- if (const Fortran::semantics::DerivedTypeSpec *derived =
- Fortran::evaluate::GetDerivedTypeSpec(obj.type.type()))
- if (const Fortran::semantics::Scope *scope = derived->scope())
- // Need to pass length type parameters in fir.box if any.
- return scope->IsDerivedTypeWithLengthParameter();
- if (isBindC && obj.type.type().IsAssumedLengthCharacter())
- return true; // Fortran 2018 18.3.6 point 2 (5)
- return false;
- }
-
mlir::Type
translateDynamicType(const Fortran::evaluate::DynamicType &dynamicType) {
Fortran::common::TypeCategory cat = dynamicType.category();
@@ -993,6 +1023,10 @@ private:
TODO(loc, "VOLATILE in procedure interface");
if (obj.attrs.test(Attrs::Target))
addMLIRAttr(fir::getTargetAttrName());
+ if (obj.cudaDataAttr)
+ attrs.emplace_back(
+ mlir::StringAttr::get(&mlirContext, fir::getCUDAAttrName()),
+ fir::getCUDADataAttribute(&mlirContext, obj.cudaDataAttr));
// TODO: intents that require special care (e.g finalization)
@@ -1022,7 +1056,7 @@ private:
addFirOperand(boxRefType, nextPassedArgPosition(), Property::MutableBox,
attrs);
addPassedArg(PassEntityBy::MutableBox, entity, characteristics);
- } else if (dummyRequiresBox(obj, isBindC)) {
+ } else if (obj.IsPassedByDescriptor(isBindC)) {
// Pass as fir.box or fir.class
if (isValueAttr &&
!getConverter().getLoweringOptions().getLowerToHighLevelFIR())
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index f60cdbb690e7..d8271b1f1463 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -922,7 +922,8 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
// Handle procedure arguments (procedure pointers should go through
// prepareProcedurePointerActualArgument).
if (hlfir::isFortranProcedureValue(dummyType)) {
- // Procedure pointer actual to procedure dummy.
+ // Procedure pointer or function returns procedure pointer actual to
+ // procedure dummy.
if (actual.isProcedurePointer()) {
actual = hlfir::derefPointersAndAllocatables(loc, builder, actual);
return PreparedDummyArgument{actual, /*cleanups=*/{}};
@@ -931,7 +932,11 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
assert(actual.isProcedure());
// Do nothing if this is a procedure argument. It is already a
// fir.boxproc/fir.tuple<fir.boxproc, len> as it should.
- if (actual.getType() != dummyType)
+ if (!actual.getType().isa<fir::BoxProcType>() &&
+ actual.getType() != dummyType)
+ // The actual argument may be a procedure that returns character (a
+ // fir.tuple<fir.boxproc, len>) while the dummy is not. Extract the tuple
+ // in that case.
actual = fixProcedureDummyMismatch(loc, builder, actual, dummyType);
return PreparedDummyArgument{actual, /*cleanups=*/{}};
}
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 8ea2557c42b3..b2279a319fe9 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -37,6 +37,7 @@
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Optimizer/Support/InternalNames.h"
+#include "flang/Optimizer/Support/Utils.h"
#include "flang/Semantics/runtime-type-info.h"
#include "flang/Semantics/tools.h"
#include "llvm/Support/Debug.h"
@@ -137,7 +138,8 @@ static bool isConstant(const Fortran::semantics::Symbol &sym) {
static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
const Fortran::lower::pft::Variable &var,
llvm::StringRef globalName,
- mlir::StringAttr linkage);
+ mlir::StringAttr linkage,
+ fir::CUDADataAttributeAttr cudaAttr = {});
static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter,
const Fortran::semantics::Symbol &sym) {
@@ -461,7 +463,8 @@ void Fortran::lower::createGlobalInitialization(
static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
const Fortran::lower::pft::Variable &var,
llvm::StringRef globalName,
- mlir::StringAttr linkage) {
+ mlir::StringAttr linkage,
+ fir::CUDADataAttributeAttr cudaAttr) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
const Fortran::semantics::Symbol &sym = var.getSymbol();
mlir::Location loc = genLocation(converter, sym);
@@ -499,8 +502,9 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
}
}
if (!global)
- global = builder.createGlobal(loc, symTy, globalName, linkage,
- mlir::Attribute{}, isConst, var.isTarget());
+ global =
+ builder.createGlobal(loc, symTy, globalName, linkage, mlir::Attribute{},
+ isConst, var.isTarget(), cudaAttr);
if (Fortran::semantics::IsAllocatableOrPointer(sym) &&
!Fortran::semantics::IsProcedure(sym)) {
const auto *details =
@@ -1579,6 +1583,13 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes(
return fir::FortranVariableFlagsAttr::get(mlirContext, flags);
}
+fir::CUDADataAttributeAttr Fortran::lower::translateSymbolCUDADataAttribute(
+ mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) {
+ std::optional<Fortran::common::CUDADataAttr> cudaAttr =
+ Fortran::semantics::GetCUDADataAttr(&sym);
+ return fir::getCUDADataAttribute(mlirContext, cudaAttr);
+}
+
/// Map a symbol to its FIR address and evaluated specification expressions.
/// Not for symbols lowered to fir.box.
/// Will optionally create fir.declare.
@@ -1618,6 +1629,9 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter,
auto name = converter.mangleName(sym);
fir::FortranVariableFlagsAttr attributes =
Fortran::lower::translateSymbolAttributes(builder.getContext(), sym);
+ fir::CUDADataAttributeAttr cudaAttr =
+ Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ sym);
if (isCrayPointee) {
mlir::Type baseType =
@@ -1664,7 +1678,7 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter,
return;
}
auto newBase = builder.create<hlfir::DeclareOp>(
- loc, base, name, shapeOrShift, lenParams, attributes);
+ loc, base, name, shapeOrShift, lenParams, attributes, cudaAttr);
symMap.addVariableDefinition(sym, newBase, force);
return;
}
@@ -1709,9 +1723,12 @@ void Fortran::lower::genDeclareSymbol(
fir::FortranVariableFlagsAttr attributes =
Fortran::lower::translateSymbolAttributes(
builder.getContext(), sym.GetUltimate(), extraFlags);
+ fir::CUDADataAttributeAttr cudaAttr =
+ Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ sym.GetUltimate());
auto name = converter.mangleName(sym);
hlfir::EntityWithAttributes declare =
- hlfir::genDeclare(loc, builder, exv, name, attributes);
+ hlfir::genDeclare(loc, builder, exv, name, attributes, cudaAttr);
symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force);
return;
}
@@ -2206,7 +2223,10 @@ void Fortran::lower::defineModuleVariable(
// Do nothing. Mapping will be done on user side.
} else {
std::string globalName = converter.mangleName(sym);
- defineGlobal(converter, var, globalName, linkage);
+ fir::CUDADataAttributeAttr cudaAttr =
+ Fortran::lower::translateSymbolCUDADataAttribute(
+ converter.getFirOpBuilder().getContext(), sym);
+ defineGlobal(converter, var, globalName, linkage, cudaAttr);
}
}
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index bd880376517d..8d560db34e05 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -52,10 +52,13 @@ namespace lower {
/// operations.
struct AddrAndBoundsInfo {
explicit AddrAndBoundsInfo() {}
- explicit AddrAndBoundsInfo(mlir::Value addr) : addr(addr) {}
- explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value isPresent)
- : addr(addr), isPresent(isPresent) {}
+ explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value rawInput)
+ : addr(addr), rawInput(rawInput) {}
+ explicit AddrAndBoundsInfo(mlir::Value addr, mlir::Value rawInput,
+ mlir::Value isPresent)
+ : addr(addr), rawInput(rawInput), isPresent(isPresent) {}
mlir::Value addr = nullptr;
+ mlir::Value rawInput = nullptr;
mlir::Value isPresent = nullptr;
};
@@ -615,20 +618,30 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter,
fir::FirOpBuilder &builder,
Fortran::lower::SymbolRef sym, mlir::Location loc) {
mlir::Value symAddr = converter.getSymbolAddress(sym);
+ mlir::Value rawInput = symAddr;
if (auto declareOp =
- mlir::dyn_cast_or_null<hlfir::DeclareOp>(symAddr.getDefiningOp()))
+ mlir::dyn_cast_or_null<hlfir::DeclareOp>(symAddr.getDefiningOp())) {
symAddr = declareOp.getResults()[0];
+ rawInput = declareOp.getResults()[1];
+ }
// TODO: Might need revisiting to handle for non-shared clauses
if (!symAddr) {
if (const auto *details =
- sym->detailsIf<Fortran::semantics::HostAssocDetails>())
+ sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
symAddr = converter.getSymbolAddress(details->symbol());
+ rawInput = symAddr;
+ }
}
if (!symAddr)
llvm::report_fatal_error("could not retrieve symbol address");
+ mlir::Value isPresent;
+ if (Fortran::semantics::IsOptional(sym))
+ isPresent =
+ builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), rawInput);
+
if (auto boxTy =
fir::unwrapRefType(symAddr.getType()).dyn_cast<fir::BaseBoxType>()) {
if (boxTy.getEleTy().isa<fir::RecordType>())
@@ -638,8 +651,6 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter,
// `fir.ref<fir.class<T>>` type.
if (symAddr.getType().isa<fir::ReferenceType>()) {
if (Fortran::semantics::IsOptional(sym)) {
- mlir::Value isPresent =
- builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), symAddr);
mlir::Value addr =
builder.genIfOp(loc, {boxTy}, isPresent, /*withElseRegion=*/true)
.genThen([&]() {
@@ -652,14 +663,13 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter,
builder.create<fir::ResultOp>(loc, mlir::ValueRange{absent});
})
.getResults()[0];
- return AddrAndBoundsInfo(addr, isPresent);
+ return AddrAndBoundsInfo(addr, rawInput, isPresent);
}
mlir::Value addr = builder.create<fir::LoadOp>(loc, symAddr);
- return AddrAndBoundsInfo(addr);
- ;
+ return AddrAndBoundsInfo(addr, rawInput, isPresent);
}
}
- return AddrAndBoundsInfo(symAddr);
+ return AddrAndBoundsInfo(symAddr, rawInput, isPresent);
}
template <typename BoundsOp, typename BoundsType>
@@ -807,7 +817,7 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
Fortran::lower::StatementContext &stmtCtx,
const std::list<Fortran::parser::SectionSubscript> &subscripts,
std::stringstream &asFortran, fir::ExtendedValue &dataExv,
- bool dataExvIsAssumedSize, mlir::Value baseAddr,
+ bool dataExvIsAssumedSize, AddrAndBoundsInfo &info,
bool treatIndexAsSection = false) {
int dimension = 0;
mlir::Type idxTy = builder.getIndexType();
@@ -831,11 +841,30 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value stride = one;
bool strideInBytes = false;
- if (fir::unwrapRefType(baseAddr.getType()).isa<fir::BaseBoxType>()) {
- mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension);
- auto dimInfo = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
- baseAddr, d);
- stride = dimInfo.getByteStride();
+ if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+ if (info.isPresent) {
+ stride =
+ builder
+ .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true)
+ .genThen([&]() {
+ mlir::Value d =
+ builder.createIntegerConstant(loc, idxTy, dimension);
+ auto dimInfo = builder.create<fir::BoxDimsOp>(
+ loc, idxTy, idxTy, idxTy, info.addr, d);
+ builder.create<fir::ResultOp>(loc, dimInfo.getByteStride());
+ })
+ .genElse([&] {
+ mlir::Value zero =
+ builder.createIntegerConstant(loc, idxTy, 0);
+ builder.create<fir::ResultOp>(loc, zero);
+ })
+ .getResults()[0];
+ } else {
+ mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension);
+ auto dimInfo = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy,
+ idxTy, info.addr, d);
+ stride = dimInfo.getByteStride();
+ }
strideInBytes = true;
}
@@ -919,7 +948,26 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
}
}
- extent = fir::factory::readExtent(builder, loc, dataExv, dimension);
+ if (info.isPresent &&
+ fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+ extent =
+ builder
+ .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true)
+ .genThen([&]() {
+ mlir::Value ext = fir::factory::readExtent(
+ builder, loc, dataExv, dimension);
+ builder.create<fir::ResultOp>(loc, ext);
+ })
+ .genElse([&] {
+ mlir::Value zero =
+ builder.createIntegerConstant(loc, idxTy, 0);
+ builder.create<fir::ResultOp>(loc, zero);
+ })
+ .getResults()[0];
+ } else {
+ extent = fir::factory::readExtent(builder, loc, dataExv, dimension);
+ }
+
if (dataExvIsAssumedSize && dimension + 1 == dataExvRank) {
extent = zero;
if (ubound && lbound) {
@@ -976,6 +1024,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
dataExv = converter.genExprAddr(operandLocation, *exprBase,
stmtCtx);
info.addr = fir::getBase(dataExv);
+ info.rawInput = info.addr;
asFortran << (*exprBase).AsFortran();
} else {
const Fortran::parser::Name &name =
@@ -993,7 +1042,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
bounds = genBoundsOps<BoundsOp, BoundsType>(
builder, operandLocation, converter, stmtCtx,
arrayElement->subscripts, asFortran, dataExv,
- dataExvIsAssumedSize, info.addr, treatIndexAsSection);
+ dataExvIsAssumedSize, info, treatIndexAsSection);
}
asFortran << ')';
} else if (auto structComp = Fortran::parser::Unwrap<
@@ -1001,6 +1050,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
fir::ExtendedValue compExv =
converter.genExprAddr(operandLocation, *expr, stmtCtx);
info.addr = fir::getBase(compExv);
+ info.rawInput = info.addr;
if (fir::unwrapRefType(info.addr.getType())
.isa<fir::SequenceType>())
bounds = genBaseBoundsOps<BoundsOp, BoundsType>(
@@ -1012,7 +1062,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
*Fortran::parser::GetLastName(*structComp).symbol);
if (isOptional)
info.isPresent = builder.create<fir::IsPresentOp>(
- operandLocation, builder.getI1Type(), info.addr);
+ operandLocation, builder.getI1Type(), info.rawInput);
if (auto loadOp = mlir::dyn_cast_or_null<fir::LoadOp>(
info.addr.getDefiningOp())) {
@@ -1020,6 +1070,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
fir::isPointerType(loadOp.getType()))
info.addr = builder.create<fir::BoxAddrOp>(operandLocation,
info.addr);
+ info.rawInput = info.addr;
}
// If the component is an allocatable or pointer the result of
@@ -1029,6 +1080,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
if (auto boxAddrOp = mlir::dyn_cast_or_null<fir::BoxAddrOp>(
info.addr.getDefiningOp())) {
info.addr = boxAddrOp.getVal();
+ info.rawInput = info.addr;
bounds = genBoundsOpsFromBox<BoundsOp, BoundsType>(
builder, operandLocation, converter, compExv, info);
}
@@ -1043,6 +1095,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
fir::ExtendedValue compExv =
converter.genExprAddr(operandLocation, *expr, stmtCtx);
info.addr = fir::getBase(compExv);
+ info.rawInput = info.addr;
asFortran << (*expr).AsFortran();
} else if (const auto *dataRef{
std::get_if<Fortran::parser::DataRef>(
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 43f54c6d2a71..6ae270f63f5c 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -67,9 +67,12 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value varPtrPtr;
if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
if (isPresent) {
+ mlir::Type ifRetTy = boxTy.getEleTy();
+ if (!fir::isa_ref_type(ifRetTy))
+ ifRetTy = fir::ReferenceType::get(ifRetTy);
baseAddr =
builder
- .genIfOp(loc, {boxTy.getEleTy()}, isPresent,
+ .genIfOp(loc, {ifRetTy}, isPresent,
/*withElseRegion=*/true)
.genThen([&]() {
mlir::Value boxAddr =
@@ -78,7 +81,7 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
})
.genElse([&] {
mlir::Value absent =
- builder.create<fir::AbsentOp>(loc, boxTy.getEleTy());
+ builder.create<fir::AbsentOp>(loc, ifRetTy);
builder.create<fir::ResultOp>(loc, mlir::ValueRange{absent});
})
.getResults()[0];
@@ -295,9 +298,16 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
asFortran, bounds,
/*treatIndexAsSection=*/true);
- Op op = createDataEntryOp<Op>(
- builder, operandLocation, info.addr, asFortran, bounds, structured,
- implicit, dataClause, info.addr.getType(), info.isPresent);
+ // If the input value is optional and is not a descriptor, we use the
+ // rawInput directly.
+ mlir::Value baseAddr =
+ ((info.addr.getType() != fir::unwrapRefType(info.rawInput.getType())) &&
+ info.isPresent)
+ ? info.rawInput
+ : info.addr;
+ Op op = createDataEntryOp<Op>(builder, operandLocation, baseAddr, asFortran,
+ bounds, structured, implicit, dataClause,
+ baseAddr.getType(), info.isPresent);
dataOperands.push_back(op.getAccPtr());
}
}
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index ad4cffc70753..06850bebd7d0 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -552,8 +552,9 @@ class ClauseProcessor {
public:
ClauseProcessor(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
const Fortran::parser::OmpClauseList &clauses)
- : converter(converter), clauses(clauses) {}
+ : converter(converter), semaCtx(semaCtx), clauses(clauses) {}
// 'Unique' clauses: They can appear at most once in the clause list.
bool
@@ -614,17 +615,18 @@ public:
// target directives that require it.
bool processMap(mlir::Location currentLocation,
const llvm::omp::Directive &directive,
- Fortran::semantics::SemanticsContext &semanticsContext,
Fortran::lower::StatementContext &stmtCtx,
llvm::SmallVectorImpl<mlir::Value> &mapOperands,
llvm::SmallVectorImpl<mlir::Type> *mapSymTypes = nullptr,
llvm::SmallVectorImpl<mlir::Location> *mapSymLocs = nullptr,
llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
*mapSymbols = nullptr) const;
- bool processReduction(
- mlir::Location currentLocation,
- llvm::SmallVectorImpl<mlir::Value> &reductionVars,
- llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols) const;
+ bool
+ processReduction(mlir::Location currentLocation,
+ llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+ llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
+ llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
+ *reductionSymbols = nullptr) const;
bool processSectionsReduction(mlir::Location currentLocation) const;
bool processTo(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
bool
@@ -641,10 +643,8 @@ public:
&useDeviceSymbols) const;
template <typename T>
- bool
- processMotionClauses(Fortran::semantics::SemanticsContext &semanticsContext,
- Fortran::lower::StatementContext &stmtCtx,
- llvm::SmallVectorImpl<mlir::Value> &mapOperands);
+ bool processMotionClauses(Fortran::lower::StatementContext &stmtCtx,
+ llvm::SmallVectorImpl<mlir::Value> &mapOperands);
// Call this method for these clauses that should be supported but are not
// implemented yet. It triggers a compilation error if any of the given
@@ -713,6 +713,7 @@ private:
}
Fortran::lower::AbstractConverter &converter;
+ Fortran::semantics::SemanticsContext &semaCtx;
const Fortran::parser::OmpClauseList &clauses;
};
@@ -731,21 +732,59 @@ static void checkMapType(mlir::Location location, mlir::Type type) {
class ReductionProcessor {
public:
- enum IntrinsicProc { MAX, MIN, IAND, IOR, IEOR };
- static IntrinsicProc
+ // TODO: Move this enumeration to the OpenMP dialect
+ enum ReductionIdentifier {
+ ID,
+ USER_DEF_OP,
+ ADD,
+ SUBTRACT,
+ MULTIPLY,
+ AND,
+ OR,
+ EQV,
+ NEQV,
+ MAX,
+ MIN,
+ IAND,
+ IOR,
+ IEOR
+ };
+ static ReductionIdentifier
getReductionType(const Fortran::parser::ProcedureDesignator &pd) {
- auto redType = llvm::StringSwitch<std::optional<IntrinsicProc>>(
+ auto redType = llvm::StringSwitch<std::optional<ReductionIdentifier>>(
getRealName(pd).ToString())
- .Case("max", IntrinsicProc::MAX)
- .Case("min", IntrinsicProc::MIN)
- .Case("iand", IntrinsicProc::IAND)
- .Case("ior", IntrinsicProc::IOR)
- .Case("ieor", IntrinsicProc::IEOR)
+ .Case("max", ReductionIdentifier::MAX)
+ .Case("min", ReductionIdentifier::MIN)
+ .Case("iand", ReductionIdentifier::IAND)
+ .Case("ior", ReductionIdentifier::IOR)
+ .Case("ieor", ReductionIdentifier::IEOR)
.Default(std::nullopt);
assert(redType && "Invalid Reduction");
return *redType;
}
+ static ReductionIdentifier getReductionType(
+ Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) {
+ switch (intrinsicOp) {
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
+ return ReductionIdentifier::ADD;
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract:
+ return ReductionIdentifier::SUBTRACT;
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
+ return ReductionIdentifier::MULTIPLY;
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
+ return ReductionIdentifier::AND;
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
+ return ReductionIdentifier::EQV;
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
+ return ReductionIdentifier::OR;
+ case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+ return ReductionIdentifier::NEQV;
+ default:
+ llvm_unreachable("unexpected intrinsic operator in reduction");
+ }
+ }
+
static bool supportedIntrinsicProcReduction(
const Fortran::parser::ProcedureDesignator &pd) {
const auto *name{Fortran::parser::Unwrap<Fortran::parser::Name>(pd)};
@@ -753,17 +792,14 @@ public:
if (!name->symbol->GetUltimate().attrs().test(
Fortran::semantics::Attr::INTRINSIC))
return false;
- auto redType = llvm::StringSwitch<std::optional<IntrinsicProc>>(
- getRealName(name).ToString())
- .Case("max", IntrinsicProc::MAX)
- .Case("min", IntrinsicProc::MIN)
- .Case("iand", IntrinsicProc::IAND)
- .Case("ior", IntrinsicProc::IOR)
- .Case("ieor", IntrinsicProc::IEOR)
- .Default(std::nullopt);
- if (redType)
- return true;
- return false;
+ auto redType = llvm::StringSwitch<bool>(getRealName(name).ToString())
+ .Case("max", true)
+ .Case("min", true)
+ .Case("iand", true)
+ .Case("ior", true)
+ .Case("ieor", true)
+ .Default(false);
+ return redType;
}
static const Fortran::semantics::SourceName
@@ -817,32 +853,30 @@ public:
/// reductionOpName. For example:
/// 0 + x = x,
/// 1 * x = x
- static int getOperationIdentity(
- Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
- mlir::Location loc) {
- switch (intrinsicOp) {
- case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+ static int getOperationIdentity(ReductionIdentifier redId,
+ mlir::Location loc) {
+ switch (redId) {
+ case ReductionIdentifier::ADD:
+ case ReductionIdentifier::OR:
+ case ReductionIdentifier::NEQV:
return 0;
- case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
+ case ReductionIdentifier::MULTIPLY:
+ case ReductionIdentifier::AND:
+ case ReductionIdentifier::EQV:
return 1;
default:
TODO(loc, "Reduction of some intrinsic operators is not supported");
}
}
- static mlir::Value getIntrinsicProcInitValue(
- mlir::Location loc, mlir::Type type,
- const Fortran::parser::ProcedureDesignator &procDesignator,
- fir::FirOpBuilder &builder) {
+ static mlir::Value getReductionInitValue(mlir::Location loc, mlir::Type type,
+ ReductionIdentifier redId,
+ fir::FirOpBuilder &builder) {
assert((fir::isa_integer(type) || fir::isa_real(type) ||
type.isa<fir::LogicalType>()) &&
"only integer, logical and real types are currently supported");
- switch (getReductionType(procDesignator)) {
- case IntrinsicProc::MAX: {
+ switch (redId) {
+ case ReductionIdentifier::MAX: {
if (auto ty = type.dyn_cast<mlir::FloatType>()) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(
@@ -852,7 +886,7 @@ public:
int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue();
return builder.createIntegerConstant(loc, type, minInt);
}
- case IntrinsicProc::MIN: {
+ case ReductionIdentifier::MIN: {
if (auto ty = type.dyn_cast<mlir::FloatType>()) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(
@@ -862,46 +896,50 @@ public:
int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue();
return builder.createIntegerConstant(loc, type, maxInt);
}
- case IntrinsicProc::IOR: {
+ case ReductionIdentifier::IOR: {
unsigned bits = type.getIntOrFloatBitWidth();
int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue();
return builder.createIntegerConstant(loc, type, zeroInt);
}
- case IntrinsicProc::IEOR: {
+ case ReductionIdentifier::IEOR: {
unsigned bits = type.getIntOrFloatBitWidth();
int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue();
return builder.createIntegerConstant(loc, type, zeroInt);
}
- case IntrinsicProc::IAND: {
+ case ReductionIdentifier::IAND: {
unsigned bits = type.getIntOrFloatBitWidth();
int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue();
return builder.createIntegerConstant(loc, type, allOnInt);
}
- }
- llvm_unreachable("Unknown Reduction Intrinsic");
- }
+ case ReductionIdentifier::ADD:
+ case ReductionIdentifier::MULTIPLY:
+ case ReductionIdentifier::AND:
+ case ReductionIdentifier::OR:
+ case ReductionIdentifier::EQV:
+ case ReductionIdentifier::NEQV:
+ if (type.isa<mlir::FloatType>())
+ return builder.create<mlir::arith::ConstantOp>(
+ loc, type,
+ builder.getFloatAttr(type,
+ (double)getOperationIdentity(redId, loc)));
+
+ if (type.isa<fir::LogicalType>()) {
+ mlir::Value intConst = builder.create<mlir::arith::ConstantOp>(
+ loc, builder.getI1Type(),
+ builder.getIntegerAttr(builder.getI1Type(),
+ getOperationIdentity(redId, loc)));
+ return builder.createConvert(loc, type, intConst);
+ }
- static mlir::Value getIntrinsicOpInitValue(
- mlir::Location loc, mlir::Type type,
- Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
- fir::FirOpBuilder &builder) {
- if (type.isa<mlir::FloatType>())
return builder.create<mlir::arith::ConstantOp>(
loc, type,
- builder.getFloatAttr(type,
- (double)getOperationIdentity(intrinsicOp, loc)));
-
- if (type.isa<fir::LogicalType>()) {
- mlir::Value intConst = builder.create<mlir::arith::ConstantOp>(
- loc, builder.getI1Type(),
- builder.getIntegerAttr(builder.getI1Type(),
- getOperationIdentity(intrinsicOp, loc)));
- return builder.createConvert(loc, type, intConst);
+ builder.getIntegerAttr(type, getOperationIdentity(redId, loc)));
+ case ReductionIdentifier::ID:
+ case ReductionIdentifier::USER_DEF_OP:
+ case ReductionIdentifier::SUBTRACT:
+ TODO(loc, "Reduction of some identifier types is not supported");
}
-
- return builder.create<mlir::arith::ConstantOp>(
- loc, type,
- builder.getIntegerAttr(type, getOperationIdentity(intrinsicOp, loc)));
+ llvm_unreachable("Unhandled Reduction identifier : getReductionInitValue");
}
template <typename FloatOp, typename IntegerOp>
@@ -915,118 +953,46 @@ public:
return builder.create<FloatOp>(loc, op1, op2);
}
- /// Creates an OpenMP reduction declaration and inserts it into the provided
- /// symbol table. The declaration has a constant initializer with the neutral
- /// value `initValue`, and the reduction combiner carried over from `reduce`.
- /// TODO: Generalize this for non-integer types, add atomic region.
- static mlir::omp::ReductionDeclareOp createReductionDecl(
- fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
- const Fortran::parser::ProcedureDesignator &procDesignator,
- mlir::Type type, mlir::Location loc) {
- mlir::OpBuilder::InsertionGuard guard(builder);
- mlir::ModuleOp module = builder.getModule();
-
- auto decl =
- module.lookupSymbol<mlir::omp::ReductionDeclareOp>(reductionOpName);
- if (decl)
- return decl;
-
- mlir::OpBuilder modBuilder(module.getBodyRegion());
-
- decl = modBuilder.create<mlir::omp::ReductionDeclareOp>(
- loc, reductionOpName, type);
- builder.createBlock(&decl.getInitializerRegion(),
- decl.getInitializerRegion().end(), {type}, {loc});
- builder.setInsertionPointToEnd(&decl.getInitializerRegion().back());
- mlir::Value init =
- getIntrinsicProcInitValue(loc, type, procDesignator, builder);
- builder.create<mlir::omp::YieldOp>(loc, init);
-
- builder.createBlock(&decl.getReductionRegion(),
- decl.getReductionRegion().end(), {type, type},
- {loc, loc});
-
- builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
- mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
- mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
-
+ static mlir::Value createScalarCombiner(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ ReductionIdentifier redId,
+ mlir::Type type, mlir::Value op1,
+ mlir::Value op2) {
mlir::Value reductionOp;
- switch (getReductionType(procDesignator)) {
- case IntrinsicProc::MAX:
+ switch (redId) {
+ case ReductionIdentifier::MAX:
reductionOp =
getReductionOperation<mlir::arith::MaximumFOp, mlir::arith::MaxSIOp>(
builder, type, loc, op1, op2);
break;
- case IntrinsicProc::MIN:
+ case ReductionIdentifier::MIN:
reductionOp =
getReductionOperation<mlir::arith::MinimumFOp, mlir::arith::MinSIOp>(
builder, type, loc, op1, op2);
break;
- case IntrinsicProc::IOR:
+ case ReductionIdentifier::IOR:
assert((type.isIntOrIndex()) && "only integer is expected");
reductionOp = builder.create<mlir::arith::OrIOp>(loc, op1, op2);
break;
- case IntrinsicProc::IEOR:
+ case ReductionIdentifier::IEOR:
assert((type.isIntOrIndex()) && "only integer is expected");
reductionOp = builder.create<mlir::arith::XOrIOp>(loc, op1, op2);
break;
- case IntrinsicProc::IAND:
+ case ReductionIdentifier::IAND:
assert((type.isIntOrIndex()) && "only integer is expected");
reductionOp = builder.create<mlir::arith::AndIOp>(loc, op1, op2);
break;
- }
-
- builder.create<mlir::omp::YieldOp>(loc, reductionOp);
- return decl;
- }
-
- /// Creates an OpenMP reduction declaration and inserts it into the provided
- /// symbol table. The declaration has a constant initializer with the neutral
- /// value `initValue`, and the reduction combiner carried over from `reduce`.
- /// TODO: Generalize this for non-integer types, add atomic region.
- static mlir::omp::ReductionDeclareOp createReductionDecl(
- fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
- Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
- mlir::Type type, mlir::Location loc) {
- mlir::OpBuilder::InsertionGuard guard(builder);
- mlir::ModuleOp module = builder.getModule();
-
- auto decl =
- module.lookupSymbol<mlir::omp::ReductionDeclareOp>(reductionOpName);
- if (decl)
- return decl;
-
- mlir::OpBuilder modBuilder(module.getBodyRegion());
-
- decl = modBuilder.create<mlir::omp::ReductionDeclareOp>(
- loc, reductionOpName, type);
- builder.createBlock(&decl.getInitializerRegion(),
- decl.getInitializerRegion().end(), {type}, {loc});
- builder.setInsertionPointToEnd(&decl.getInitializerRegion().back());
- mlir::Value init = getIntrinsicOpInitValue(loc, type, intrinsicOp, builder);
- builder.create<mlir::omp::YieldOp>(loc, init);
-
- builder.createBlock(&decl.getReductionRegion(),
- decl.getReductionRegion().end(), {type, type},
- {loc, loc});
-
- builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
- mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
- mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
-
- mlir::Value reductionOp;
- switch (intrinsicOp) {
- case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
+ case ReductionIdentifier::ADD:
reductionOp =
getReductionOperation<mlir::arith::AddFOp, mlir::arith::AddIOp>(
builder, type, loc, op1, op2);
break;
- case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
+ case ReductionIdentifier::MULTIPLY:
reductionOp =
getReductionOperation<mlir::arith::MulFOp, mlir::arith::MulIOp>(
builder, type, loc, op1, op2);
break;
- case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: {
+ case ReductionIdentifier::AND: {
mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1);
mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2);
@@ -1036,7 +1002,7 @@ public:
reductionOp = builder.createConvert(loc, type, andiOp);
break;
}
- case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: {
+ case ReductionIdentifier::OR: {
mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1);
mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2);
@@ -1045,7 +1011,7 @@ public:
reductionOp = builder.createConvert(loc, type, oriOp);
break;
}
- case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: {
+ case ReductionIdentifier::EQV: {
mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1);
mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2);
@@ -1055,7 +1021,7 @@ public:
reductionOp = builder.createConvert(loc, type, cmpiOp);
break;
}
- case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: {
+ case ReductionIdentifier::NEQV: {
mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1);
mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2);
@@ -1069,18 +1035,59 @@ public:
TODO(loc, "Reduction of some intrinsic operators is not supported");
}
+ return reductionOp;
+ }
+
+ /// Creates an OpenMP reduction declaration and inserts it into the provided
+ /// symbol table. The declaration has a constant initializer with the neutral
+ /// value `initValue`, and the reduction combiner carried over from `reduce`.
+ /// TODO: Generalize this for non-integer types, add atomic region.
+ static mlir::omp::ReductionDeclareOp createReductionDecl(
+ fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
+ const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) {
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ mlir::ModuleOp module = builder.getModule();
+
+ auto decl =
+ module.lookupSymbol<mlir::omp::ReductionDeclareOp>(reductionOpName);
+ if (decl)
+ return decl;
+
+ mlir::OpBuilder modBuilder(module.getBodyRegion());
+
+ decl = modBuilder.create<mlir::omp::ReductionDeclareOp>(
+ loc, reductionOpName, type);
+ builder.createBlock(&decl.getInitializerRegion(),
+ decl.getInitializerRegion().end(), {type}, {loc});
+ builder.setInsertionPointToEnd(&decl.getInitializerRegion().back());
+ mlir::Value init = getReductionInitValue(loc, type, redId, builder);
+ builder.create<mlir::omp::YieldOp>(loc, init);
+
+ builder.createBlock(&decl.getReductionRegion(),
+ decl.getReductionRegion().end(), {type, type},
+ {loc, loc});
+
+ builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
+ mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
+ mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
+
+ mlir::Value reductionOp =
+ createScalarCombiner(builder, loc, redId, type, op1, op2);
builder.create<mlir::omp::YieldOp>(loc, reductionOp);
+
return decl;
}
/// Creates a reduction declaration and associates it with an OpenMP block
/// directive.
- static void addReductionDecl(
- mlir::Location currentLocation,
- Fortran::lower::AbstractConverter &converter,
- const Fortran::parser::OmpReductionClause &reduction,
- llvm::SmallVectorImpl<mlir::Value> &reductionVars,
- llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols) {
+ static void
+ addReductionDecl(mlir::Location currentLocation,
+ Fortran::lower::AbstractConverter &converter,
+ const Fortran::parser::OmpReductionClause &reduction,
+ llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+ llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
+ llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
+ *reductionSymbols = nullptr) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
mlir::omp::ReductionDeclareOp decl;
const auto &redOperator{
@@ -1092,15 +1099,15 @@ public:
const auto &intrinsicOp{
std::get<Fortran::parser::DefinedOperator::IntrinsicOperator>(
redDefinedOp->u)};
- switch (intrinsicOp) {
- case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
- case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+ ReductionIdentifier redId = getReductionType(intrinsicOp);
+ switch (redId) {
+ case ReductionIdentifier::ADD:
+ case ReductionIdentifier::MULTIPLY:
+ case ReductionIdentifier::AND:
+ case ReductionIdentifier::EQV:
+ case ReductionIdentifier::OR:
+ case ReductionIdentifier::NEQV:
break;
-
default:
TODO(currentLocation,
"Reduction of some intrinsic operators is not supported");
@@ -1110,6 +1117,8 @@ public:
if (const auto *name{
Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
+ if (reductionSymbols)
+ reductionSymbols->push_back(symbol);
mlir::Value symVal = converter.getSymbolAddress(*symbol);
if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
symVal = declOp.getBase();
@@ -1120,11 +1129,11 @@ public:
decl = createReductionDecl(
firOpBuilder,
getReductionName(intrinsicOp, firOpBuilder.getI1Type()),
- intrinsicOp, redType, currentLocation);
+ redId, redType, currentLocation);
else if (redType.isIntOrIndexOrFloat()) {
decl = createReductionDecl(firOpBuilder,
getReductionName(intrinsicOp, redType),
- intrinsicOp, redType, currentLocation);
+ redId, redType, currentLocation);
} else {
TODO(currentLocation, "Reduction of some types is not supported");
}
@@ -1138,10 +1147,14 @@ public:
&redOperator.u)) {
if (ReductionProcessor::supportedIntrinsicProcReduction(
*reductionIntrinsic)) {
+ ReductionProcessor::ReductionIdentifier redId =
+ ReductionProcessor::getReductionType(*reductionIntrinsic);
for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
if (const auto *name{
Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
+ if (reductionSymbols)
+ reductionSymbols->push_back(symbol);
mlir::Value symVal = converter.getSymbolAddress(*symbol);
if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
symVal = declOp.getBase();
@@ -1154,7 +1167,7 @@ public:
firOpBuilder,
getReductionName(getRealName(*reductionIntrinsic).ToString(),
redType),
- *reductionIntrinsic, redType, currentLocation);
+ redId, redType, currentLocation);
reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
firOpBuilder.getContext(), decl.getSymName()));
}
@@ -1845,7 +1858,6 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
bool ClauseProcessor::processMap(
mlir::Location currentLocation, const llvm::omp::Directive &directive,
- Fortran::semantics::SemanticsContext &semanticsContext,
Fortran::lower::StatementContext &stmtCtx,
llvm::SmallVectorImpl<mlir::Value> &mapOperands,
llvm::SmallVectorImpl<mlir::Type> *mapSymTypes,
@@ -1907,7 +1919,7 @@ bool ClauseProcessor::processMap(
Fortran::lower::gatherDataOperandAddrAndBounds<
Fortran::parser::OmpObject, mlir::omp::DataBoundsOp,
mlir::omp::DataBoundsType>(
- converter, firOpBuilder, semanticsContext, stmtCtx, ompObject,
+ converter, firOpBuilder, semaCtx, stmtCtx, ompObject,
clauseLocation, asFortran, bounds, treatIndexAsSection);
auto origSymbol =
@@ -1942,13 +1954,16 @@ bool ClauseProcessor::processMap(
bool ClauseProcessor::processReduction(
mlir::Location currentLocation,
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
- llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols) const {
+ llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
+ llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *reductionSymbols)
+ const {
return findRepeatableClause<ClauseTy::Reduction>(
[&](const ClauseTy::Reduction *reductionClause,
const Fortran::parser::CharBlock &) {
ReductionProcessor rp;
rp.addReductionDecl(currentLocation, converter, reductionClause->v,
- reductionVars, reductionDeclSymbols);
+ reductionVars, reductionDeclSymbols,
+ reductionSymbols);
});
}
@@ -2012,7 +2027,6 @@ bool ClauseProcessor::processUseDevicePtr(
template <typename T>
bool ClauseProcessor::processMotionClauses(
- Fortran::semantics::SemanticsContext &semanticsContext,
Fortran::lower::StatementContext &stmtCtx,
llvm::SmallVectorImpl<mlir::Value> &mapOperands) {
return findRepeatableClause<T>(
@@ -2036,7 +2050,7 @@ bool ClauseProcessor::processMotionClauses(
Fortran::lower::gatherDataOperandAddrAndBounds<
Fortran::parser::OmpObject, mlir::omp::DataBoundsOp,
mlir::omp::DataBoundsType>(
- converter, firOpBuilder, semanticsContext, stmtCtx, ompObject,
+ converter, firOpBuilder, semaCtx, stmtCtx, ompObject,
clauseLocation, asFortran, bounds, treatIndexAsSection);
auto origSymbol =
@@ -2275,8 +2289,9 @@ struct OpWithBodyGenInfo {
mlir::Operation *)>;
OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
mlir::Location loc, Fortran::lower::pft::Evaluation &eval)
- : converter(converter), loc(loc), eval(eval) {}
+ : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval) {}
OpWithBodyGenInfo &setGenNested(bool value) {
genNested = value;
@@ -2298,6 +2313,14 @@ struct OpWithBodyGenInfo {
return *this;
}
+ OpWithBodyGenInfo &
+ setReductions(llvm::SmallVector<const Fortran::semantics::Symbol *> *value1,
+ llvm::SmallVector<mlir::Type> *value2) {
+ reductionSymbols = value1;
+ reductionTypes = value2;
+ return *this;
+ }
+
OpWithBodyGenInfo &setGenRegionEntryCb(GenOMPRegionEntryCBFn value) {
genRegionEntryCB = value;
return *this;
@@ -2305,6 +2328,8 @@ struct OpWithBodyGenInfo {
/// [inout] converter to use for the clauses.
Fortran::lower::AbstractConverter &converter;
+ /// [in] Semantics context
+ Fortran::semantics::SemanticsContext &semaCtx;
/// [in] location in source code.
mlir::Location loc;
/// [in] current PFT node/evaluation.
@@ -2317,6 +2342,11 @@ struct OpWithBodyGenInfo {
const Fortran::parser::OmpClauseList *clauses = nullptr;
/// [in] if provided, processes the construct's data-sharing attributes.
DataSharingProcessor *dsp = nullptr;
+ /// [in] if provided, list of reduction symbols
+ llvm::SmallVector<const Fortran::semantics::Symbol *> *reductionSymbols =
+ nullptr;
+ /// [in] if provided, list of reduction types
+ llvm::SmallVector<mlir::Type> *reductionTypes = nullptr;
  /// [in] if provided, emits the op's region entry. Otherwise, an empty block
/// is created in the region.
GenOMPRegionEntryCBFn genRegionEntryCB = nullptr;
@@ -2378,7 +2408,8 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
threadPrivatizeVars(info.converter, info.eval);
if (info.clauses) {
firOpBuilder.setInsertionPoint(marker);
- ClauseProcessor(info.converter, *info.clauses).processCopyin();
+ ClauseProcessor(info.converter, info.semaCtx, *info.clauses)
+ .processCopyin();
}
}
@@ -2458,6 +2489,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
static void genBodyOfTargetDataOp(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::omp::DataOp &dataOp,
const llvm::SmallVector<mlir::Type> &useDeviceTypes,
@@ -2531,26 +2563,29 @@ static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) {
static mlir::omp::MasterOp
genMasterOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation) {
return genOpWithBody<mlir::omp::MasterOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested),
/*resultTypes=*/mlir::TypeRange());
}
static mlir::omp::OrderedRegionOp
genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation) {
return genOpWithBody<mlir::omp::OrderedRegionOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested),
/*simd=*/false);
}
static mlir::omp::ParallelOp
genParallelOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
@@ -2561,8 +2596,9 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> allocateOperands, allocatorOperands,
reductionVars;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
+ llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Parallel,
ifClauseOperand);
cp.processNumThreads(stmtCtx, numThreadsClauseOperand);
@@ -2570,13 +2606,33 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
cp.processDefault();
cp.processAllocate(allocatorOperands, allocateOperands);
if (!outerCombined)
- cp.processReduction(currentLocation, reductionVars, reductionDeclSymbols);
+ cp.processReduction(currentLocation, reductionVars, reductionDeclSymbols,
+ &reductionSymbols);
+
+ llvm::SmallVector<mlir::Type> reductionTypes;
+ reductionTypes.reserve(reductionVars.size());
+ llvm::transform(reductionVars, std::back_inserter(reductionTypes),
+ [](mlir::Value v) { return v.getType(); });
+
+ auto reductionCallback = [&](mlir::Operation *op) {
+ llvm::SmallVector<mlir::Location> locs(reductionVars.size(),
+ currentLocation);
+ auto block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {},
+ reductionTypes, locs);
+ for (auto [arg, prv] :
+ llvm::zip_equal(reductionSymbols, block->getArguments())) {
+ converter.bindSymbol(*arg, prv);
+ }
+ return reductionSymbols;
+ };
return genOpWithBody<mlir::omp::ParallelOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setOuterCombined(outerCombined)
- .setClauses(&clauseList),
+ .setClauses(&clauseList)
+ .setReductions(&reductionSymbols, &reductionTypes)
+ .setGenRegionEntryCb(reductionCallback),
/*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
numThreadsClauseOperand, allocateOperands, allocatorOperands,
reductionVars,
@@ -2589,19 +2645,21 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
static mlir::omp::SectionOp
genSectionOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &sectionsClauseList) {
// Currently only private/firstprivate clause is handled, and
// all privatization is done within `omp.section` operations.
return genOpWithBody<mlir::omp::SectionOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setClauses(&sectionsClauseList));
}
static mlir::omp::SingleOp
genSingleOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &beginClauseList,
@@ -2609,15 +2667,15 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> allocateOperands, allocatorOperands;
mlir::UnitAttr nowaitAttr;
- ClauseProcessor cp(converter, beginClauseList);
+ ClauseProcessor cp(converter, semaCtx, beginClauseList);
cp.processAllocate(allocatorOperands, allocateOperands);
cp.processTODO<Fortran::parser::OmpClause::Copyprivate>(
currentLocation, llvm::omp::Directive::OMPD_single);
- ClauseProcessor(converter, endClauseList).processNowait(nowaitAttr);
+ ClauseProcessor(converter, semaCtx, endClauseList).processNowait(nowaitAttr);
return genOpWithBody<mlir::omp::SingleOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setClauses(&beginClauseList),
allocateOperands, allocatorOperands, nowaitAttr);
@@ -2625,6 +2683,7 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
static mlir::omp::TaskOp
genTaskOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList) {
@@ -2635,7 +2694,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> allocateOperands, allocatorOperands,
dependOperands;
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Task,
ifClauseOperand);
cp.processAllocate(allocatorOperands, allocateOperands);
@@ -2651,7 +2710,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
currentLocation, llvm::omp::Directive::OMPD_task);
return genOpWithBody<mlir::omp::TaskOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setClauses(&clauseList),
ifClauseOperand, finalClauseOperand, untiedAttr, mergeableAttr,
@@ -2666,16 +2725,17 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
static mlir::omp::TaskGroupOp
genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList) {
llvm::SmallVector<mlir::Value> allocateOperands, allocatorOperands;
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processAllocate(allocatorOperands, allocateOperands);
cp.processTODO<Fortran::parser::OmpClause::TaskReduction>(
currentLocation, llvm::omp::Directive::OMPD_taskgroup);
return genOpWithBody<mlir::omp::TaskGroupOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setClauses(&clauseList),
/*task_reduction_vars=*/mlir::ValueRange(),
@@ -2684,9 +2744,9 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
static mlir::omp::DataOp
genDataOp(Fortran::lower::AbstractConverter &converter,
- Fortran::lower::pft::Evaluation &eval,
- Fortran::semantics::SemanticsContext &semanticsContext,
- bool genNested, mlir::Location currentLocation,
+ Fortran::semantics::SemanticsContext &semaCtx,
+ Fortran::lower::pft::Evaluation &eval, bool genNested,
+ mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList) {
Fortran::lower::StatementContext stmtCtx;
mlir::Value ifClauseOperand, deviceOperand;
@@ -2696,7 +2756,7 @@ genDataOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Location> useDeviceLocs;
llvm::SmallVector<const Fortran::semantics::Symbol *> useDeviceSymbols;
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetData,
ifClauseOperand);
cp.processDevice(stmtCtx, deviceOperand);
@@ -2705,20 +2765,21 @@ genDataOp(Fortran::lower::AbstractConverter &converter,
cp.processUseDeviceAddr(deviceAddrOperands, useDeviceTypes, useDeviceLocs,
useDeviceSymbols);
cp.processMap(currentLocation, llvm::omp::Directive::OMPD_target_data,
- semanticsContext, stmtCtx, mapOperands);
+ stmtCtx, mapOperands);
auto dataOp = converter.getFirOpBuilder().create<mlir::omp::DataOp>(
currentLocation, ifClauseOperand, deviceOperand, devicePtrOperands,
deviceAddrOperands, mapOperands);
- genBodyOfTargetDataOp(converter, eval, genNested, dataOp, useDeviceTypes,
- useDeviceLocs, useDeviceSymbols, currentLocation);
+ genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, dataOp,
+ useDeviceTypes, useDeviceLocs, useDeviceSymbols,
+ currentLocation);
return dataOp;
}
template <typename OpTy>
static OpTy
genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -2745,33 +2806,34 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
return nullptr;
}
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processIf(directiveName, ifClauseOperand);
cp.processDevice(stmtCtx, deviceOperand);
cp.processNowait(nowaitAttr);
if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) {
- cp.processMotionClauses<Fortran::parser::OmpClause::To>(
- semanticsContext, stmtCtx, mapOperands);
- cp.processMotionClauses<Fortran::parser::OmpClause::From>(
- semanticsContext, stmtCtx, mapOperands);
+ cp.processMotionClauses<Fortran::parser::OmpClause::To>(stmtCtx,
+ mapOperands);
+ cp.processMotionClauses<Fortran::parser::OmpClause::From>(stmtCtx,
+ mapOperands);
} else {
- cp.processMap(currentLocation, directive, semanticsContext, stmtCtx,
- mapOperands);
+ cp.processMap(currentLocation, directive, stmtCtx, mapOperands);
}
cp.processTODO<Fortran::parser::OmpClause::Depend>(currentLocation,
directive);
return firOpBuilder.create<OpTy>(currentLocation, ifClauseOperand,
- deviceOperand, nowaitAttr, mapOperands);
+ deviceOperand, nullptr, mlir::ValueRange(),
+ nowaitAttr, mapOperands);
}
// This functions creates a block for the body of the targetOp's region. It adds
// all the symbols present in mapSymbols as block arguments to this block.
static void genBodyOfTargetOp(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::omp::TargetOp &targetOp,
const llvm::SmallVector<mlir::Type> &mapSymTypes,
@@ -2923,9 +2985,9 @@ static void genBodyOfTargetOp(
static mlir::omp::TargetOp
genTargetOp(Fortran::lower::AbstractConverter &converter,
- Fortran::lower::pft::Evaluation &eval,
- Fortran::semantics::SemanticsContext &semanticsContext,
- bool genNested, mlir::Location currentLocation,
+ Fortran::semantics::SemanticsContext &semaCtx,
+ Fortran::lower::pft::Evaluation &eval, bool genNested,
+ mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
llvm::omp::Directive directive, bool outerCombined = false) {
Fortran::lower::StatementContext stmtCtx;
@@ -2936,14 +2998,14 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Location> mapSymLocs;
llvm::SmallVector<const Fortran::semantics::Symbol *> mapSymbols;
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Target,
ifClauseOperand);
cp.processDevice(stmtCtx, deviceOperand);
cp.processThreadLimit(stmtCtx, threadLimitOperand);
cp.processNowait(nowaitAttr);
- cp.processMap(currentLocation, directive, semanticsContext, stmtCtx,
- mapOperands, &mapSymTypes, &mapSymLocs, &mapSymbols);
+ cp.processMap(currentLocation, directive, stmtCtx, mapOperands, &mapSymTypes,
+ &mapSymLocs, &mapSymbols);
cp.processTODO<Fortran::parser::OmpClause::Private,
Fortran::parser::OmpClause::Depend,
Fortran::parser::OmpClause::Firstprivate,
@@ -3029,9 +3091,9 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
auto targetOp = converter.getFirOpBuilder().create<mlir::omp::TargetOp>(
currentLocation, ifClauseOperand, deviceOperand, threadLimitOperand,
- nowaitAttr, mapOperands);
+ nullptr, mlir::ValueRange(), nowaitAttr, mapOperands);
- genBodyOfTargetOp(converter, eval, genNested, targetOp, mapSymTypes,
+ genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSymTypes,
mapSymLocs, mapSymbols, currentLocation);
return targetOp;
@@ -3039,6 +3101,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
static mlir::omp::TeamsOp
genTeamsOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
const Fortran::parser::OmpClauseList &clauseList,
@@ -3049,7 +3112,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
reductionVars;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
- ClauseProcessor cp(converter, clauseList);
+ ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Teams,
ifClauseOperand);
cp.processAllocate(allocatorOperands, allocateOperands);
@@ -3060,7 +3123,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
currentLocation, llvm::omp::Directive::OMPD_teams);
return genOpWithBody<mlir::omp::TeamsOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval)
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setOuterCombined(outerCombined)
.setClauses(&clauseList),
@@ -3077,6 +3140,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
/// 'declare target' directive and return the intended device type for them.
static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct,
llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
@@ -3102,7 +3166,7 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
eval.getOwningProcedure()->getSubprogramSymbol());
}
- ClauseProcessor cp(converter, *clauseList);
+ ClauseProcessor cp(converter, semaCtx, *clauseList);
cp.processTo(symbolAndClause);
cp.processEnter(symbolAndClause);
cp.processLink(symbolAndClause);
@@ -3118,12 +3182,13 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
static std::optional<mlir::omp::DeclareTargetDeviceType>
getDeclareTargetFunctionDevice(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPDeclareTargetConstruct
&declareTargetConstruct) {
llvm::SmallVector<DeclareTargetCapturePair, 0> symbolAndClause;
mlir::omp::DeclareTargetDeviceType deviceType = getDeclareTargetInfo(
- converter, eval, declareTargetConstruct, symbolAndClause);
+ converter, semaCtx, eval, declareTargetConstruct, symbolAndClause);
// Return the device type only if at least one of the targets for the
// directive is a function or subroutine
@@ -3145,9 +3210,8 @@ getDeclareTargetFunctionDevice(
static void
genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter,
- Fortran::lower::pft::Evaluation &eval,
- Fortran::semantics::SemanticsContext &semanticsContext,
- bool genNested,
+ Fortran::semantics::SemanticsContext &semaCtx,
+ Fortran::lower::pft::Evaluation &eval, bool genNested,
const Fortran::parser::OpenMPSimpleStandaloneConstruct
&simpleStandaloneConstruct) {
const auto &directive =
@@ -3165,7 +3229,7 @@ genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter,
firOpBuilder.create<mlir::omp::BarrierOp>(currentLocation);
break;
case llvm::omp::Directive::OMPD_taskwait:
- ClauseProcessor(converter, opClauseList)
+ ClauseProcessor(converter, semaCtx, opClauseList)
.processTODO<Fortran::parser::OmpClause::Depend,
Fortran::parser::OmpClause::Nowait>(
currentLocation, llvm::omp::Directive::OMPD_taskwait);
@@ -3175,20 +3239,20 @@ genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter,
firOpBuilder.create<mlir::omp::TaskyieldOp>(currentLocation);
break;
case llvm::omp::Directive::OMPD_target_data:
- genDataOp(converter, eval, semanticsContext, genNested, currentLocation,
+ genDataOp(converter, semaCtx, eval, genNested, currentLocation,
opClauseList);
break;
case llvm::omp::Directive::OMPD_target_enter_data:
genEnterExitUpdateDataOp<mlir::omp::EnterDataOp>(
- converter, semanticsContext, currentLocation, opClauseList);
+ converter, semaCtx, currentLocation, opClauseList);
break;
case llvm::omp::Directive::OMPD_target_exit_data:
genEnterExitUpdateDataOp<mlir::omp::ExitDataOp>(
- converter, semanticsContext, currentLocation, opClauseList);
+ converter, semaCtx, currentLocation, opClauseList);
break;
case llvm::omp::Directive::OMPD_target_update:
genEnterExitUpdateDataOp<mlir::omp::UpdateDataOp>(
- converter, semanticsContext, currentLocation, opClauseList);
+ converter, semaCtx, currentLocation, opClauseList);
break;
case llvm::omp::Directive::OMPD_ordered:
TODO(currentLocation, "OMPD_ordered");
@@ -3197,6 +3261,7 @@ genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter,
static void
genOmpFlush(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPFlushConstruct &flushConstruct) {
llvm::SmallVector<mlir::Value, 4> operandRange;
@@ -3216,19 +3281,19 @@ genOmpFlush(Fortran::lower::AbstractConverter &converter,
static void
genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) {
std::visit(
Fortran::common::visitors{
[&](const Fortran::parser::OpenMPSimpleStandaloneConstruct
&simpleStandaloneConstruct) {
- genOmpSimpleStandalone(converter, eval, semanticsContext,
+ genOmpSimpleStandalone(converter, semaCtx, eval,
/*genNested=*/true,
simpleStandaloneConstruct);
},
[&](const Fortran::parser::OpenMPFlushConstruct &flushConstruct) {
- genOmpFlush(converter, eval, flushConstruct);
+ genOmpFlush(converter, semaCtx, eval, flushConstruct);
},
[&](const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) {
TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct");
@@ -3289,6 +3354,7 @@ genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
static void
createSimdLoop(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
llvm::omp::Directive ompDirective,
const Fortran::parser::OmpClauseList &loopOpClauseList,
@@ -3307,7 +3373,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
mlir::IntegerAttr simdlenClauseOperand, safelenClauseOperand;
std::size_t loopVarTypeSize;
- ClauseProcessor cp(converter, loopOpClauseList);
+ ClauseProcessor cp(converter, semaCtx, loopOpClauseList);
cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
loopVarTypeSize);
cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
@@ -3340,13 +3406,14 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
};
createBodyOfOp<mlir::omp::SimdLoopOp>(
- simdLoopOp, OpWithBodyGenInfo(converter, loc, *nestedEval)
+ simdLoopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
.setClauses(&loopOpClauseList)
.setDataSharingProcessor(&dsp)
.setGenRegionEntryCb(ivCallback));
}
static void createWsLoop(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
llvm::omp::Directive ompDirective,
const Fortran::parser::OmpClauseList &beginClauseList,
@@ -3369,7 +3436,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
mlir::omp::ScheduleModifierAttr scheduleModClauseOperand;
std::size_t loopVarTypeSize;
- ClauseProcessor cp(converter, beginClauseList);
+ ClauseProcessor cp(converter, semaCtx, beginClauseList);
cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
loopVarTypeSize);
cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
@@ -3409,7 +3476,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
// <...>
// !$omp end do nowait
if (endClauseList) {
- if (ClauseProcessor(converter, *endClauseList)
+ if (ClauseProcessor(converter, semaCtx, *endClauseList)
.processNowait(nowaitClauseOperand))
wsLoopOp.setNowaitAttr(nowaitClauseOperand);
}
@@ -3422,7 +3489,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
};
createBodyOfOp<mlir::omp::WsLoopOp>(
- wsLoopOp, OpWithBodyGenInfo(converter, loc, *nestedEval)
+ wsLoopOp, OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval)
.setClauses(&beginClauseList)
.setDataSharingProcessor(&dsp)
.setGenRegionEntryCb(ivCallback));
@@ -3430,10 +3497,11 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
static void createSimdWsLoop(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, llvm::omp::Directive ompDirective,
const Fortran::parser::OmpClauseList &beginClauseList,
const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) {
- ClauseProcessor cp(converter, beginClauseList);
+ ClauseProcessor cp(converter, semaCtx, beginClauseList);
cp.processTODO<
Fortran::parser::OmpClause::Aligned, Fortran::parser::OmpClause::Allocate,
Fortran::parser::OmpClause::Linear, Fortran::parser::OmpClause::Safelen,
@@ -3447,13 +3515,13 @@ static void createSimdWsLoop(
// When support for vectorization is enabled, then we need to add handling of
// if clause. Currently if clause can be skipped because we always assume
// SIMD length = 1.
- createWsLoop(converter, eval, ompDirective, beginClauseList, endClauseList,
- loc);
+ createWsLoop(converter, semaCtx, eval, ompDirective, beginClauseList,
+ endClauseList, loc);
}
static void genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPLoopConstruct &loopConstruct) {
const auto &beginLoopDirective =
@@ -3485,14 +3553,14 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
if ((llvm::omp::allTargetSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
validDirective = true;
- genTargetOp(converter, eval, semanticsContext, /*genNested=*/false,
+ genTargetOp(converter, semaCtx, eval, /*genNested=*/false,
currentLocation, loopOpClauseList, ompDirective,
/*outerCombined=*/true);
}
if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
validDirective = true;
- genTeamsOp(converter, eval, /*genNested=*/false, currentLocation,
+ genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation,
loopOpClauseList,
/*outerCombined=*/true);
}
@@ -3503,8 +3571,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
validDirective = true;
- genParallelOp(converter, eval, /*genNested=*/false, currentLocation,
- loopOpClauseList,
+ genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+ currentLocation, loopOpClauseList,
/*outerCombined=*/true);
}
}
@@ -3519,25 +3587,25 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
if (llvm::omp::allDoSimdSet.test(ompDirective)) {
// 2.9.3.2 Workshare SIMD construct
- createSimdWsLoop(converter, eval, ompDirective, loopOpClauseList,
+ createSimdWsLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
endClauseList, currentLocation);
} else if (llvm::omp::allSimdSet.test(ompDirective)) {
// 2.9.3.1 SIMD construct
- createSimdLoop(converter, eval, ompDirective, loopOpClauseList,
+ createSimdLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
currentLocation);
} else {
- createWsLoop(converter, eval, ompDirective, loopOpClauseList, endClauseList,
- currentLocation);
+ createWsLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
+ endClauseList, currentLocation);
}
- genOpenMPReduction(converter, loopOpClauseList);
+ genOpenMPReduction(converter, semaCtx, loopOpClauseList);
}
static void
genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPBlockConstruct &blockConstruct) {
const auto &beginBlockDirective =
@@ -3586,37 +3654,38 @@ genOMP(Fortran::lower::AbstractConverter &converter,
mlir::Location currentLocation = converter.genLocation(directive.source);
switch (directive.v) {
case llvm::omp::Directive::OMPD_master:
- genMasterOp(converter, eval, /*genNested=*/true, currentLocation);
+ genMasterOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation);
break;
case llvm::omp::Directive::OMPD_ordered:
- genOrderedRegionOp(converter, eval, /*genNested=*/true, currentLocation);
+ genOrderedRegionOp(converter, semaCtx, eval, /*genNested=*/true,
+ currentLocation);
break;
case llvm::omp::Directive::OMPD_parallel:
- genParallelOp(converter, eval, /*genNested=*/true, currentLocation,
+ genParallelOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
beginClauseList);
break;
case llvm::omp::Directive::OMPD_single:
- genSingleOp(converter, eval, /*genNested=*/true, currentLocation,
+ genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
beginClauseList, endClauseList);
break;
case llvm::omp::Directive::OMPD_target:
- genTargetOp(converter, eval, semanticsContext, /*genNested=*/true,
- currentLocation, beginClauseList, directive.v);
+ genTargetOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
+ beginClauseList, directive.v);
break;
case llvm::omp::Directive::OMPD_target_data:
- genDataOp(converter, eval, semanticsContext, /*genNested=*/true,
- currentLocation, beginClauseList);
+ genDataOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
+ beginClauseList);
break;
case llvm::omp::Directive::OMPD_task:
- genTaskOp(converter, eval, /*genNested=*/true, currentLocation,
+ genTaskOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
beginClauseList);
break;
case llvm::omp::Directive::OMPD_taskgroup:
- genTaskGroupOp(converter, eval, /*genNested=*/true, currentLocation,
- beginClauseList);
+ genTaskGroupOp(converter, semaCtx, eval, /*genNested=*/true,
+ currentLocation, beginClauseList);
break;
case llvm::omp::Directive::OMPD_teams:
- genTeamsOp(converter, eval, /*genNested=*/true, currentLocation,
+ genTeamsOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
beginClauseList,
/*outerCombined=*/false);
break;
@@ -3628,23 +3697,21 @@ genOMP(Fortran::lower::AbstractConverter &converter,
break;
}
- if (singleDirective) {
- genOpenMPReduction(converter, beginClauseList);
+ if (singleDirective)
return;
- }
// Codegen for combined directives
bool combinedDirective = false;
if ((llvm::omp::allTargetSet & llvm::omp::blockConstructSet)
.test(directive.v)) {
- genTargetOp(converter, eval, semanticsContext, /*genNested=*/false,
- currentLocation, beginClauseList, directive.v,
+ genTargetOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation,
+ beginClauseList, directive.v,
/*outerCombined=*/true);
combinedDirective = true;
}
if ((llvm::omp::allTeamsSet & llvm::omp::blockConstructSet)
.test(directive.v)) {
- genTeamsOp(converter, eval, /*genNested=*/false, currentLocation,
+ genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation,
beginClauseList);
combinedDirective = true;
}
@@ -3652,8 +3719,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
.test(directive.v)) {
bool outerCombined =
directive.v != llvm::omp::Directive::OMPD_target_parallel;
- genParallelOp(converter, eval, /*genNested=*/false, currentLocation,
- beginClauseList, outerCombined);
+ genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+ currentLocation, beginClauseList, outerCombined);
combinedDirective = true;
}
if ((llvm::omp::workShareSet & llvm::omp::blockConstructSet)
@@ -3667,13 +3734,12 @@ genOMP(Fortran::lower::AbstractConverter &converter,
")");
genNestedEvaluations(converter, eval);
- genOpenMPReduction(converter, beginClauseList);
}
static void
genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPCriticalConstruct &criticalConstruct) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -3688,7 +3754,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
}
const auto &clauseList = std::get<Fortran::parser::OmpClauseList>(cd.t);
- ClauseProcessor(converter, clauseList).processHint(hintClauseOp);
+ ClauseProcessor(converter, semaCtx, clauseList).processHint(hintClauseOp);
mlir::omp::CriticalOp criticalOp = [&]() {
if (name.empty()) {
@@ -3706,14 +3772,14 @@ genOMP(Fortran::lower::AbstractConverter &converter,
currentLocation, mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(),
global.getSymName()));
}();
- auto genInfo = OpWithBodyGenInfo(converter, currentLocation, eval);
+ auto genInfo = OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval);
createBodyOfOp<mlir::omp::CriticalOp>(criticalOp, genInfo);
}
static void
genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPSectionsConstruct &sectionsConstruct) {
mlir::Location currentLocation = converter.getCurrentLocation();
@@ -3726,7 +3792,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
// Process clauses before optional omp.parallel, so that new variables are
// allocated outside of the parallel region
- ClauseProcessor cp(converter, sectionsClauseList);
+ ClauseProcessor cp(converter, semaCtx, sectionsClauseList);
cp.processSectionsReduction(currentLocation);
cp.processAllocate(allocatorOperands, allocateOperands);
@@ -3736,7 +3802,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
// Parallel wrapper of PARALLEL SECTIONS construct
if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
- genParallelOp(converter, eval,
+ genParallelOp(converter, semaCtx, eval,
/*genNested=*/false, currentLocation, sectionsClauseList,
/*outerCombined=*/true);
} else {
@@ -3744,13 +3810,14 @@ genOMP(Fortran::lower::AbstractConverter &converter,
std::get<Fortran::parser::OmpEndSectionsDirective>(sectionsConstruct.t);
const auto &endSectionsClauseList =
std::get<Fortran::parser::OmpClauseList>(endSectionsDirective.t);
- ClauseProcessor(converter, endSectionsClauseList)
+ ClauseProcessor(converter, semaCtx, endSectionsClauseList)
.processNowait(nowaitClauseOperand);
}
// SECTIONS construct
genOpWithBody<mlir::omp::SectionsOp>(
- OpWithBodyGenInfo(converter, currentLocation, eval).setGenNested(false),
+ OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
+ .setGenNested(false),
/*reduction_vars=*/mlir::ValueRange(),
/*reductions=*/nullptr, allocateOperands, allocatorOperands,
nowaitClauseOperand);
@@ -3762,7 +3829,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
for (const auto &[nblock, neval] :
llvm::zip(sectionBlocks.v, eval.getNestedEvaluations())) {
symTable.pushScope();
- genSectionOp(converter, neval, /*genNested=*/true, currentLocation,
+ genSectionOp(converter, semaCtx, neval, /*genNested=*/true, currentLocation,
sectionsClauseList);
symTable.popScope();
firOpBuilder.restoreInsertionPoint(ip);
@@ -3772,7 +3839,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
static void
genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPAtomicConstruct &atomicConstruct) {
std::visit(
@@ -3817,14 +3884,14 @@ genOMP(Fortran::lower::AbstractConverter &converter,
static void genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPDeclareTargetConstruct
&declareTargetConstruct) {
llvm::SmallVector<DeclareTargetCapturePair, 0> symbolAndClause;
mlir::ModuleOp mod = converter.getFirOpBuilder().getModule();
mlir::omp::DeclareTargetDeviceType deviceType = getDeclareTargetInfo(
- converter, eval, declareTargetConstruct, symbolAndClause);
+ converter, semaCtx, eval, declareTargetConstruct, symbolAndClause);
for (const DeclareTargetCapturePair &symClause : symbolAndClause) {
mlir::Operation *op = mod.lookupSymbol(
@@ -3870,27 +3937,25 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
static void genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPConstruct &ompConstruct) {
std::visit(
Fortran::common::visitors{
[&](const Fortran::parser::OpenMPStandaloneConstruct
&standaloneConstruct) {
- genOMP(converter, symTable, semanticsContext, eval,
- standaloneConstruct);
+ genOMP(converter, symTable, semaCtx, eval, standaloneConstruct);
},
[&](const Fortran::parser::OpenMPSectionsConstruct
&sectionsConstruct) {
- genOMP(converter, symTable, semanticsContext, eval,
- sectionsConstruct);
+ genOMP(converter, symTable, semaCtx, eval, sectionsConstruct);
},
[&](const Fortran::parser::OpenMPSectionConstruct &sectionConstruct) {
// SECTION constructs are handled as a part of SECTIONS.
llvm_unreachable("Unexpected standalone OMP SECTION");
},
[&](const Fortran::parser::OpenMPLoopConstruct &loopConstruct) {
- genOMP(converter, symTable, semanticsContext, eval, loopConstruct);
+ genOMP(converter, symTable, semaCtx, eval, loopConstruct);
},
[&](const Fortran::parser::OpenMPDeclarativeAllocate
&execAllocConstruct) {
@@ -3905,16 +3970,14 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct");
},
[&](const Fortran::parser::OpenMPBlockConstruct &blockConstruct) {
- genOMP(converter, symTable, semanticsContext, eval, blockConstruct);
+ genOMP(converter, symTable, semaCtx, eval, blockConstruct);
},
[&](const Fortran::parser::OpenMPAtomicConstruct &atomicConstruct) {
- genOMP(converter, symTable, semanticsContext, eval,
- atomicConstruct);
+ genOMP(converter, symTable, semaCtx, eval, atomicConstruct);
},
[&](const Fortran::parser::OpenMPCriticalConstruct
&criticalConstruct) {
- genOMP(converter, symTable, semanticsContext, eval,
- criticalConstruct);
+ genOMP(converter, symTable, semaCtx, eval, criticalConstruct);
},
},
ompConstruct.u);
@@ -3923,7 +3986,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
static void
genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPDeclarativeConstruct &ompDeclConstruct) {
std::visit(
@@ -3943,8 +4006,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
},
[&](const Fortran::parser::OpenMPDeclareTargetConstruct
&declareTargetConstruct) {
- genOMP(converter, symTable, semanticsContext, eval,
- declareTargetConstruct);
+ genOMP(converter, symTable, semaCtx, eval, declareTargetConstruct);
},
[&](const Fortran::parser::OpenMPRequiresConstruct
&requiresConstruct) {
@@ -3978,21 +4040,21 @@ mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder,
void Fortran::lower::genOpenMPConstruct(
Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPConstruct &omp) {
symTable.pushScope();
- genOMP(converter, symTable, semanticsContext, eval, omp);
+ genOMP(converter, symTable, semaCtx, eval, omp);
symTable.popScope();
}
void Fortran::lower::genOpenMPDeclarativeConstruct(
Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPDeclarativeConstruct &omp) {
- genOMP(converter, symTable, semanticsContext, eval, omp);
+ genOMP(converter, symTable, semaCtx, eval, omp);
genNestedEvaluations(converter, eval);
}
@@ -4107,6 +4169,7 @@ void Fortran::lower::genDeclareTargetIntGlobal(
// ops in the builder (instead of a rewriter) is probably not the best approach.
void Fortran::lower::genOpenMPReduction(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
const Fortran::parser::OmpClauseList &clauseList) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -4174,7 +4237,7 @@ void Fortran::lower::genOpenMPReduction(
if (!ReductionProcessor::supportedIntrinsicProcReduction(
*reductionIntrinsic))
continue;
- ReductionProcessor::IntrinsicProc redIntrinsicProc =
+ ReductionProcessor::ReductionIdentifier redId =
ReductionProcessor::getReductionType(*reductionIntrinsic);
for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
if (const auto *name{
@@ -4195,10 +4258,8 @@ void Fortran::lower::genOpenMPReduction(
if (reductionOp == nullptr)
continue;
- if (redIntrinsicProc ==
- ReductionProcessor::IntrinsicProc::MAX ||
- redIntrinsicProc ==
- ReductionProcessor::IntrinsicProc::MIN) {
+ if (redId == ReductionProcessor::ReductionIdentifier::MAX ||
+ redId == ReductionProcessor::ReductionIdentifier::MIN) {
assert(mlir::isa<mlir::arith::SelectOp>(reductionOp) &&
"Selection Op not found in reduction intrinsic");
mlir::Operation *compareOp =
@@ -4206,13 +4267,9 @@ void Fortran::lower::genOpenMPReduction(
updateReduction(compareOp, firOpBuilder, loadVal,
reductionVal);
}
- if (redIntrinsicProc ==
- ReductionProcessor::IntrinsicProc::IOR ||
- redIntrinsicProc ==
- ReductionProcessor::IntrinsicProc::IEOR ||
- redIntrinsicProc ==
- ReductionProcessor::IntrinsicProc::IAND) {
-
+ if (redId == ReductionProcessor::ReductionIdentifier::IOR ||
+ redId == ReductionProcessor::ReductionIdentifier::IEOR ||
+ redId == ReductionProcessor::ReductionIdentifier::IAND) {
updateReduction(reductionOp, firOpBuilder, loadVal,
reductionVal);
}
@@ -4335,13 +4392,14 @@ bool Fortran::lower::isOpenMPTargetConstruct(
bool Fortran::lower::isOpenMPDeviceDeclareTarget(
Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPDeclarativeConstruct &ompDecl) {
return std::visit(
Fortran::common::visitors{
[&](const Fortran::parser::OpenMPDeclareTargetConstruct &ompReq) {
mlir::omp::DeclareTargetDeviceType targetType =
- getDeclareTargetFunctionDevice(converter, eval, ompReq)
+ getDeclareTargetFunctionDevice(converter, semaCtx, eval, ompReq)
.value_or(mlir::omp::DeclareTargetDeviceType::host);
return targetType != mlir::omp::DeclareTargetDeviceType::host;
},
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index ea2064e32a44..e144640081cb 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -333,8 +333,10 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v) {
else
type = SourceKind::Global;
- if (fir::valueHasFirAttribute(v,
- fir::GlobalOp::getTargetAttrNameStr()))
+ auto globalOpName = mlir::OperationName(
+ fir::GlobalOp::getOperationName(), defOp->getContext());
+ if (fir::valueHasFirAttribute(
+ v, fir::GlobalOp::getTargetAttrName(globalOpName)))
attributes.set(Attribute::Target);
// TODO: Take followBoxAddr into account when setting the pointer
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 141f8fcd3ab5..3cce39f5b8c7 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -271,19 +271,24 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary(
/// Create a global variable in the (read-only) data section. A global variable
/// must have a unique name to identify and reference it.
-fir::GlobalOp fir::FirOpBuilder::createGlobal(mlir::Location loc,
- mlir::Type type,
- llvm::StringRef name,
- mlir::StringAttr linkage,
- mlir::Attribute value,
- bool isConst, bool isTarget) {
+fir::GlobalOp fir::FirOpBuilder::createGlobal(
+ mlir::Location loc, mlir::Type type, llvm::StringRef name,
+ mlir::StringAttr linkage, mlir::Attribute value, bool isConst,
+ bool isTarget, fir::CUDADataAttributeAttr cudaAttr) {
auto module = getModule();
auto insertPt = saveInsertionPoint();
if (auto glob = module.lookupSymbol<fir::GlobalOp>(name))
return glob;
setInsertionPoint(module.getBody(), module.getBody()->end());
- auto glob =
- create<fir::GlobalOp>(loc, name, isConst, isTarget, type, value, linkage);
+ llvm::SmallVector<mlir::NamedAttribute> attrs;
+ if (cudaAttr) {
+ auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
+ module.getContext());
+ attrs.push_back(mlir::NamedAttribute(
+ fir::GlobalOp::getCudaAttrAttrName(globalOpName), cudaAttr));
+ }
+ auto glob = create<fir::GlobalOp>(loc, name, isConst, isTarget, type, value,
+ linkage, attrs);
restoreInsertionPoint(insertPt);
return glob;
}
@@ -291,7 +296,7 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal(mlir::Location loc,
fir::GlobalOp fir::FirOpBuilder::createGlobal(
mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst,
bool isTarget, std::function<void(FirOpBuilder &)> bodyBuilder,
- mlir::StringAttr linkage) {
+ mlir::StringAttr linkage, fir::CUDADataAttributeAttr cudaAttr) {
auto module = getModule();
auto insertPt = saveInsertionPoint();
if (auto glob = module.lookupSymbol<fir::GlobalOp>(name))
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 94f723b4bae7..4ffa303f2710 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -198,7 +198,8 @@ mlir::Value hlfir::Entity::getFirBase() const {
fir::FortranVariableOpInterface
hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
const fir::ExtendedValue &exv, llvm::StringRef name,
- fir::FortranVariableFlagsAttr flags) {
+ fir::FortranVariableFlagsAttr flags,
+ fir::CUDADataAttributeAttr cudaAttr) {
mlir::Value base = fir::getBase(exv);
assert(fir::conformsWithPassByRef(base.getType()) &&
@@ -228,7 +229,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
},
[](const auto &) {});
auto declareOp = builder.create<hlfir::DeclareOp>(
- loc, base, name, shapeOrShift, lenParams, flags);
+ loc, base, name, shapeOrShift, lenParams, flags, cudaAttr);
return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation());
}
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index 487109121db0..0cf8dfb9f784 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -14,6 +14,7 @@
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/Support/KindMapping.h"
#include "mlir/IR/AttributeSupport.h"
+#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/SmallString.h"
@@ -297,5 +298,6 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr,
void FIROpsDialect::registerAttributes() {
addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr,
LowerBoundAttr, PointIntervalAttr, RealAttr, SubclassAttr,
- UpperBoundAttr>();
+ UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr,
+ CUDALaunchBoundsAttr, CUDAClusterDimsAttr>();
}
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 483f318b649d..0a534cdb3c48 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -1325,12 +1325,14 @@ mlir::ParseResult fir::GlobalOp::parse(mlir::OpAsmParser &parser,
if (fir::GlobalOp::verifyValidLinkage(linkage))
return mlir::failure();
mlir::StringAttr linkAttr = builder.getStringAttr(linkage);
- result.addAttribute(fir::GlobalOp::getLinkageAttrNameStr(), linkAttr);
+ result.addAttribute(fir::GlobalOp::getLinkNameAttrName(result.name),
+ linkAttr);
}
// Parse the name as a symbol reference attribute.
mlir::SymbolRefAttr nameAttr;
- if (parser.parseAttribute(nameAttr, fir::GlobalOp::getSymbolAttrNameStr(),
+ if (parser.parseAttribute(nameAttr,
+ fir::GlobalOp::getSymrefAttrName(result.name),
result.attributes))
return mlir::failure();
result.addAttribute(mlir::SymbolTable::getSymbolAttrName(),
@@ -1339,7 +1341,8 @@ mlir::ParseResult fir::GlobalOp::parse(mlir::OpAsmParser &parser,
bool simpleInitializer = false;
if (mlir::succeeded(parser.parseOptionalLParen())) {
mlir::Attribute attr;
- if (parser.parseAttribute(attr, "initVal", result.attributes) ||
+ if (parser.parseAttribute(attr, getInitValAttrName(result.name),
+ result.attributes) ||
parser.parseRParen())
return mlir::failure();
simpleInitializer = true;
@@ -1348,13 +1351,15 @@ mlir::ParseResult fir::GlobalOp::parse(mlir::OpAsmParser &parser,
if (parser.parseOptionalAttrDict(result.attributes))
return mlir::failure();
- if (succeeded(parser.parseOptionalKeyword("constant"))) {
+ if (succeeded(
+ parser.parseOptionalKeyword(getConstantAttrName(result.name)))) {
// if "constant" keyword then mark this as a constant, not a variable
- result.addAttribute("constant", builder.getUnitAttr());
+ result.addAttribute(getConstantAttrName(result.name),
+ builder.getUnitAttr());
}
- if (succeeded(parser.parseOptionalKeyword("target")))
- result.addAttribute(getTargetAttrNameStr(), builder.getUnitAttr());
+ if (succeeded(parser.parseOptionalKeyword(getTargetAttrName(result.name))))
+ result.addAttribute(getTargetAttrName(result.name), builder.getUnitAttr());
mlir::Type globalType;
if (parser.parseColonType(globalType))
@@ -1382,11 +1387,16 @@ void fir::GlobalOp::print(mlir::OpAsmPrinter &p) {
p.printAttributeWithoutType(getSymrefAttr());
if (auto val = getValueOrNull())
p << '(' << val << ')';
- p.printOptionalAttrDict((*this)->getAttrs(), (*this).getAttributeNames());
- if (getOperation()->getAttr(fir::GlobalOp::getConstantAttrNameStr()))
- p << " constant";
+ // Print all other attributes that are not pretty printed here.
+ p.printOptionalAttrDict((*this)->getAttrs(), /*elideAttrs=*/{
+ getSymNameAttrName(), getSymrefAttrName(),
+ getTypeAttrName(), getConstantAttrName(),
+ getTargetAttrName(), getLinkNameAttrName(),
+ getInitValAttrName()});
+ if (getOperation()->getAttr(getConstantAttrName()))
+ p << " " << getConstantAttrName().strref();
if (getOperation()->getAttr(getTargetAttrName()))
- p << " target";
+ p << " " << getTargetAttrName().strref();
p << " : ";
p.printType(getType());
if (hasInitializationBody()) {
@@ -1410,7 +1420,7 @@ void fir::GlobalOp::build(mlir::OpBuilder &builder,
result.addAttribute(getTypeAttrName(result.name), mlir::TypeAttr::get(type));
result.addAttribute(mlir::SymbolTable::getSymbolAttrName(),
builder.getStringAttr(name));
- result.addAttribute(getSymbolAttrNameStr(),
+ result.addAttribute(getSymrefAttrName(result.name),
mlir::SymbolRefAttr::get(builder.getContext(), name));
if (isConstant)
result.addAttribute(getConstantAttrName(result.name),
@@ -1420,7 +1430,7 @@ void fir::GlobalOp::build(mlir::OpBuilder &builder,
if (initialVal)
result.addAttribute(getInitValAttrName(result.name), initialVal);
if (linkage)
- result.addAttribute(getLinkageAttrNameStr(), linkage);
+ result.addAttribute(getLinkNameAttrName(result.name), linkage);
result.attributes.append(attrs.begin(), attrs.end());
}
@@ -2279,12 +2289,11 @@ mlir::ParseResult fir::DTEntryOp::parse(mlir::OpAsmParser &parser,
// allow `methodName` or `"methodName"`
if (failed(parser.parseOptionalKeyword(&methodName))) {
mlir::StringAttr methodAttr;
- if (parser.parseAttribute(methodAttr,
- fir::DTEntryOp::getMethodAttrNameStr(),
+ if (parser.parseAttribute(methodAttr, getMethodAttrName(result.name),
result.attributes))
return mlir::failure();
} else {
- result.addAttribute(fir::DTEntryOp::getMethodAttrNameStr(),
+ result.addAttribute(getMethodAttrName(result.name),
parser.getBuilder().getStringAttr(methodName));
}
mlir::SymbolRefAttr calleeAttr;
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index ce12e6fd49c6..8bc92a991a69 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -123,14 +123,15 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
mlir::OperationState &result, mlir::Value memref,
llvm::StringRef uniq_name, mlir::Value shape,
mlir::ValueRange typeparams,
- fir::FortranVariableFlagsAttr fortran_attrs) {
+ fir::FortranVariableFlagsAttr fortran_attrs,
+ fir::CUDADataAttributeAttr cuda_attr) {
auto nameAttr = builder.getStringAttr(uniq_name);
mlir::Type inputType = memref.getType();
bool hasExplicitLbs = hasExplicitLowerBounds(shape);
mlir::Type hlfirVariableType =
getHLFIRVariableType(inputType, hasExplicitLbs);
build(builder, result, {hlfirVariableType, inputType}, memref, shape,
- typeparams, nameAttr, fortran_attrs);
+ typeparams, nameAttr, fortran_attrs, cuda_attr);
}
mlir::LogicalResult hlfir::DeclareOp::verify() {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index 603b328a6823..ad569ce3b41f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -21,6 +21,7 @@ add_flang_library(HLFIRTransforms
FIRBuilder
FIRDialectSupport
FIRSupport
+ FIRTransforms
HLFIRDialect
MLIRIR
${dialect_libs}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
index b69018560e3a..cd534bae4ad2 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
@@ -320,12 +320,16 @@ public:
mlir::Location loc = declareOp->getLoc();
mlir::Value memref = declareOp.getMemref();
fir::FortranVariableFlagsAttr fortranAttrs;
+ fir::CUDADataAttributeAttr cudaAttr;
if (auto attrs = declareOp.getFortranAttrs())
fortranAttrs =
fir::FortranVariableFlagsAttr::get(rewriter.getContext(), *attrs);
+ if (auto attr = declareOp.getCudaAttr())
+ cudaAttr = fir::CUDADataAttributeAttr::get(rewriter.getContext(), *attr);
auto firDeclareOp = rewriter.create<fir::DeclareOp>(
loc, memref.getType(), memref, declareOp.getShape(),
- declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs);
+ declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs,
+ cudaAttr);
// Propagate other attributes from hlfir.declare to fir.declare.
// OpenACC's acc.declare is one example. Right now, the propagation
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index b1165a5ee193..c2512c7df32f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -20,7 +20,7 @@
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/HLFIR/Passes.h"
-#include "flang/Optimizer/Support/Utils.h"
+#include "flang/Optimizer/Transforms/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/PatternMatch.h"
@@ -854,7 +854,7 @@ public:
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(
loc, elementType,
- llvm::APFloat::getLargest(sem, /*Negative=*/!isMax));
+ llvm::APFloat::getLargest(sem, /*Negative=*/isMax));
}
unsigned bits = elementType.getIntOrFloatBitWidth();
int64_t limitInt =
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
index 29717353da96..675314ed9da0 100644
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -573,6 +573,8 @@ static bool conflictOnLoad(llvm::ArrayRef<mlir::Operation *> reach,
for (auto *op : reach)
if (auto ld = mlir::dyn_cast<ArrayLoadOp>(op)) {
mlir::Type ldTy = ld.getMemref().getType();
+ auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
+ ld.getContext());
if (ld.getMemref() == addr) {
if (mutuallyExclusiveSliceRange(ld, st))
continue;
@@ -588,14 +590,17 @@ static bool conflictOnLoad(llvm::ArrayRef<mlir::Operation *> reach,
if (optimize && !hasPointerType(ldTy) &&
!valueMayHaveFirAttributes(
ld.getMemref(),
- {getTargetAttrName(), GlobalOp::getTargetAttrNameStr()}))
+ {getTargetAttrName(),
+ fir::GlobalOp::getTargetAttrName(globalOpName).strref()}))
continue;
return true;
} else if (hasPointerType(ldTy)) {
if (optimize && !storeHasPointerType &&
!valueMayHaveFirAttributes(
- addr, {getTargetAttrName(), GlobalOp::getTargetAttrNameStr()}))
+ addr,
+ {getTargetAttrName(),
+ fir::GlobalOp::getTargetAttrName(globalOpName).strref()}))
continue;
return true;
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index b415463075d6..86343e23c6e5 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -31,8 +31,8 @@
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
-#include "flang/Optimizer/Support/Utils.h"
#include "flang/Optimizer/Transforms/Passes.h"
+#include "flang/Optimizer/Transforms/Utils.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Matchers.h"
@@ -558,6 +558,135 @@ static mlir::FunctionType genRuntimeMinlocType(fir::FirOpBuilder &builder,
{boxRefType, boxType, boxType}, {});
}
+// Produces a loop nest for a Minloc intrinsic.
+void fir::genMinMaxlocReductionLoop(
+ fir::FirOpBuilder &builder, mlir::Value array,
+ fir::InitValGeneratorTy initVal, fir::MinlocBodyOpGeneratorTy genBody,
+ fir::AddrGeneratorTy getAddrFn, unsigned rank, mlir::Type elementType,
+ mlir::Location loc, mlir::Type maskElemType, mlir::Value resultArr,
+ bool maskMayBeLogicalScalar) {
+ mlir::IndexType idxTy = builder.getIndexType();
+
+ mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);
+
+ fir::SequenceType::Shape flatShape(rank,
+ fir::SequenceType::getUnknownExtent());
+ mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType);
+ mlir::Type boxArrTy = fir::BoxType::get(arrTy);
+ array = builder.create<fir::ConvertOp>(loc, boxArrTy, array);
+
+ mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType());
+ mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
+ mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0);
+ mlir::Value flagRef = builder.createTemporary(loc, resultElemType);
+ builder.create<fir::StoreOp>(loc, zero, flagRef);
+
+ mlir::Value init = initVal(builder, loc, elementType);
+ llvm::SmallVector<mlir::Value, Fortran::common::maxRank> bounds;
+
+ assert(rank > 0 && "rank cannot be zero");
+ mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
+
+ // Compute all the upper bounds before the loop nest.
+ // It is not strictly necessary for performance, since the loop nest
+ // does not have any store operations and any LICM optimization
+ // should be able to optimize the redundancy.
+ for (unsigned i = 0; i < rank; ++i) {
+ mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
+ auto dims =
+ builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, array, dimIdx);
+ mlir::Value len = dims.getResult(1);
+ // We use C indexing here, so len-1 as loopcount
+ mlir::Value loopCount = builder.create<mlir::arith::SubIOp>(loc, len, one);
+ bounds.push_back(loopCount);
+ }
+ // Create a loop nest consisting of OP operations.
+ // Collect the loops' induction variables into indices array,
+ // which will be used in the innermost loop to load the input
+ // array's element.
+ // The loops are generated such that the innermost loop processes
+ // the 0 dimension.
+ llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices;
+ for (unsigned i = rank; 0 < i; --i) {
+ mlir::Value step = one;
+ mlir::Value loopCount = bounds[i - 1];
+ auto loop =
+ builder.create<fir::DoLoopOp>(loc, zeroIdx, loopCount, step, false,
+ /*finalCountValue=*/false, init);
+ init = loop.getRegionIterArgs()[0];
+ indices.push_back(loop.getInductionVar());
+ // Set insertion point to the loop body so that the next loop
+ // is inserted inside the current one.
+ builder.setInsertionPointToStart(loop.getBody());
+ }
+
+ // Reverse the indices such that they are ordered as:
+ // <dim-0-idx, dim-1-idx, ...>
+ std::reverse(indices.begin(), indices.end());
+ mlir::Value reductionVal =
+ genBody(builder, loc, elementType, array, flagRef, init, indices);
+
+ // Unwind the loop nest and insert ResultOp on each level
+ // to return the updated value of the reduction to the enclosing
+ // loops.
+ for (unsigned i = 0; i < rank; ++i) {
+ auto result = builder.create<fir::ResultOp>(loc, reductionVal);
+ // Proceed to the outer loop.
+ auto loop = mlir::cast<fir::DoLoopOp>(result->getParentOp());
+ reductionVal = loop.getResult(0);
+ // Set insertion point after the loop operation that we have
+ // just processed.
+ builder.setInsertionPointAfter(loop.getOperation());
+ }
+ // End of loop nest. The insertion point is after the outermost loop.
+ if (maskMayBeLogicalScalar) {
+ if (fir::IfOp ifOp =
+ mlir::dyn_cast<fir::IfOp>(builder.getBlock()->getParentOp())) {
+ builder.create<fir::ResultOp>(loc, reductionVal);
+ builder.setInsertionPointAfter(ifOp);
+ // Redefine flagSet to escape scope of ifOp
+ flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
+ reductionVal = ifOp.getResult(0);
+ }
+ }
+
+ // Check for case where array was full of max values.
+ // flag will be 0 if mask was never true, 1 if mask was true as some point,
+ // this is needed to avoid catching cases where we didn't access any elements
+ // e.g. mask=.FALSE.
+ mlir::Value flagValue =
+ builder.create<fir::LoadOp>(loc, resultElemType, flagRef);
+ mlir::Value flagCmp = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet);
+ fir::IfOp ifMaskTrueOp =
+ builder.create<fir::IfOp>(loc, flagCmp, /*withElseRegion=*/false);
+ builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front());
+
+ mlir::Value testInit = initVal(builder, loc, elementType);
+ fir::IfOp ifMinSetOp;
+ if (elementType.isa<mlir::FloatType>()) {
+ mlir::Value cmp = builder.create<mlir::arith::CmpFOp>(
+ loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal);
+ ifMinSetOp = builder.create<fir::IfOp>(loc, cmp,
+ /*withElseRegion*/ false);
+ } else {
+ mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal);
+ ifMinSetOp = builder.create<fir::IfOp>(loc, cmp,
+ /*withElseRegion*/ false);
+ }
+ builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front());
+
+ // Load output array with 1s instead of 0s
+ for (unsigned int i = 0; i < rank; ++i) {
+ mlir::Value index = builder.createIntegerConstant(loc, idxTy, i);
+ mlir::Value resultElemAddr =
+ getAddrFn(builder, loc, resultElemType, resultArr, index);
+ builder.create<fir::StoreOp>(loc, flagSet, resultElemAddr);
+ }
+ builder.setInsertionPointAfter(ifMaskTrueOp);
+}
+
static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
mlir::func::FuncOp &funcOp, bool isMax,
unsigned rank, int maskRank,
diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp
index 013fb408214e..0481b3d41f50 100644
--- a/flang/lib/Semantics/canonicalize-omp.cpp
+++ b/flang/lib/Semantics/canonicalize-omp.cpp
@@ -90,7 +90,11 @@ private:
auto &dir{std::get<parser::OmpLoopDirective>(beginDir.t)};
nextIt = it;
- if (++nextIt != block.end()) {
+ while (++nextIt != block.end()) {
+ // Ignore compiler directives.
+ if (GetConstructIf<parser::CompilerDirective>(*nextIt))
+ continue;
+
if (auto *doCons{GetConstructIf<parser::DoConstruct>(*nextIt)}) {
if (doCons->GetLoopControl()) {
// move DoConstruct
@@ -111,12 +115,14 @@ private:
"DO loop after the %s directive must have loop control"_err_en_US,
parser::ToUpperCaseLetters(dir.source.ToString()));
}
- return; // found do-loop
+ } else {
+ messages_.Say(dir.source,
+ "A DO loop must follow the %s directive"_err_en_US,
+ parser::ToUpperCaseLetters(dir.source.ToString()));
}
+ // If we get here, we either found a loop, or issued an error message.
+ return;
}
- messages_.Say(dir.source,
- "A DO loop must follow the %s directive"_err_en_US,
- parser::ToUpperCaseLetters(dir.source.ToString()));
}
void RewriteOmpAllocations(parser::ExecutionPart &body) {
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index de71083a21ff..9845a190bc75 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -555,10 +555,11 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) {
},
symbol.details());
}
- // Sort the data component symbols by offset before emitting them
+ // Sort the data component symbols by offset before emitting them, placing
+ // the parent component first if any.
std::sort(dataComponentSymbols.begin(), dataComponentSymbols.end(),
[](const Symbol *x, const Symbol *y) {
- return x->offset() < y->offset();
+ return x->test(Symbol::Flag::ParentComp) || x->offset() < y->offset();
});
std::vector<evaluate::StructureConstructor> dataComponents;
for (const Symbol *symbol : dataComponentSymbols) {
@@ -1144,7 +1145,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
which = scalarFinalEnum_;
if (int rank{evaluate::GetRank(typeAndShape.shape())}; rank > 0) {
which = IntExpr<1>(ToInt64(which).value() + rank);
- if (!proc->dummyArguments[0].CanBePassedViaImplicitInterface()) {
+ if (dummyData.IsPassedByDescriptor(proc->IsBindC())) {
argThatMightBeDescriptor = 1;
}
if (!typeAndShape.attrs().test(evaluate::characteristics::
@@ -1187,10 +1188,14 @@ void RuntimeTableBuilder::DescribeSpecialProc(
break;
}
}
- if (argThatMightBeDescriptor != 0 &&
- !proc->dummyArguments.at(argThatMightBeDescriptor - 1)
- .CanBePassedViaImplicitInterface()) {
- isArgDescriptorSet |= 1 << (argThatMightBeDescriptor - 1);
+ if (argThatMightBeDescriptor != 0) {
+ if (const auto *dummyData{
+ std::get_if<evaluate::characteristics::DummyDataObject>(
+ &proc->dummyArguments.at(argThatMightBeDescriptor - 1).u)}) {
+ if (dummyData->IsPassedByDescriptor(proc->IsBindC())) {
+ isArgDescriptorSet |= 1 << (argThatMightBeDescriptor - 1);
+ }
+ }
}
evaluate::StructureConstructorValues values;
auto index{evaluate::ToInt64(which)};
diff --git a/flang/module/iso_c_binding.f90 b/flang/module/iso_c_binding.f90
index 9a7e68f33144..1661fd5a6dcf 100644
--- a/flang/module/iso_c_binding.f90
+++ b/flang/module/iso_c_binding.f90
@@ -47,7 +47,11 @@ module iso_c_binding
c_long_long = c_int64_t, &
c_signed_char = c_int8_t, &
c_size_t = kind(c_sizeof(1)), &
+#if __powerpc__
+ c_intmax_t = c_int64_t, &
+#else
c_intmax_t = c_int128_t, &
+#endif
c_intptr_t = c_size_t, &
c_ptrdiff_t = c_size_t
integer, parameter, public :: &
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index 36b7d207ad36..44dbac44772b 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -148,7 +148,7 @@
! CHECK-NEXT: -Rpass=<value> Report transformations performed by optimization passes whose name matches the given POSIX regular expression
! CHECK-NEXT: -R<remark> Enable the specified remark
! CHECK-NEXT: -save-temps=<value> Save intermediate compilation results.
-! CHECK-NEXT: -save-temps Save intermediate compilation results
+! CHECK-NEXT: -save-temps Alias for --save-temps=cwd
! CHECK-NEXT: -std=<value> Language standard to compile for
! CHECK-NEXT: -S Only run preprocess and compilation steps
! CHECK-NEXT: --target=<value> Generate code for the given target
diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90
index f69f43712446..b4280a454e31 100644
--- a/flang/test/Driver/driver-help.f90
+++ b/flang/test/Driver/driver-help.f90
@@ -134,7 +134,7 @@
! HELP-NEXT: -Rpass=<value> Report transformations performed by optimization passes whose name matches the given POSIX regular expression
! HELP-NEXT: -R<remark> Enable the specified remark
! HELP-NEXT: -save-temps=<value> Save intermediate compilation results.
-! HELP-NEXT: -save-temps Save intermediate compilation results
+! HELP-NEXT: -save-temps Alias for --save-temps=cwd
! HELP-NEXT: -std=<value> Language standard to compile for
! HELP-NEXT: -S Only run preprocess and compilation steps
! HELP-NEXT: --target=<value> Generate code for the given target
@@ -275,7 +275,7 @@
! HELP-FC1-NEXT: -Rpass=<value> Report transformations performed by optimization passes whose name matches the given POSIX regular expression
! HELP-FC1-NEXT: -R<remark> Enable the specified remark
! HELP-FC1-NEXT: -save-temps=<value> Save intermediate compilation results.
-! HELP-FC1-NEXT: -save-temps Save intermediate compilation results
+! HELP-FC1-NEXT: -save-temps Alias for --save-temps=cwd
! HELP-FC1-NEXT: -std=<value> Language standard to compile for
! HELP-FC1-NEXT: -S Only run preprocess and compilation steps
! HELP-FC1-NEXT: -target-cpu <value> Target a specific cpu type
diff --git a/flang/test/Driver/masm.f90 b/flang/test/Driver/masm.f90
new file mode 100644
index 000000000000..c5c44efeb3c8
--- /dev/null
+++ b/flang/test/Driver/masm.f90
@@ -0,0 +1,10 @@
+! RUN: %flang --target=x86_64-unknown-linux -masm=intel -S %s -### 2>&1 | FileCheck --check-prefix=CHECK-INTEL %s
+! RUN: %flang --target=x86_64-unknown-linux -masm=att -S %s -### 2>&1 | FileCheck --check-prefix=CHECK-ATT %s
+! RUN: not %flang --target=x86_64-unknown-linux -S -masm=somerequired %s -### 2>&1 | FileCheck --check-prefix=CHECK-SOMEREQUIRED %s
+! RUN: %flang --target=aarch64-unknown-eabi -S -masm=intel %s -### 2>&1 | FileCheck --check-prefix=CHECK-AARCH64 %s
+
+! CHECK-INTEL: "-mllvm" "-x86-asm-syntax=intel"
+! CHECK-ATT: "-mllvm" "-x86-asm-syntax=att"
+! CHECK-SOMEREQUIRED: error: unsupported argument 'somerequired' to option '-masm='
+! CHECK-AARCH64: warning: argument unused during compilation: '-masm=intel'
+! CHECK-AARCH64-NOT: -x86-asm-syntax=intel
diff --git a/flang/test/Driver/predefined-macros-powerpc.f90 b/flang/test/Driver/predefined-macros-powerpc.f90
new file mode 100644
index 000000000000..b3d2b617fb1f
--- /dev/null
+++ b/flang/test/Driver/predefined-macros-powerpc.f90
@@ -0,0 +1,11 @@
+! Test predefined macro for PowerPC architecture
+
+! RUN: %flang_fc1 -cpp -E %s | FileCheck %s
+! REQUIRES: target=powerpc{{.*}}
+
+! CHECK: integer :: var1 = 1
+
+#if __powerpc__
+ integer :: var1 = __powerpc__
+#endif
+end program
diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir
index 3c4095b9fdb1..962621c4e2e1 100644
--- a/flang/test/Fir/fir-ops.fir
+++ b/flang/test/Fir/fir-ops.fir
@@ -893,3 +893,10 @@ func.func @test_box_typecode(%a: !fir.class<none>) {
// CHECK-LABEL: func.func @test_box_typecode(
// CHECK-SAME: %[[A:.*]]: !fir.class<none>)
// CHECK: %{{.*}} = fir.box_typecode %[[A]] : (!fir.class<none>) -> i32
+
+fir.global @t1 {keep_my_attr = "data"} : i32 {
+ %1 = arith.constant 0 : i32
+ fir.has_value %1 : i32
+}
+
+// CHECK-LABEL: fir.global @t1 {keep_my_attr = "data"} : i32
diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir
index 67cd9ee4bb75..b4a3ca0d8606 100644
--- a/flang/test/HLFIR/maxloc-elemental.fir
+++ b/flang/test/HLFIR/maxloc-elemental.fir
@@ -110,6 +110,7 @@ func.func @_QPtest_float(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
return
}
// CHECK-LABEL: _QPtest_float
+// CHECK: %cst = arith.constant -3.40282347E+38 : f32
// CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) {
// CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index
// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir
index cb483d51e05d..5cc608b65be8 100644
--- a/flang/test/HLFIR/minloc-elemental.fir
+++ b/flang/test/HLFIR/minloc-elemental.fir
@@ -295,6 +295,7 @@ func.func @_QPtest_float(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a
return
}
// CHECK-LABEL: _QPtest_float
+// CHECK: %cst = arith.constant 3.40282347E+38 : f32
// CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) {
// CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index
// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
new file mode 100644
index 000000000000..7596c6b21efb
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
@@ -0,0 +1,64 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s --check-prefix=FIR
+
+! Test lowering of CUDA attribute on variables.
+
+module cuda_var
+ real, constant :: mod_a_rc
+! CHECK: fir.global @_QMcuda_varEmod_a_rc {cuda_attr = #fir.cuda<constant>} : f32
+ real, device :: mod_b_ra
+! CHECK: fir.global @_QMcuda_varEmod_b_ra {cuda_attr = #fir.cuda<device>} : f32
+ real, allocatable, managed :: mod_c_rm
+! CHECK: fir.global @_QMcuda_varEmod_c_rm {cuda_attr = #fir.cuda<managed>} : !fir.box<!fir.heap<f32>>
+ real, allocatable, pinned :: mod_d_rp
+! CHECK: fir.global @_QMcuda_varEmod_d_rp {cuda_attr = #fir.cuda<pinned>} : !fir.box<!fir.heap<f32>>
+
+contains
+
+subroutine local_var_attrs
+ real, constant :: rc
+ real, device :: rd
+ real, allocatable, managed :: rm
+ real, allocatable, pinned :: rp
+end subroutine
+
+! CHECK-LABEL: func.func @_QMcuda_varPlocal_var_attrs()
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<constant>, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<constant>, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+
+subroutine dummy_arg_constant(dc)
+ real, constant :: dc
+end subroutine
+! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_constant(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dc", fir.cuda_attr = #fir.cuda<constant>}
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<constant>, uniq_name = "_QMcuda_varFdummy_arg_constantEdc"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine dummy_arg_device(dd)
+ real, device :: dd
+end subroutine
+! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda<device>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine dummy_arg_managed(dm)
+ real, allocatable, managed :: dm
+end subroutine
+! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda<managed>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+subroutine dummy_arg_pinned(dp)
+ real, allocatable, pinned :: dp
+end subroutine
+! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda<pinned>}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+
+end module
diff --git a/flang/test/Lower/CUDA/cuda-proc-attribute.cuf b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf
new file mode 100644
index 000000000000..d9765f6cd2fe
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf
@@ -0,0 +1,43 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -fcuda %s -o - | fir-opt -convert-hlfir-to-fir | FileCheck %s
+
+! Test lowering of CUDA attribute on procedures.
+
+attributes(host) subroutine sub_host(); end
+! CHECK: func.func @_QPsub_host() attributes {fir.cuda_attr = #fir.cuda_proc<host>}
+
+attributes(device) subroutine sub_device(); end
+! CHECK: func.func @_QPsub_device() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+
+attributes(host) attributes(device) subroutine sub_host_device; end
+! CHECK: func.func @_QPsub_host_device() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+
+attributes(device) attributes(host) subroutine sub_device_host; end
+! CHECK: func.func @_QPsub_device_host() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+
+attributes(global) subroutine sub_global(); end
+! CHECK: func.func @_QPsub_global() attributes {fir.cuda_attr = #fir.cuda_proc<global>}
+
+attributes(grid_global) subroutine sub_grid_global(); end
+! CHECK: func.func @_QPsub_grid_global() attributes {fir.cuda_attr = #fir.cuda_proc<grid_global>}
+
+attributes(host) integer function fct_host(); end
+! CHECK: func.func @_QPfct_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host>}
+
+attributes(device) integer function fct_device(); end
+! CHECK: func.func @_QPfct_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+
+attributes(host) attributes(device) integer function fct_host_device; end
+! CHECK: func.func @_QPfct_host_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+
+attributes(device) attributes(host) integer function fct_device_host; end
+! CHECK: func.func @_QPfct_device_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+
+attributes(global) launch_bounds(1, 2) subroutine sub_lbounds1(); end
+! CHECK: func.func @_QPsub_lbounds1() attributes {fir.cuda_attr = #fir.cuda_proc<global>, fir.cuda_launch_bounds = #fir.launch_bounds<maxTPB = 1 : i64, minBPM = 2 : i64>}
+
+attributes(global) launch_bounds(1, 2, 3) subroutine sub_lbounds2(); end
+! CHECK: func.func @_QPsub_lbounds2() attributes {fir.cuda_attr = #fir.cuda_proc<global>, fir.cuda_launch_bounds = #fir.launch_bounds<maxTPB = 1 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>}
+
+attributes(global) cluster_dims(1, 2, 3) subroutine sub_clusterdims1(); end
+! CHECK: func.func @_QPsub_clusterdims1() attributes {fir.cuda_attr = #fir.cuda_proc<global>, fir.cuda_cluster_dims = #fir.cluster_dims<x = 1 : i64, y = 2 : i64, z = 3 : i64>}
diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90
index bd96bc8bcba3..df97cbcd187d 100644
--- a/flang/test/Lower/OpenACC/acc-bounds.f90
+++ b/flang/test/Lower/OpenACC/acc-bounds.f90
@@ -126,8 +126,8 @@ contains
! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a", fir.optional}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
-! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> i1
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#1 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> i1
! CHECK: %[[BOX:.*]] = fir.if %[[IS_PRESENT]] -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>) {
! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
! CHECK: fir.result %[[LOAD]] : !fir.box<!fir.ptr<!fir.array<?xf32>>>
@@ -153,4 +153,38 @@ contains
! CHECK: %[[ATTACH:.*]] = acc.attach varPtr(%[[BOX_ADDR]] : !fir.ptr<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.ptr<!fir.array<?xf32>> {name = "a"}
! CHECK: acc.data dataOperands(%[[ATTACH]] : !fir.ptr<!fir.array<?xf32>>)
+ subroutine acc_optional_data2(a, n)
+ integer :: n
+ real, optional :: a(n)
+ !$acc data no_create(a)
+ !$acc end data
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data2(
+! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%10) -> !fir.ref<!fir.array<?xf32>> {name = "a"}
+! CHECK: acc.data dataOperands(%[[NO_CREATE]] : !fir.ref<!fir.array<?xf32>>) {
+
+ subroutine acc_optional_data3(a, n)
+ integer :: n
+ real, optional :: a(n)
+ !$acc data no_create(a(1:n))
+ !$acc end data
+ end subroutine
+
+! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data3(
+! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[PRES:.*]] = fir.is_present %[[DECL_A]]#1 : (!fir.ref<!fir.array<?xf32>>) -> i1
+! CHECK: %[[STRIDE:.*]] = fir.if %[[PRES]] -> (index) {
+! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[DECL_A]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: fir.result %[[DIMS]]#2 : index
+! CHECK: } else {
+! CHECK: fir.result %c0{{.*}} : index
+! CHECK: }
+! CHECK: %[[BOUNDS:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%[[STRIDE]] : index) startIdx(%c1 : index) {strideInBytes = true}
+! CHECK: %[[NOCREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%14) -> !fir.ref<!fir.array<?xf32>> {name = "a(1:n)"}
+! CHECK: acc.data dataOperands(%[[NOCREATE]] : !fir.ref<!fir.array<?xf32>>) {
+
end module
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
index 6580aeb13ccd..4b223e822760 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
@@ -27,9 +27,11 @@
!CHECK: %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
!CHECK: fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] -> %[[IREF]] : !fir.ref<i32>) {
-!CHECK: %[[I_INCR:.*]] = arith.constant 1 : i32
-!CHECK: omp.reduction %[[I_INCR]], %[[IREF]] : i32, !fir.ref<i32>
+!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[I_INCR:.+]] = arith.constant 1 : i32
+!CHECK: %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
!CHECK: omp.terminator
!CHECK: }
!CHECK: return
@@ -48,9 +50,11 @@ end subroutine
!CHECK: %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
!CHECK: %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
!CHECK: fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK: omp.parallel reduction(@[[RED_F32_NAME]] -> %[[RREF]] : !fir.ref<f32>) {
-!CHECK: %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK: omp.reduction %[[R_INCR]], %[[RREF]] : f32, !fir.ref<f32>
+!CHECK: omp.parallel reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK: %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
+!CHECK: %[[RES]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
!CHECK: omp.terminator
!CHECK: }
!CHECK: return
@@ -72,11 +76,15 @@ end subroutine
!CHECK: fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
!CHECK: fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] -> %[[IREF]] : !fir.ref<i32>, @[[RED_F32_NAME]] -> %[[RREF]] : !fir.ref<f32>) {
+!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
!CHECK: %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK: omp.reduction %[[R_INCR]], %[[RREF]] : f32, !fir.ref<f32>
+!CHECK: %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
+!CHECK: %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
+!CHECK: fir.store %[[RES1]] to %[[PRV1]]
+!CHECK: %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
!CHECK: %[[I_INCR:.*]] = arith.constant 3 : i32
-!CHECK: omp.reduction %[[I_INCR]], %[[IREF]] : i32, !fir.ref<i32>
+!CHECK: %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
+!CHECK: fir.store %[[RES0]] to %[[PRV0]]
!CHECK: omp.terminator
!CHECK: }
!CHECK: return
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-add.f90 b/flang/test/Lower/OpenMP/parallel-reduction-add.f90
index 81a93aebbd26..8f3ac3dc357a 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-add.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-add.f90
@@ -28,9 +28,12 @@
!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFsimple_int_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
!CHECK: hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] -> %[[I_DECL]]#0 : !fir.ref<i32>) {
+!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] %[[I_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) {
+!CHECK: %[[P_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[LPRV:.+]] = fir.load %[[P_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[I_INCR:.*]] = arith.constant 1 : i32
-!CHECK: omp.reduction %[[I_INCR]], %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]] : i32
+!CHECK: hlfir.assign %[[RES]] to %[[P_DECL]]#0 : i32, !fir.ref<i32>
!CHECK: omp.terminator
!CHECK: }
!CHECK: return
@@ -50,9 +53,12 @@ end subroutine
!CHECK: %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFsimple_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
!CHECK: %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
!CHECK: hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
-!CHECK: omp.parallel reduction(@[[RED_F32_NAME]] -> %[[R_DECL]]#0 : !fir.ref<f32>) {
+!CHECK: omp.parallel reduction(@[[RED_F32_NAME]] %[[R_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<f32>) {
+!CHECK: %[[P_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK: %[[LPRV:.+]] = fir.load %[[P_DECL]]#0 : !fir.ref<f32>
!CHECK: %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK: omp.reduction %[[R_INCR]], %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK: %[[RES:.+]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
+!CHECK: hlfir.assign %[[RES]] to %[[P_DECL]]#0 : f32, !fir.ref<f32>
!CHECK: omp.terminator
!CHECK: }
!CHECK: return
@@ -76,11 +82,17 @@ end subroutine
!CHECK: hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
!CHECK: hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] -> %[[I_DECL]]#0 : !fir.ref<i32>, @[[RED_F32_NAME]] -> %[[R_DECL]]#0 : !fir.ref<f32>) {
+!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] %[[I_DECL]]#0 -> %[[IPRV:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[R_DECL]]#0 -> %[[RPRV:.+]] : !fir.ref<f32>) {
+!CHECK: %[[IP_DECL:.+]]:2 = hlfir.declare %[[IPRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[RP_DECL:.+]]:2 = hlfir.declare %[[RPRV]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
!CHECK: %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK: omp.reduction %[[R_INCR]], %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK: %[[R_LPRV:.+]] = fir.load %[[RP_DECL]]#0 : !fir.ref<f32>
+!CHECK: %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[R_LPRV]] {{.*}} : f32
+!CHECK: hlfir.assign %[[RES1]] to %[[RP_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK: %[[I_LPRV:.+]] = fir.load %[[IP_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[I_INCR:.*]] = arith.constant 3 : i32
-!CHECK: omp.reduction %[[I_INCR]], %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[RES0:.+]] = arith.addi %[[I_LPRV]], %[[I_INCR]] : i32
+!CHECK: hlfir.assign %[[RES0]] to %[[IP_DECL]]#0 : i32, !fir.ref<i32>
!CHECK: omp.terminator
!CHECK: }
!CHECK: return
diff --git a/flang/test/Lower/OpenMP/parallel-reduction.f90 b/flang/test/Lower/OpenMP/parallel-reduction.f90
new file mode 100644
index 000000000000..a07d118b0ba1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction.f90
@@ -0,0 +1,38 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @[[REDUCTION_DECLARE:[_a-z0-9]+]] : i32 init {
+!CHECK: ^bb0(%{{.*}}: i32):
+!CHECK: %[[I0:[_a-z0-9]+]] = arith.constant 0 : i32
+!CHECK: omp.yield(%[[I0]] : i32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[C0:[_a-z0-9]+]]: i32, %[[C1:[_a-z0-9]+]]: i32):
+!CHECK: %[[CR:[_a-z0-9]+]] = arith.addi %[[C0]], %[[C1]] : i32
+!CHECK: omp.yield(%[[CR]] : i32)
+!CHECK: }
+!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} {
+!CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+!CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32
+!CHECK: hlfir.assign %[[C0]] to %[[RED_ACCUM_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.parallel reduction(@[[REDUCTION_DECLARE]] %[[RED_ACCUM_DECL]]#0 -> %[[PRIVATE_RED:[a-z0-9]+]] : !fir.ref<i32>) {
+!CHECK: %[[PRIVATE_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[PRIVATE_RED]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[C1:[_a-z0-9]+]] = arith.constant 1 : i32
+!CHECK: hlfir.assign %[[C1]] to %[[PRIVATE_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: %[[RED_ACCUM_VAL:[_a-z0-9]+]] = fir.load %[[RED_ACCUM_DECL]]#0 : !fir.ref<i32>
+!CHECK: {{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[RED_ACCUM_VAL]]) fastmath<contract> : (!fir.ref<i8>, i32) -> i1
+!CHECK: return
+!CHECK: }
+
+program mn
+ integer :: i
+ i = 0
+
+ !$omp parallel reduction(+:i)
+ i = 1
+ !$omp end parallel
+
+ print *, i
+end program
diff --git a/flang/test/Lower/target-features-amdgcn.f90 b/flang/test/Lower/target-features-amdgcn.f90
index 1f0439bba80a..382230d7353d 100644
--- a/flang/test/Lower/target-features-amdgcn.f90
+++ b/flang/test/Lower/target-features-amdgcn.f90
@@ -1,21 +1,22 @@
! REQUIRES: amdgpu-registered-target
-! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes=ALL,NONE
-! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=ALL,TRIPLE
-! RUN: %flang_fc1 -emit-fir -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,CPU
-! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,BOTH
+! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a %s -o - | FileCheck %s --check-prefixes=ALL,CPU
+! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,FEATURE
+! RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx90a -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,BOTH
! ALL: module attributes {
-! NONE-NOT: fir.target_cpu
-! NONE-NOT: fir.target_features
-
-! TRIPLE-SAME: fir.target_cpu = "generic-hsa"
-! TRIPLE-NOT: fir.target_features
-
! CPU-SAME: fir.target_cpu = "gfx90a"
-! CPU-NOT: fir.target_features
+! CPU-SAME: fir.target_features = #llvm.target_features<[
+! CPU-SAME: "+gfx90a-insts"
+! CPU-SAME: ]>
+
+! FEATURE-SAME: fir.target_features = #llvm.target_features<[
+! FEATURE-NOT: "+gfx90a-insts"
+! FEATURE-SAME: "+sse"
+! FEATURE-SAME: ]>
! BOTH-SAME: fir.target_cpu = "gfx90a"
! BOTH-SAME: fir.target_features = #llvm.target_features<[
! BOTH-SAME: "+gfx90a-insts"
+! BOTH-SAME: "+sse"
! BOTH-SAME: ]>
diff --git a/flang/test/Lower/target-features-x86_64.f90 b/flang/test/Lower/target-features-x86_64.f90
index 1b628b6b5b9c..282c47923808 100644
--- a/flang/test/Lower/target-features-x86_64.f90
+++ b/flang/test/Lower/target-features-x86_64.f90
@@ -1,19 +1,17 @@
! REQUIRES: x86-registered-target
-! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefixes=ALL,NONE
! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o - | FileCheck %s --check-prefixes=ALL,CPU
! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,FEATURE
! RUN: %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse %s -o - | FileCheck %s --check-prefixes=ALL,BOTH
! ALL: module attributes {
-! NONE-NOT: fir.target_cpu
-! NONE-NOT: fir.target_features
+! CPU-SAME: fir.target_cpu = "x86-64"
-! CPU-SAME: fir.target_cpu = "x86-64"
-! CPU-NOT: fir.target_features
-
-! FEATURE-NOT: fir.target_cpu
-! FEATURE-SAME: fir.target_features = #llvm.target_features<["+sse"]>
+! FEATURE-SAME: fir.target_features = #llvm.target_features<[
+! FEATURE-SAME: "+sse"
+! FEATURE-SAME: ]>
! BOTH-SAME: fir.target_cpu = "x86-64"
-! BOTH-SAME: fir.target_features = #llvm.target_features<["+sse"]>
+! BOTH-SAME: fir.target_features = #llvm.target_features<[
+! BOTH-SAME: "+sse"
+! BOTH-SAME: ]>
diff --git a/flang/test/Semantics/OpenMP/loop-association.f90 b/flang/test/Semantics/OpenMP/loop-association.f90
index 8a28fd8878f4..d2167663c5dd 100644
--- a/flang/test/Semantics/OpenMP/loop-association.f90
+++ b/flang/test/Semantics/OpenMP/loop-association.f90
@@ -30,6 +30,14 @@
c = c - 1
END DO outer
+ ! Accept directives between parallel do and actual loop.
+ !$OMP PARALLEL DO
+ !DIR$ VECTOR ALIGNED
+ DO 20 i=1,N
+ a = a + 0.5
+20 CONTINUE
+ !$OMP END PARALLEL DO
+
c = 16
!ERROR: DO loop after the PARALLEL DO directive must have loop control
!$omp parallel do
diff --git a/flang/test/Semantics/typeinfo09.f90 b/flang/test/Semantics/typeinfo09.f90
new file mode 100644
index 000000000000..3527ee6058ad
--- /dev/null
+++ b/flang/test/Semantics/typeinfo09.f90
@@ -0,0 +1,20 @@
+!RUN: bbc --dump-symbols %s | FileCheck %s
+!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s
+! test setting of isargdescriptorset in the runtime type info.
+
+module m
+ type :: sometype
+ contains
+ procedure :: copy => copy_impl
+ generic :: assignment(=) => copy
+ end type
+interface
+ subroutine copy_impl(this, x)
+ import
+ class(sometype), intent(out) :: this
+ type(sometype), target, intent(in) :: x
+ end subroutine
+end interface
+end module
+
+!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=copy_impl)]
diff --git a/flang/test/Semantics/typeinfo10.f90 b/flang/test/Semantics/typeinfo10.f90
new file mode 100644
index 000000000000..43c418bf45d0
--- /dev/null
+++ b/flang/test/Semantics/typeinfo10.f90
@@ -0,0 +1,14 @@
+!RUN: bbc --dump-symbols %s | FileCheck %s
+!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s
+
+! Test that empty parent types are still set first in the
+! runtime info global array describing components.
+module empty_parent
+ type :: z
+ end type
+
+ type, extends(z) :: t
+ integer :: a
+ end type
+end module
+! CHECK: .c.t, SAVE{{.*}}.n.z{{.*}}n.a
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index 9d5caf5c6804..c9358c83e795 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -256,6 +256,22 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) {
/*Reloc::Model=*/std::nullopt)};
}
+/// Build and execute the OpenMPFIRPassPipeline with its own instance
+/// of the pass manager, allowing it to be invoked as soon as it's
+/// required without impacting the main pass pipeline that may be invoked
+/// more than once for verification.
+static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) {
+ mlir::PassManager pm(mlirModule->getName(),
+ mlir::OpPassManager::Nesting::Implicit);
+ fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice);
+ (void)mlir::applyPassManagerCLOptions(pm);
+ if (mlir::failed(pm.run(mlirModule))) {
+ llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline";
+ return mlir::failure();
+ }
+ return mlir::success();
+}
+
//===----------------------------------------------------------------------===//
// Translate Fortran input to FIR, a dialect of MLIR.
//===----------------------------------------------------------------------===//
@@ -369,14 +385,16 @@ static mlir::LogicalResult convertFortranSourceToMLIR(
"could not open output file ")
<< outputName;
+ // WARNING: This pipeline must be run immediately after the lowering to
+ // ensure that the FIR is correct with respect to OpenMP operations/
+ // attributes.
+ if (enableOpenMP)
+ if (mlir::failed(runOpenMPPasses(mlirModule)))
+ return mlir::failure();
+
// Otherwise run the default passes.
mlir::PassManager pm(mlirModule->getName(),
mlir::OpPassManager::Nesting::Implicit);
- if (enableOpenMP)
- // WARNING: This pipeline must be run immediately after the lowering to
- // ensure that the FIR is correct with respect to OpenMP operations/
- // attributes.
- fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice);
pm.enableVerifier(/*verifyPasses=*/true);
(void)mlir::applyPassManagerCLOptions(pm);
if (passPipeline.hasAnyOccurrences()) {
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index 42ed2257f580..790f735a6cf2 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -49,7 +49,8 @@ TEST_F(FortranVariableTest, SimpleScalar) {
auto name = mlir::StringAttr::get(&context, "x");
auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
/*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, name,
- /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+ /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+ /*cuda_attr=*/fir::CUDADataAttributeAttr{});
fir::FortranVariableOpInterface fortranVariable = declare;
EXPECT_FALSE(fortranVariable.isArray());
@@ -74,7 +75,8 @@ TEST_F(FortranVariableTest, CharacterScalar) {
auto name = mlir::StringAttr::get(&context, "x");
auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
/*shape=*/mlir::Value{}, typeParams, name,
- /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+ /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+ /*cuda_attr=*/fir::CUDADataAttributeAttr{});
fir::FortranVariableOpInterface fortranVariable = declare;
EXPECT_FALSE(fortranVariable.isArray());
@@ -104,7 +106,8 @@ TEST_F(FortranVariableTest, SimpleArray) {
auto name = mlir::StringAttr::get(&context, "x");
auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
shape, /*typeParams*/ std::nullopt, name,
- /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+ /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+ /*cuda_attr=*/fir::CUDADataAttributeAttr{});
fir::FortranVariableOpInterface fortranVariable = declare;
EXPECT_TRUE(fortranVariable.isArray());
@@ -134,7 +137,8 @@ TEST_F(FortranVariableTest, CharacterArray) {
auto name = mlir::StringAttr::get(&context, "x");
auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
shape, typeParams, name,
- /*fortran_attrs=*/fir::FortranVariableFlagsAttr{});
+ /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
+ /*cuda_attr=*/fir::CUDADataAttributeAttr{});
fir::FortranVariableOpInterface fortranVariable = declare;
EXPECT_TRUE(fortranVariable.isArray());
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 70e64a6cb1ae..ef1f24863f61 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -106,13 +106,13 @@ function(get_nvptx_compile_options output_var gpu_arch)
list(APPEND nvptx_options "-Wno-unknown-cuda-version")
list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
if(${gpu_arch} STREQUAL "sm_35")
- list(APPEND nvptx_options "--cuda-feature=+ptx60")
+ list(APPEND nvptx_options "--cuda-feature=+ptx63")
elseif(${gpu_arch} STREQUAL "sm_37")
- list(APPEND nvptx_options "--cuda-feature=+ptx60")
+ list(APPEND nvptx_options "--cuda-feature=+ptx63")
elseif(${gpu_arch} STREQUAL "sm_50")
- list(APPEND nvptx_options "--cuda-feature=+ptx60")
+ list(APPEND nvptx_options "--cuda-feature=+ptx63")
elseif(${gpu_arch} STREQUAL "sm_52")
- list(APPEND nvptx_options "--cuda-feature=+ptx60")
+ list(APPEND nvptx_options "--cuda-feature=+ptx63")
elseif(${gpu_arch} STREQUAL "sm_53")
list(APPEND nvptx_options "--cuda-feature=+ptx63")
elseif(${gpu_arch} STREQUAL "sm_60")
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 5b03080787dc..bc09f4881228 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -382,9 +382,11 @@ if(LIBC_COMPILER_HAS_FLOAT128)
libc.src.math.ceilf128
libc.src.math.copysignf128
libc.src.math.fabsf128
+ libc.src.math.fdimf128
libc.src.math.floorf128
libc.src.math.fmaxf128
libc.src.math.fminf128
+ libc.src.math.frexpf128
libc.src.math.roundf128
libc.src.math.sqrtf128
libc.src.math.truncf128
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 5e98538c91b4..02412e7549a3 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -391,9 +391,11 @@ if(LIBC_COMPILER_HAS_FLOAT128)
libc.src.math.ceilf128
libc.src.math.copysignf128
libc.src.math.fabsf128
+ libc.src.math.fdimf128
libc.src.math.floorf128
libc.src.math.fmaxf128
libc.src.math.fminf128
+ libc.src.math.frexpf128
libc.src.math.roundf128
libc.src.math.sqrtf128
libc.src.math.truncf128
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index b35fc9fc6866..75e39ae51fdb 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -112,6 +112,11 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdbit.stdc_trailing_ones_ui
libc.src.stdbit.stdc_trailing_ones_ul
libc.src.stdbit.stdc_trailing_ones_ull
+ libc.src.stdbit.stdc_first_leading_zero_uc
+ libc.src.stdbit.stdc_first_leading_zero_us
+ libc.src.stdbit.stdc_first_leading_zero_ui
+ libc.src.stdbit.stdc_first_leading_zero_ul
+ libc.src.stdbit.stdc_first_leading_zero_ull
# stdlib.h entrypoints
libc.src.stdlib.abs
@@ -410,9 +415,11 @@ if(LIBC_COMPILER_HAS_FLOAT128)
libc.src.math.ceilf128
libc.src.math.copysignf128
libc.src.math.fabsf128
+ libc.src.math.fdimf128
libc.src.math.floorf128
libc.src.math.fmaxf128
libc.src.math.fminf128
+ libc.src.math.frexpf128
libc.src.math.roundf128
libc.src.math.sqrtf128
libc.src.math.truncf128
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 3af7e106fc99..94604491a73e 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -138,6 +138,8 @@ Basic Operations
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| fdiml | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| fdimf128 | |check| | |check| | | |check| | | | | | | | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| floor | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| floorf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
@@ -174,6 +176,8 @@ Basic Operations
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| frexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| frexpf128 | |check| | |check| | | |check| | | | | | | | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| ilogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| ilogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h
index c55529e5c0e9..693a45e63419 100644
--- a/libc/include/llvm-libc-macros/stdbit-macros.h
+++ b/libc/include/llvm-libc-macros/stdbit-macros.h
@@ -71,6 +71,21 @@ inline unsigned stdc_trailing_ones(unsigned long x) {
inline unsigned stdc_trailing_ones(unsigned long long x) {
return stdc_trailing_ones_ull(x);
}
+inline unsigned stdc_first_leading_zero(unsigned char x) {
+ return stdc_first_leading_zero_uc(x);
+}
+inline unsigned stdc_first_leading_zero(unsigned short x) {
+ return stdc_first_leading_zero_us(x);
+}
+inline unsigned stdc_first_leading_zero(unsigned x) {
+ return stdc_first_leading_zero_ui(x);
+}
+inline unsigned stdc_first_leading_zero(unsigned long x) {
+ return stdc_first_leading_zero_ul(x);
+}
+inline unsigned stdc_first_leading_zero(unsigned long long x) {
+ return stdc_first_leading_zero_ull(x);
+}
#else
#define stdc_leading_zeros(x) \
_Generic((x), \
@@ -100,6 +115,13 @@ inline unsigned stdc_trailing_ones(unsigned long long x) {
unsigned: stdc_trailing_ones_ui, \
unsigned long: stdc_trailing_ones_ul, \
unsigned long long: stdc_trailing_ones_ull)(x)
+#define stdc_first_leading_zero(x) \
+ _Generic((x), \
+ unsigned char: stdc_first_leading_zero_uc, \
+ unsigned short: stdc_first_leading_zero_us, \
+ unsigned: stdc_first_leading_zero_ui, \
+ unsigned long: stdc_first_leading_zero_ul, \
+ unsigned long long: stdc_first_leading_zero_ull)(x)
#endif // __cplusplus
#endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H
diff --git a/libc/include/math.h.def b/libc/include/math.h.def
index 813bb72c9b45..927e2d6697c6 100644
--- a/libc/include/math.h.def
+++ b/libc/include/math.h.def
@@ -11,6 +11,8 @@
#include <__llvm-libc-common.h>
#include <llvm-libc-macros/math-macros.h>
+#include <llvm-libc-types/float128.h>
+
%%public_api()
diff --git a/libc/spec/spec.td b/libc/spec/spec.td
index 0b557c807a54..aebf49553828 100644
--- a/libc/spec/spec.td
+++ b/libc/spec/spec.td
@@ -176,6 +176,10 @@ class FunctionSpec<string name, RetValSpec return, list<ArgSpec> args> {
list<ArgSpec> Args = args;
}
+class GuardedFunctionSpec<string name, RetValSpec return, list<ArgSpec> args, string guard_macro> : FunctionSpec<name, return, args> {
+ string Guard = guard_macro;
+}
+
class ObjectSpec<string name, string type> {
string Name = name;
string Type = type;
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 97dabbc5cf07..1720a4a3c3aa 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -359,36 +359,37 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"copysign", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"copysignf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"copysignl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
- FunctionSpec<"copysignf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"copysignf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"ceil", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"ceilf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"ceill", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
- FunctionSpec<"ceilf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"ceilf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"fabs", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"fabsf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"fabsl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
- FunctionSpec<"fabsf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"fabsf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"fdim", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fdimf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"fdiml", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"fdimf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"floor", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"floorf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"floorl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
- FunctionSpec<"floorf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"floorf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"fmin", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fminf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"fminl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
- FunctionSpec<"fminf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"fminf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"fmax", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fmaxf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"fmaxl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
- FunctionSpec<"fmaxf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"fmaxf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
@@ -400,6 +401,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
FunctionSpec<"frexpl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>,
+ GuardedFunctionSpec<"frexpf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntPtr>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"hypot", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"hypotf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
@@ -461,7 +463,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"round", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"roundf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"roundl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
- FunctionSpec<"roundf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"roundf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"lround", RetValSpec<LongType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"lroundf", RetValSpec<LongType>, [ArgSpec<FloatType>]>,
@@ -486,12 +488,12 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"sqrt", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"sqrtf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"sqrtl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
- FunctionSpec<"sqrtf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"sqrtf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"trunc", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"truncf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"truncl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
- FunctionSpec<"truncf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>]>,
+ GuardedFunctionSpec<"truncf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"nearbyint", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"nearbyintf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
@@ -778,7 +780,8 @@ def StdC : StandardSpec<"stdc"> {
Macro<"stdc_leading_zeros">,
Macro<"stdc_leading_ones">,
Macro<"stdc_trailing_zeros">,
- Macro<"stdc_trailing_ones">
+ Macro<"stdc_trailing_ones">,
+ Macro<"stdc_first_leading_zero">
], // Macros
[], // Types
[], // Enumerations
@@ -802,7 +805,12 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"stdc_trailing_ones_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
FunctionSpec<"stdc_trailing_ones_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
FunctionSpec<"stdc_trailing_ones_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
- FunctionSpec<"stdc_trailing_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>
+ FunctionSpec<"stdc_trailing_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>,
+ FunctionSpec<"stdc_first_leading_zero_uc", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedCharType>]>,
+ FunctionSpec<"stdc_first_leading_zero_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
+ FunctionSpec<"stdc_first_leading_zero_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
+ FunctionSpec<"stdc_first_leading_zero_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
+ FunctionSpec<"stdc_first_leading_zero_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>
] // Functions
>;
diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h
index 122f6b8c3328..392fbe248138 100644
--- a/libc/src/__support/CPP/bit.h
+++ b/libc/src/__support/CPP/bit.h
@@ -238,6 +238,36 @@ LIBC_INLINE constexpr To bit_or_static_cast(const From &from) {
}
}
+#define SPECIALIZE_FLZ(NAME, TYPE, BUILTIN) \
+ template <> [[nodiscard]] LIBC_INLINE constexpr int NAME<TYPE>(TYPE value) { \
+ static_assert(cpp::is_unsigned_v<TYPE>); \
+ return value == cpp::numeric_limits<TYPE>::max() \
+ ? 0 \
+ : BUILTIN(static_cast<TYPE>(~value)) + 1; \
+ }
+
+template <typename T, typename = cpp::enable_if_t<cpp::is_unsigned_v<T>>>
+[[nodiscard]] LIBC_INLINE constexpr int first_leading_zero(T value) {
+ return value == cpp::numeric_limits<T>::max()
+ ? 0
+ : countl_zero(static_cast<T>(~value)) + 1;
+}
+
+#if LIBC_HAS_BUILTIN(__builtin_clzs)
+SPECIALIZE_FLZ(first_leading_zero, unsigned short, __builtin_clzs)
+#endif
+#if LIBC_HAS_BUILTIN(__builtin_clz)
+SPECIALIZE_FLZ(first_leading_zero, unsigned int, __builtin_clz)
+#endif
+#if LIBC_HAS_BUILTIN(__builtin_clzl)
+SPECIALIZE_FLZ(first_leading_zero, unsigned long, __builtin_clzl)
+#endif
+#if LIBC_HAS_BUILTIN(__builtin_clzll)
+SPECIALIZE_FLZ(first_leading_zero, unsigned long long, __builtin_clzll)
+#endif
+
+#undef SPECIALIZE_FLZ
+
} // namespace LIBC_NAMESPACE::cpp
#endif // LLVM_LIBC_SRC___SUPPORT_CPP_BIT_H
diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h
index ccc61a89c5f8..a19d6d0bef08 100644
--- a/libc/src/__support/FPUtil/BasicOperations.h
+++ b/libc/src/__support/FPUtil/BasicOperations.h
@@ -19,9 +19,7 @@ namespace fputil {
template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
LIBC_INLINE T abs(T x) {
- FPBits<T> bits(x);
- bits.set_sign(Sign::POS);
- return bits.get_val();
+ return FPBits<T>(x).abs().get_val();
}
template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 58bbe29cb3a7..75f0b5744ebd 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -17,9 +17,6 @@
namespace LIBC_NAMESPACE {
namespace gpu {
-/// The number of threads that execute in lock-step in a lane.
-constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
-
/// Type aliases to the address spaces used by the AMDGPU backend.
template <typename T> using Private = [[clang::opencl_private]] T;
template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -108,8 +105,11 @@ LIBC_INLINE uint64_t get_thread_id() {
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}
-/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+/// and compilation options.
+LIBC_INLINE uint32_t get_lane_size() {
+ return __builtin_amdgcn_wavefrontsize();
+}
/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
@@ -152,14 +152,7 @@ LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
/// the card and can only be queried via the driver.
LIBC_INLINE uint64_t fixed_frequency_clock() {
- if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
- return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
- return __builtin_amdgcn_s_memrealtime();
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
- return __builtin_amdgcn_s_memtime();
- else
- return 0;
+ return __builtin_readsteadycounter();
}
/// Terminates execution of the associated wavefront.
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
index 00b59837ccc6..58db88dce1ca 100644
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -16,8 +16,6 @@
namespace LIBC_NAMESPACE {
namespace gpu {
-constexpr const uint64_t LANE_SIZE = 1;
-
template <typename T> using Private = T;
template <typename T> using Constant = T;
template <typename T> using Shared = T;
@@ -55,7 +53,7 @@ LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
LIBC_INLINE uint64_t get_thread_id() { return 0; }
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+LIBC_INLINE uint32_t get_lane_size() { return 1; }
LIBC_INLINE uint32_t get_lane_id() { return 0; }
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index e7e297adf7ec..a92c8847b6ec 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -16,9 +16,6 @@
namespace LIBC_NAMESPACE {
namespace gpu {
-/// The number of threads that execute in lock-step in a warp.
-constexpr const uint64_t LANE_SIZE = 32;
-
/// Type aliases to the address spaces used by the NVPTX backend.
template <typename T> using Private = [[clang::opencl_private]] T;
template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -95,8 +92,8 @@ LIBC_INLINE uint64_t get_thread_id() {
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}
-/// Returns the size of a CUDA warp.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+LIBC_INLINE uint32_t get_lane_size() { return 32; }
/// Returns the id of the thread inside of a CUDA warp executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
@@ -113,21 +110,13 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
uint32_t x) {
uint32_t mask = static_cast<uint32_t>(lane_mask);
uint32_t id = __builtin_ffs(mask) - 1;
-#if __CUDA_ARCH__ >= 600
return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
-#else
- return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
-#endif
}
/// Returns a bitmask of threads in the current lane for which \p x is true.
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
uint32_t mask = static_cast<uint32_t>(lane_mask);
-#if __CUDA_ARCH__ >= 600
return __nvvm_vote_ballot_sync(mask, x);
-#else
- return mask & __nvvm_vote_ballot(x);
-#endif
}
/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
@@ -138,13 +127,11 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
}
/// Returns the current value of the GPU's processor clock.
-LIBC_INLINE uint64_t processor_clock() {
- return __nvvm_read_ptx_sreg_clock64();
-}
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
/// Returns a global fixed-frequency timer at nanosecond frequency.
LIBC_INLINE uint64_t fixed_frequency_clock() {
- return __nvvm_read_ptx_sreg_globaltimer();
+ return __builtin_readsteadycounter();
}
/// Terminates execution of the calling thread.
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 7924d4cec2ac..5ed39ae0d7f7 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -43,19 +43,6 @@ struct Header {
uint16_t opcode;
};
-/// The data payload for the associated packet. We provide enough space for each
-/// thread in the cooperating lane to have a buffer.
-template <uint32_t lane_size = gpu::LANE_SIZE> struct Payload {
- Buffer slot[lane_size];
-};
-
-/// A packet used to share data between the client and server across an entire
-/// lane. We use a lane as the minimum granularity for execution.
-template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
- Header header;
- Payload<lane_size> payload;
-};
-
/// The maximum number of parallel ports that the RPC interface can support.
constexpr uint64_t MAX_PORT_COUNT = 4096;
@@ -71,7 +58,7 @@ constexpr uint64_t MAX_PORT_COUNT = 4096;
/// - The client will always start with a 'send' operation.
/// - The server will always start with a 'recv' operation.
/// - Every 'send' or 'recv' call is mirrored by the other process.
-template <bool Invert, typename Packet> struct Process {
+template <bool Invert> struct Process {
LIBC_INLINE Process() = default;
LIBC_INLINE Process(const Process &) = delete;
LIBC_INLINE Process &operator=(const Process &) = delete;
@@ -82,7 +69,8 @@ template <bool Invert, typename Packet> struct Process {
uint32_t port_count = 0;
cpp::Atomic<uint32_t> *inbox = nullptr;
cpp::Atomic<uint32_t> *outbox = nullptr;
- Packet *packet = nullptr;
+ Header *header = nullptr;
+ Buffer *packet = nullptr;
static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
@@ -92,7 +80,9 @@ template <bool Invert, typename Packet> struct Process {
advance(buffer, inbox_offset(port_count)))),
outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, outbox_offset(port_count)))),
- packet(reinterpret_cast<Packet *>(
+ header(reinterpret_cast<Header *>(
+ advance(buffer, header_offset(port_count)))),
+ packet(reinterpret_cast<Buffer *>(
advance(buffer, buffer_offset(port_count)))) {}
/// Allocate a memory buffer sufficient to store the following equivalent
@@ -101,10 +91,12 @@ template <bool Invert, typename Packet> struct Process {
/// struct Equivalent {
/// Atomic<uint32_t> primary[port_count];
/// Atomic<uint32_t> secondary[port_count];
- /// Packet buffer[port_count];
+ /// Header header[port_count];
+ /// Buffer packet[port_count][lane_size];
/// };
- LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
- return buffer_offset(port_count) + buffer_bytes(port_count);
+ LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count,
+ uint32_t lane_size) {
+ return buffer_offset(port_count) + buffer_bytes(port_count, lane_size);
}
/// Retrieve the inbox state from memory shared between processes.
@@ -144,6 +136,13 @@ template <bool Invert, typename Packet> struct Process {
atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
}
+ /// The packet is a linearly allocated array of buffers used to communicate
+ /// with the other process. This function returns the appropriate slot in this
+ /// array such that the process can operate on an entire warp or wavefront.
+ LIBC_INLINE Buffer *get_packet(uint32_t index, uint32_t lane_size) {
+ return &packet[index * lane_size];
+ }
+
/// Determines if this process needs to wait for ownership of the buffer. We
/// invert the condition on one of the processes to indicate that if one
/// process owns the buffer then the other does not.
@@ -219,8 +218,9 @@ template <bool Invert, typename Packet> struct Process {
}
/// Number of bytes to allocate for the buffer containing the packets.
- LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count) {
- return port_count * sizeof(Packet);
+ LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count,
+ uint32_t lane_size) {
+ return port_count * lane_size * sizeof(Buffer);
}
/// Offset of the inbox in memory. This is the same as the outbox if inverted.
@@ -234,8 +234,14 @@ template <bool Invert, typename Packet> struct Process {
}
/// Offset of the buffer containing the packets after the inbox and outbox.
+ LIBC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
+ return align_up(2 * mailbox_bytes(port_count), alignof(Header));
+ }
+
+ /// Offset of the buffer containing the packets after the inbox and outbox.
LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
- return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
+ return align_up(header_offset(port_count) + port_count * sizeof(Header),
+ alignof(Buffer));
}
/// Conditionally set the n-th bit in the atomic bitfield.
@@ -262,39 +268,39 @@ template <bool Invert, typename Packet> struct Process {
};
/// Invokes a function accross every active buffer across the total lane size.
-template <uint32_t lane_size>
static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *)> fn,
- Packet<lane_size> &packet) {
+ uint32_t lane_size, uint64_t lane_mask,
+ Buffer *slot) {
if constexpr (is_process_gpu()) {
- fn(&packet.payload.slot[gpu::get_lane_id()]);
+ fn(&slot[gpu::get_lane_id()]);
} else {
for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
- if (packet.header.mask & 1ul << i)
- fn(&packet.payload.slot[i]);
+ if (lane_mask & (1ul << i))
+ fn(&slot[i]);
}
}
/// Alternate version that also provides the index of the current lane.
-template <uint32_t lane_size>
static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *, uint32_t)> fn,
- Packet<lane_size> &packet) {
+ uint32_t lane_size, uint64_t lane_mask,
+ Buffer *slot) {
if constexpr (is_process_gpu()) {
- fn(&packet.payload.slot[gpu::get_lane_id()], gpu::get_lane_id());
+ fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
} else {
for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
- if (packet.header.mask & 1ul << i)
- fn(&packet.payload.slot[i], i);
+ if (lane_mask & (1ul << i))
+ fn(&slot[i], i);
}
}
/// The port provides the interface to communicate between the multiple
/// processes. A port is conceptually an index into the memory provided by the
/// underlying process that is guarded by a lock bit.
-template <bool T, typename S> struct Port {
- LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint32_t index,
- uint32_t out)
- : process(process), lane_mask(lane_mask), index(index), out(out),
- receive(false), owns_buffer(true) {}
+template <bool T> struct Port {
+ LIBC_INLINE Port(Process<T> &process, uint64_t lane_mask, uint32_t lane_size,
+ uint32_t index, uint32_t out)
+ : process(process), lane_mask(lane_mask), lane_size(lane_size),
+ index(index), out(out), receive(false), owns_buffer(true) {}
LIBC_INLINE ~Port() = default;
private:
@@ -305,7 +311,7 @@ private:
friend struct Client;
template <uint32_t U> friend struct Server;
- friend class cpp::optional<Port<T, S>>;
+ friend class cpp::optional<Port<T>>;
public:
template <typename U> LIBC_INLINE void recv(U use);
@@ -319,7 +325,7 @@ public:
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
LIBC_INLINE uint16_t get_opcode() const {
- return process.packet[index].header.opcode;
+ return process.header[index].opcode;
}
LIBC_INLINE uint16_t get_index() const { return index; }
@@ -333,8 +339,9 @@ public:
}
private:
- Process<T, S> &process;
+ Process<T> &process;
uint64_t lane_mask;
+ uint32_t lane_size;
uint32_t index;
uint32_t out;
bool receive;
@@ -351,15 +358,14 @@ struct Client {
LIBC_INLINE Client(uint32_t port_count, void *buffer)
: process(port_count, buffer) {}
- using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
+ using Port = rpc::Port<false>;
template <uint16_t opcode> LIBC_INLINE Port open();
private:
- Process<false, Packet<gpu::LANE_SIZE>> process;
+ Process<false> process;
};
static_assert(cpp::is_trivially_copyable<Client>::value &&
- sizeof(Process<false, Packet<1>>) ==
- sizeof(Process<false, Packet<32>>),
+ sizeof(Process<true>) == sizeof(Process<false>),
"The client is not trivially copyable from the server");
/// The RPC server used to respond to the client.
@@ -372,38 +378,35 @@ template <uint32_t lane_size> struct Server {
LIBC_INLINE Server(uint32_t port_count, void *buffer)
: process(port_count, buffer) {}
- using Port = rpc::Port<true, Packet<lane_size>>;
+ using Port = rpc::Port<true>;
LIBC_INLINE cpp::optional<Port> try_open(uint32_t start = 0);
LIBC_INLINE Port open();
LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
- return Process<true, Packet<lane_size>>::allocation_size(port_count);
+ return Process<true>::allocation_size(port_count, lane_size);
}
private:
- Process<true, Packet<lane_size>> process;
+ Process<true> process;
};
/// Applies \p fill to the shared buffer and initiates a send operation.
-template <bool T, typename S>
-template <typename F>
-LIBC_INLINE void Port<T, S>::send(F fill) {
+template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) {
uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
// We need to wait until we own the buffer before sending.
process.wait_for_ownership(lane_mask, index, out, in);
// Apply the \p fill function to initialize the buffer and release the memory.
- invoke_rpc(fill, process.packet[index]);
+ invoke_rpc(fill, lane_size, process.header[index].mask,
+ process.get_packet(index, lane_size));
out = process.invert_outbox(index, out);
owns_buffer = false;
receive = false;
}
/// Applies \p use to the shared buffer and acknowledges the send.
-template <bool T, typename S>
-template <typename U>
-LIBC_INLINE void Port<T, S>::recv(U use) {
+template <bool T> template <typename U> LIBC_INLINE void Port<T>::recv(U use) {
// We only exchange ownership of the buffer during a receive if we are waiting
// for a previous receive to finish.
if (receive) {
@@ -417,15 +420,16 @@ LIBC_INLINE void Port<T, S>::recv(U use) {
process.wait_for_ownership(lane_mask, index, out, in);
// Apply the \p use function to read the memory out of the buffer.
- invoke_rpc(use, process.packet[index]);
+ invoke_rpc(use, lane_size, process.header[index].mask,
+ process.get_packet(index, lane_size));
receive = true;
owns_buffer = true;
}
/// Combines a send and receive into a single function.
-template <bool T, typename S>
+template <bool T>
template <typename F, typename U>
-LIBC_INLINE void Port<T, S>::send_and_recv(F fill, U use) {
+LIBC_INLINE void Port<T>::send_and_recv(F fill, U use) {
send(fill);
recv(use);
}
@@ -433,17 +437,17 @@ LIBC_INLINE void Port<T, S>::send_and_recv(F fill, U use) {
/// Combines a receive and send operation into a single function. The \p work
/// function modifies the buffer in-place and the send is only used to initiate
/// the copy back.
-template <bool T, typename S>
+template <bool T>
template <typename W>
-LIBC_INLINE void Port<T, S>::recv_and_send(W work) {
+LIBC_INLINE void Port<T>::recv_and_send(W work) {
recv(work);
send([](Buffer *) { /* no-op */ });
}
/// Helper routine to simplify the interface when sending from the GPU using
/// thread private pointers to the underlying value.
-template <bool T, typename S>
-LIBC_INLINE void Port<T, S>::send_n(const void *src, uint64_t size) {
+template <bool T>
+LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
const void **src_ptr = &src;
uint64_t *size_ptr = &size;
send_n(src_ptr, size_ptr);
@@ -451,8 +455,8 @@ LIBC_INLINE void Port<T, S>::send_n(const void *src, uint64_t size) {
/// Sends an arbitrarily sized data buffer \p src across the shared channel in
/// multiples of the packet length.
-template <bool T, typename S>
-LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
+template <bool T>
+LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
uint64_t num_sends = 0;
send([&](Buffer *buffer, uint32_t id) {
reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
@@ -465,7 +469,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
rpc_memcpy(&buffer->data[1], lane_value(src, id), len);
});
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
- uint64_t mask = process.packet[index].header.mask;
+ uint64_t mask = process.header[index].mask;
while (gpu::ballot(mask, idx < num_sends)) {
send([=](Buffer *buffer, uint32_t id) {
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
@@ -481,9 +485,9 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
/// Receives an arbitrarily sized data buffer across the shared channel in
/// multiples of the packet length. The \p alloc function is called with the
/// size of the data so that we can initialize the size of the \p dst buffer.
-template <bool T, typename S>
+template <bool T>
template <typename A>
-LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
+LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
uint64_t num_recvs = 0;
recv([&](Buffer *buffer, uint32_t id) {
lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];
@@ -498,7 +502,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
rpc_memcpy(lane_value(dst, id), &buffer->data[1], len);
});
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
- uint64_t mask = process.packet[index].header.mask;
+ uint64_t mask = process.header[index].mask;
while (gpu::ballot(mask, idx < num_recvs)) {
recv([=](Buffer *buffer, uint32_t id) {
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
@@ -515,8 +519,10 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
/// only open a port if we find an index that is in a valid sending state. That
/// is, there are send operations pending that haven't been serviced on this
/// port. Each port instance uses an associated \p opcode to tell the server
-/// what to do.
-template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
+/// what to do. The Client interface provides the appropriate lane size to the
+/// port using the platform's returned value.
+template <uint16_t opcode>
+[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
// Repeatedly perform a naive linear scan for a port that can be opened to
// send data.
for (uint32_t index = gpu::get_cluster_id();; ++index) {
@@ -540,11 +546,11 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
}
if (gpu::is_first_lane(lane_mask)) {
- process.packet[index].header.opcode = opcode;
- process.packet[index].header.mask = lane_mask;
+ process.header[index].opcode = opcode;
+ process.header[index].mask = lane_mask;
}
gpu::sync_lane(lane_mask);
- return Port(process, lane_mask, index, out);
+ return Port(process, lane_mask, gpu::get_lane_size(), index, out);
}
}
@@ -577,7 +583,7 @@ template <uint32_t lane_size>
continue;
}
- return Port(process, lane_mask, index, out);
+ return Port(process, lane_mask, lane_size, index, out);
}
return cpp::nullopt;
}
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index ff9569298a1e..cc2a11a1108e 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -21,8 +21,9 @@ namespace rpc {
/// Suspend the thread briefly to assist the thread scheduler during busy loops.
LIBC_INLINE void sleep_briefly() {
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
- __nvvm_nanosleep(64);
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+ LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
__builtin_amdgcn_s_sleep(2);
#elif defined(LIBC_TARGET_ARCH_IS_X86)
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index d4dbeebb7b21..985585cbfb89 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -111,6 +111,7 @@ add_math_entrypoint_object(fabsf128)
add_math_entrypoint_object(fdim)
add_math_entrypoint_object(fdimf)
add_math_entrypoint_object(fdiml)
+add_math_entrypoint_object(fdimf128)
add_math_entrypoint_object(floor)
add_math_entrypoint_object(floorf)
@@ -136,6 +137,7 @@ add_math_entrypoint_object(fmodf)
add_math_entrypoint_object(frexp)
add_math_entrypoint_object(frexpf)
add_math_entrypoint_object(frexpl)
+add_math_entrypoint_object(frexpf128)
add_math_entrypoint_object(hypot)
add_math_entrypoint_object(hypotf)
diff --git a/libc/src/math/fdimf128.h b/libc/src/math/fdimf128.h
new file mode 100644
index 000000000000..c6f488a586dc
--- /dev/null
+++ b/libc/src/math/fdimf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fdimf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FDIMF128_H
+#define LLVM_LIBC_SRC_MATH_FDIMF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 fdimf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FDIMF128_H
diff --git a/libc/src/math/frexpf128.h b/libc/src/math/frexpf128.h
new file mode 100644
index 000000000000..5d70860fa155
--- /dev/null
+++ b/libc/src/math/frexpf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for frexpf128 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FREXPF128_H
+#define LLVM_LIBC_SRC_MATH_FREXPF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 frexpf128(float128 x, int *exp);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FREXPF128_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 05b70be7b7b9..fdf383f07069 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -43,6 +43,7 @@ add_entrypoint_object(
COMPILE_OPTIONS
-O3
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -215,6 +216,7 @@ add_entrypoint_object(
HDRS
../fabsf128.h
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.basic_operations
COMPILE_OPTIONS
-O3
@@ -265,6 +267,7 @@ add_entrypoint_object(
COMPILE_OPTIONS
-O3
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -313,6 +316,7 @@ add_entrypoint_object(
COMPILE_OPTIONS
-O3
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -361,6 +365,7 @@ add_entrypoint_object(
COMPILE_OPTIONS
-O3
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -899,6 +904,7 @@ add_entrypoint_object(
HDRS
../copysignf128.h
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.manipulation_functions
COMPILE_OPTIONS
-O3
@@ -910,10 +916,10 @@ add_entrypoint_object(
frexp.cpp
HDRS
../frexp.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -922,10 +928,10 @@ add_entrypoint_object(
frexpf.cpp
HDRS
../frexpf.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -934,10 +940,23 @@ add_entrypoint_object(
frexpl.cpp
HDRS
../frexpl.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ frexpf128
+ SRCS
+ frexpf128.cpp
+ HDRS
+ ../frexpf128.h
COMPILE_OPTIONS
- -O2
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.float
+ libc.src.__support.FPUtil.manipulation_functions
)
add_entrypoint_object(
@@ -1298,6 +1317,7 @@ add_entrypoint_object(
HDRS
../fminf128.h
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.basic_operations
COMPILE_OPTIONS
-O3
@@ -1346,6 +1366,7 @@ add_entrypoint_object(
HDRS
../fmaxf128.h
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.basic_operations
COMPILE_OPTIONS
-O3
@@ -1394,6 +1415,7 @@ add_entrypoint_object(
HDRS
../sqrtf128.h
DEPENDS
+ libc.src.__support.macros.properties.float
libc.src.__support.FPUtil.sqrt
COMPILE_OPTIONS
-O3
@@ -1491,10 +1513,10 @@ add_entrypoint_object(
fdim.cpp
HDRS
../fdim.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.basic_operations
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -1503,10 +1525,10 @@ add_entrypoint_object(
fdimf.cpp
HDRS
../fdimf.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.basic_operations
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -1515,10 +1537,23 @@ add_entrypoint_object(
fdiml.cpp
HDRS
../fdiml.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.basic_operations
+)
+
+add_entrypoint_object(
+ fdimf128
+ SRCS
+ fdimf128.cpp
+ HDRS
+ ../fdimf128.h
COMPILE_OPTIONS
- -O2
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.float
+ libc.src.__support.FPUtil.basic_operations
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/acoshf.cpp b/libc/src/math/generic/acoshf.cpp
index 54b66bf42fc6..a4a75a7b0438 100644
--- a/libc/src/math/generic/acoshf.cpp
+++ b/libc/src/math/generic/acoshf.cpp
@@ -33,12 +33,8 @@ LLVM_LIBC_FUNCTION(float, acoshf, (float x)) {
}
if (LIBC_UNLIKELY(x_u >= 0x4f8ffb03)) {
- // Check for exceptional values.
- uint32_t x_abs = xbits.abs().uintval();
- if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
- // x is +inf or NaN.
+ if (LIBC_UNLIKELY(xbits.is_inf_or_nan()))
return x;
- }
// Helper functions to set results for exceptional cases.
auto round_result_slightly_down = [](float r) -> float {
diff --git a/libc/src/math/generic/asinhf.cpp b/libc/src/math/generic/asinhf.cpp
index ac059910b4ef..6e351786e3ec 100644
--- a/libc/src/math/generic/asinhf.cpp
+++ b/libc/src/math/generic/asinhf.cpp
@@ -59,10 +59,8 @@ LLVM_LIBC_FUNCTION(float, asinhf, (float x)) {
};
if (LIBC_UNLIKELY(x_abs >= 0x4bdd'65a5U)) {
- if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
- // x is +-inf or nan
+ if (LIBC_UNLIKELY(xbits.is_inf_or_nan()))
return x;
- }
// Exceptional cases when x > 2^24.
switch (x_abs) {
diff --git a/libc/src/math/generic/fdimf128.cpp b/libc/src/math/generic/fdimf128.cpp
new file mode 100644
index 000000000000..a3ea9e591610
--- /dev/null
+++ b/libc/src/math/generic/fdimf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fdimf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fdimf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, fdimf128, (float128 x, float128 y)) {
+ return fputil::fdim(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/frexpf128.cpp b/libc/src/math/generic/frexpf128.cpp
new file mode 100644
index 000000000000..b50f37d2dab4
--- /dev/null
+++ b/libc/src/math/generic/frexpf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of frexpf128 function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/frexpf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, frexpf128, (float128 x, int *exp)) {
+ return fputil::frexp(x, *exp);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt
index 129621f13928..65d5f3443074 100644
--- a/libc/src/stdbit/CMakeLists.txt
+++ b/libc/src/stdbit/CMakeLists.txt
@@ -3,6 +3,7 @@ set(prefixes
leading_ones
trailing_zeros
trailing_ones
+ first_leading_zero
)
set(suffixes c s i l ll)
foreach(prefix IN LISTS prefixes)
diff --git a/libc/src/stdbit/stdc_first_leading_zero_uc.cpp b/libc/src/stdbit/stdc_first_leading_zero_uc.cpp
new file mode 100644
index 000000000000..ffc1d9247406
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_uc.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of stdc_first_leading_zero_uc ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_first_leading_zero_uc.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_first_leading_zero_uc,
+ (unsigned char value)) {
+ return static_cast<unsigned>(cpp::first_leading_zero(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_first_leading_zero_uc.h b/libc/src/stdbit/stdc_first_leading_zero_uc.h
new file mode 100644
index 000000000000..63ecd20acc61
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_uc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_first_leading_zero_uc ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UC_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UC_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_first_leading_zero_uc(unsigned char value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UC_H
diff --git a/libc/src/stdbit/stdc_first_leading_zero_ui.cpp b/libc/src/stdbit/stdc_first_leading_zero_ui.cpp
new file mode 100644
index 000000000000..1eeab2963e6a
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_ui.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_first_leading_zero_ui ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_first_leading_zero_ui.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_first_leading_zero_ui, (unsigned value)) {
+ return static_cast<unsigned>(cpp::first_leading_zero(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_first_leading_zero_ui.h b/libc/src/stdbit/stdc_first_leading_zero_ui.h
new file mode 100644
index 000000000000..d8d5d9345010
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_ui.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_first_leading_zero_ui ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UI_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UI_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_first_leading_zero_ui(unsigned value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UI_H
diff --git a/libc/src/stdbit/stdc_first_leading_zero_ul.cpp b/libc/src/stdbit/stdc_first_leading_zero_ul.cpp
new file mode 100644
index 000000000000..6743d3eda516
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_ul.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of stdc_first_leading_zero_ul ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_first_leading_zero_ul.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_first_leading_zero_ul,
+ (unsigned long value)) {
+ return static_cast<unsigned>(cpp::first_leading_zero(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_first_leading_zero_ul.h b/libc/src/stdbit/stdc_first_leading_zero_ul.h
new file mode 100644
index 000000000000..8df1b55c2c3a
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_ul.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_first_leading_zero_ul ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UL_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_first_leading_zero_ul(unsigned long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_UL_H
diff --git a/libc/src/stdbit/stdc_first_leading_zero_ull.cpp b/libc/src/stdbit/stdc_first_leading_zero_ull.cpp
new file mode 100644
index 000000000000..8128dd3d59a7
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_ull.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of stdc_first_leading_zero_ull ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_first_leading_zero_ull.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_first_leading_zero_ull,
+ (unsigned long long value)) {
+ return static_cast<unsigned>(cpp::first_leading_zero(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_first_leading_zero_ull.h b/libc/src/stdbit/stdc_first_leading_zero_ull.h
new file mode 100644
index 000000000000..9aec5e7e5545
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_ull.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_first_leading_zero_ull ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_ULL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_ULL_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_first_leading_zero_ull(unsigned long long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_ULL_H
diff --git a/libc/src/stdbit/stdc_first_leading_zero_us.cpp b/libc/src/stdbit/stdc_first_leading_zero_us.cpp
new file mode 100644
index 000000000000..d931535e7690
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_us.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of stdc_first_leading_zero_us ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_first_leading_zero_us.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_first_leading_zero_us,
+ (unsigned short value)) {
+ return static_cast<unsigned>(cpp::first_leading_zero(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_first_leading_zero_us.h b/libc/src/stdbit/stdc_first_leading_zero_us.h
new file mode 100644
index 000000000000..8587378069ce
--- /dev/null
+++ b/libc/src/stdbit/stdc_first_leading_zero_us.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_first_leading_zero_us ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_US_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_US_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_first_leading_zero_us(unsigned short value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_FIRST_LEADING_ZERO_US_H
diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index e84fe622100e..dd669ff46c75 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -12,29 +12,31 @@
namespace LIBC_NAMESPACE {
-constexpr uint64_t TICKS_PER_NS = 1000000000UL;
+constexpr uint64_t TICKS_PER_SEC = 1000000000UL;
LLVM_LIBC_FUNCTION(int, nanosleep,
(const struct timespec *req, struct timespec *rem)) {
if (!GPU_CLOCKS_PER_SEC || !req)
return -1;
- uint64_t nsecs = req->tv_nsec + req->tv_sec * TICKS_PER_NS;
+ uint64_t nsecs = req->tv_nsec + req->tv_sec * TICKS_PER_SEC;
+ uint64_t tick_rate = TICKS_PER_SEC / GPU_CLOCKS_PER_SEC;
uint64_t start = gpu::fixed_frequency_clock();
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
- uint64_t end = start + nsecs / (TICKS_PER_NS / GPU_CLOCKS_PER_SEC);
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+ uint64_t end = start + (nsecs + tick_rate - 1) / tick_rate;
uint64_t cur = gpu::fixed_frequency_clock();
// The NVPTX architecture supports sleeping and guaruntees the actual time
// slept will be somewhere between zero and twice the requested amount. Here
// we will sleep again if we undershot the time.
while (cur < end) {
- __nvvm_nanosleep(static_cast<uint32_t>(nsecs));
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+ LIBC_INLINE_ASM("nanosleep.u32 %0;" ::"r"(nsecs));
cur = gpu::fixed_frequency_clock();
nsecs -= nsecs > cur - start ? cur - start : 0;
}
#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
- uint64_t end = start + nsecs / (TICKS_PER_NS / GPU_CLOCKS_PER_SEC);
+ uint64_t end = start + (nsecs + tick_rate - 1) / tick_rate;
uint64_t cur = gpu::fixed_frequency_clock();
// The AMDGPU architecture does not provide a sleep implementation with a
// known delay so we simply repeatedly sleep with a large value of ~960 clock
@@ -56,11 +58,11 @@ LLVM_LIBC_FUNCTION(int, nanosleep,
// Check to make sure we slept for at least the desired duration and set the
// remaining time if not.
- uint64_t elapsed = (stop - start) * (TICKS_PER_NS / GPU_CLOCKS_PER_SEC);
+ uint64_t elapsed = (stop - start) * tick_rate;
if (elapsed < nsecs) {
if (rem) {
- rem->tv_sec = (nsecs - elapsed) / TICKS_PER_NS;
- rem->tv_nsec = (nsecs - elapsed) % TICKS_PER_NS;
+ rem->tv_sec = (nsecs - elapsed) / TICKS_PER_SEC;
+ rem->tv_nsec = (nsecs - elapsed) % TICKS_PER_SEC;
}
return -1;
}
diff --git a/libc/src/time/gpu/time_utils.h b/libc/src/time/gpu/time_utils.h
index 531a748665b0..8a9a5f0f65b8 100644
--- a/libc/src/time/gpu/time_utils.h
+++ b/libc/src/time/gpu/time_utils.h
@@ -15,24 +15,13 @@ namespace LIBC_NAMESPACE {
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
// AMDGPU does not have a single set frequency. Different architectures and
-// cards can have vary values. Here we default to a few known values, but for
-// complete support the frequency needs to be read from the kernel driver.
-#if defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__) || \
- defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-// These architectures use a 100 MHz fixed frequency clock.
-constexpr uint64_t clock_freq = 100000000;
-#elif defined(__GFX9__)
-// These architectures use a 25 MHz fixed frequency clock expect for Vega 10
-// which is actually 27 Mhz. We default to 25 MHz in all cases anyway.
-constexpr uint64_t clock_freq = 25000000;
-#else
-// The frequency for these architecture is unknown. We simply default to zero.
-constexpr uint64_t clock_freq = 0;
-#endif
+// cards can have different values. The actualy frequency needs to be read from
+// the kernel driver and will be between 25 MHz and 100 MHz on most cards. All
+// cards following the GFX9 ISAs use a 100 MHz clock so we will default to that.
+constexpr uint64_t clock_freq = 100000000UL;
// We provide an externally visible symbol such that the runtime can set
-// this to the correct value. If it is not set we try to default to the
-// known values.
+// this to the correct value.
extern "C" [[gnu::visibility("protected")]] uint64_t
[[clang::address_space(4)]] __llvm_libc_clock_freq;
#define GPU_CLOCKS_PER_SEC static_cast<clock_t>(__llvm_libc_clock_freq)
diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp
index 8d5d4c162938..9a66a76de20b 100644
--- a/libc/test/include/stdbit_test.cpp
+++ b/libc/test/include/stdbit_test.cpp
@@ -43,6 +43,13 @@ unsigned stdc_trailing_ones_us(unsigned short) noexcept { return 0xDBU; }
unsigned stdc_trailing_ones_ui(unsigned) noexcept { return 0xDCU; }
unsigned stdc_trailing_ones_ul(unsigned long) noexcept { return 0xDDU; }
unsigned stdc_trailing_ones_ull(unsigned long long) noexcept { return 0xDFU; }
+unsigned stdc_first_leading_zero_uc(unsigned char) noexcept { return 0xEAU; }
+unsigned stdc_first_leading_zero_us(unsigned short) noexcept { return 0xEBU; }
+unsigned stdc_first_leading_zero_ui(unsigned) noexcept { return 0xECU; }
+unsigned stdc_first_leading_zero_ul(unsigned long) noexcept { return 0xEDU; }
+unsigned stdc_first_leading_zero_ull(unsigned long long) noexcept {
+ return 0xEFU;
+}
}
#include "include/llvm-libc-macros/stdbit-macros.h"
@@ -78,3 +85,11 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroTrailingOnes) {
EXPECT_EQ(stdc_trailing_ones(0UL), 0xDDU);
EXPECT_EQ(stdc_trailing_ones(0ULL), 0xDFU);
}
+
+TEST(LlvmLibcStdbitTest, TypeGenericMacroFirstLeadingZero) {
+ EXPECT_EQ(stdc_first_leading_zero(static_cast<unsigned char>(0U)), 0xEAU);
+ EXPECT_EQ(stdc_first_leading_zero(static_cast<unsigned short>(0U)), 0xEBU);
+ EXPECT_EQ(stdc_first_leading_zero(0U), 0xECU);
+ EXPECT_EQ(stdc_first_leading_zero(0UL), 0xEDU);
+ EXPECT_EQ(stdc_first_leading_zero(0ULL), 0xEFU);
+}
diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp
index fef551bc1f0e..00d8ca5d293a 100644
--- a/libc/test/src/__support/CPP/bit_test.cpp
+++ b/libc/test/src/__support/CPP/bit_test.cpp
@@ -206,4 +206,11 @@ TEST(LlvmLibcBitTest, Rotr) {
rotr<uint64_t>(0x12345678deadbeefULL, -19));
}
+TYPED_TEST(LlvmLibcBitTest, FirstLeadingZero, UnsignedTypes) {
+ EXPECT_EQ(first_leading_zero<T>(cpp::numeric_limits<T>::max()), 0);
+ for (int i = 0U; i != cpp::numeric_limits<T>::digits; ++i)
+ EXPECT_EQ(first_leading_zero<T>(~(T(1) << i)),
+ cpp::numeric_limits<T>::digits - i);
+}
+
} // namespace LIBC_NAMESPACE::cpp
diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index 4504a4f0cfcc..b1c4b6691b6e 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -265,7 +265,7 @@ template <typename T> constexpr auto make(Sign sign, FP fp) {
case FP::QUIET_NAN:
return T::quiet_nan(sign);
}
-};
+}
// Tests all properties for all types of float.
TYPED_TEST(LlvmLibcFPBitsTest, Properties, FPTypes) {
diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
index 54821e21f9cc..58b318c7cfa6 100644
--- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp
+++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
@@ -13,12 +13,8 @@
namespace {
enum { lane_size = 8, port_count = 4 };
-struct Packet {
- uint64_t unused;
-};
-
-using ProcAType = LIBC_NAMESPACE::rpc::Process<false, Packet>;
-using ProcBType = LIBC_NAMESPACE::rpc::Process<true, Packet>;
+using ProcAType = LIBC_NAMESPACE::rpc::Process<false>;
+using ProcBType = LIBC_NAMESPACE::rpc::Process<true>;
static_assert(ProcAType::inbox_offset(port_count) ==
ProcBType::outbox_offset(port_count));
@@ -26,7 +22,7 @@ static_assert(ProcAType::inbox_offset(port_count) ==
static_assert(ProcAType::outbox_offset(port_count) ==
ProcBType::inbox_offset(port_count));
-enum { alloc_size = ProcAType::allocation_size(port_count) };
+enum { alloc_size = ProcAType::allocation_size(port_count, 1) };
alignas(64) char buffer[alloc_size] = {0};
} // namespace
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 4ee81ec1cccf..0d55be5d98bd 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -779,9 +779,7 @@ add_fp_unittest(
HDRS
FrexpTest.h
DEPENDS
- libc.include.math
libc.src.math.frexp
- libc.src.__support.FPUtil.basic_operations
)
add_fp_unittest(
@@ -793,9 +791,7 @@ add_fp_unittest(
HDRS
FrexpTest.h
DEPENDS
- libc.include.math
libc.src.math.frexpf
- libc.src.__support.FPUtil.basic_operations
)
add_fp_unittest(
@@ -807,9 +803,19 @@ add_fp_unittest(
HDRS
FrexpTest.h
DEPENDS
- libc.include.math
libc.src.math.frexpl
- libc.src.__support.FPUtil.basic_operations
+)
+
+add_fp_unittest(
+ frexpf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ frexpf128_test.cpp
+ HDRS
+ FrexpTest.h
+ DEPENDS
+ libc.src.math.frexpf128
)
# FIXME: These tests are currently broken for NVPTX.
@@ -1007,7 +1013,6 @@ add_fp_unittest(
HDRS
FDimTest.h
DEPENDS
- libc.include.math
libc.src.math.fdimf
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.fp_bits
@@ -1022,7 +1027,6 @@ add_fp_unittest(
HDRS
FDimTest.h
DEPENDS
- libc.include.math
libc.src.math.fdim
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.fp_bits
@@ -1037,12 +1041,25 @@ add_fp_unittest(
HDRS
FDimTest.h
DEPENDS
- libc.include.math
libc.src.math.fdiml
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.fp_bits
)
+add_fp_unittest(
+ fdimf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fdimf128_test.cpp
+ HDRS
+ FDimTest.h
+ DEPENDS
+ libc.src.math.fdimf128
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.fp_bits
+)
+
# FIXME: These tests are currently broken on the GPU.
if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
add_fp_unittest(
diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h
index e00b4fd5c42e..5cb3dd117348 100644
--- a/libc/test/src/math/smoke/FDimTest.h
+++ b/libc/test/src/math/smoke/FDimTest.h
@@ -10,7 +10,6 @@
#include "src/__support/FPUtil/FPBits.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <math.h>
template <typename T>
class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test {
@@ -26,7 +25,7 @@ public:
const T neg_zero = FPBits::zero(Sign::NEG).get_val();
const T nan = FPBits::quiet_nan().get_val();
- void test_na_n_arg(FuncPtr func) {
+ void test_nan_arg(FuncPtr func) {
EXPECT_FP_EQ(nan, func(nan, inf));
EXPECT_FP_EQ(nan, func(neg_inf, nan));
EXPECT_FP_EQ(nan, func(nan, zero));
@@ -66,12 +65,15 @@ public:
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
++i, v += STEP, w -= STEP) {
- T x = FPBits(v).get_val(), y = FPBits(w).get_val();
- if (isnan(x) || isinf(x))
+ FPBits xbits(v), ybits(w);
+ if (xbits.is_inf_or_nan())
continue;
- if (isnan(y) || isinf(y))
+ if (ybits.is_inf_or_nan())
continue;
+ T x = xbits.get_val();
+ T y = ybits.get_val();
+
if (x > y) {
EXPECT_FP_EQ(x - y, func(x, y));
} else {
@@ -80,3 +82,12 @@ public:
}
}
};
+
+#define LIST_FDIM_TESTS(T, func) \
+ using LlvmLibcFDimTest = FDimTestTemplate<T>; \
+ TEST_F(LlvmLibcFDimTest, NaNArg) { test_nan_arg(&func); } \
+ TEST_F(LlvmLibcFDimTest, InfArg) { test_inf_arg(&func); } \
+ TEST_F(LlvmLibcFDimTest, NegInfArg) { test_neg_inf_arg(&func); } \
+ TEST_F(LlvmLibcFDimTest, BothZero) { test_both_zero(&func); } \
+ TEST_F(LlvmLibcFDimTest, InFloatRange) { test_in_range(&func); } \
+ static_assert(true, "Require semicolon.")
diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h
index 981872aa128e..bf99a9a559f0 100644
--- a/libc/test/src/math/smoke/FrexpTest.h
+++ b/libc/test/src/math/smoke/FrexpTest.h
@@ -10,81 +10,76 @@
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <math.h>
-
template <typename T> class FrexpTest : public LIBC_NAMESPACE::testing::Test {
DECLARE_SPECIAL_CONSTANTS(T)
- static constexpr StorageType HIDDEN_BIT =
- StorageType(1) << LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN;
-
public:
typedef T (*FrexpFunc)(T, int *);
void testSpecialNumbers(FrexpFunc func) {
int exponent;
- ASSERT_FP_EQ(aNaN, func(aNaN, &exponent));
- ASSERT_FP_EQ(inf, func(inf, &exponent));
- ASSERT_FP_EQ(neg_inf, func(neg_inf, &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, &exponent));
- ASSERT_FP_EQ(0.0, func(0.0, &exponent));
- ASSERT_EQ(exponent, 0);
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, &exponent));
+ EXPECT_EQ(exponent, 0);
- ASSERT_FP_EQ(-0.0, func(-0.0, &exponent));
- ASSERT_EQ(exponent, 0);
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.0, func(-0.0, &exponent));
+ EXPECT_EQ(exponent, 0);
}
void testPowersOfTwo(FrexpFunc func) {
int exponent;
- EXPECT_FP_EQ(T(0.5), func(T(1.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(1.0), &exponent));
EXPECT_EQ(exponent, 1);
- EXPECT_FP_EQ(T(-0.5), func(T(-1.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-1.0), &exponent));
EXPECT_EQ(exponent, 1);
- EXPECT_FP_EQ(T(0.5), func(T(2.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(2.0), &exponent));
EXPECT_EQ(exponent, 2);
- EXPECT_FP_EQ(T(-0.5), func(T(-2.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-2.0), &exponent));
EXPECT_EQ(exponent, 2);
- EXPECT_FP_EQ(T(0.5), func(T(4.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(4.0), &exponent));
EXPECT_EQ(exponent, 3);
- EXPECT_FP_EQ(T(-0.5), func(T(-4.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-4.0), &exponent));
EXPECT_EQ(exponent, 3);
- EXPECT_FP_EQ(T(0.5), func(T(8.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(8.0), &exponent));
EXPECT_EQ(exponent, 4);
- EXPECT_FP_EQ(T(-0.5), func(T(-8.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-8.0), &exponent));
EXPECT_EQ(exponent, 4);
- EXPECT_FP_EQ(T(0.5), func(T(16.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(16.0), &exponent));
EXPECT_EQ(exponent, 5);
- EXPECT_FP_EQ(T(-0.5), func(T(-16.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-16.0), &exponent));
EXPECT_EQ(exponent, 5);
- EXPECT_FP_EQ(T(0.5), func(T(32.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.5), func(T(32.0), &exponent));
EXPECT_EQ(exponent, 6);
- EXPECT_FP_EQ(T(-0.5), func(T(-32.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.5), func(T(-32.0), &exponent));
EXPECT_EQ(exponent, 6);
}
void testSomeIntegers(FrexpFunc func) {
int exponent;
- EXPECT_FP_EQ(T(0.75), func(T(24.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.75), func(T(24.0), &exponent));
EXPECT_EQ(exponent, 5);
- EXPECT_FP_EQ(T(-0.75), func(T(-24.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.75), func(T(-24.0), &exponent));
EXPECT_EQ(exponent, 5);
- EXPECT_FP_EQ(T(0.625), func(T(40.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.625), func(T(40.0), &exponent));
EXPECT_EQ(exponent, 6);
- EXPECT_FP_EQ(T(-0.625), func(T(-40.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.625), func(T(-40.0), &exponent));
EXPECT_EQ(exponent, 6);
- EXPECT_FP_EQ(T(0.78125), func(T(800.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.78125), func(T(800.0), &exponent));
EXPECT_EQ(exponent, 10);
- EXPECT_FP_EQ(T(-0.78125), func(T(-800.0), &exponent));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.78125), func(T(-800.0), &exponent));
EXPECT_EQ(exponent, 10);
}
};
@@ -93,4 +88,5 @@ public:
using LlvmLibcFrexpTest = FrexpTest<T>; \
TEST_F(LlvmLibcFrexpTest, SpecialNumbers) { testSpecialNumbers(&func); } \
TEST_F(LlvmLibcFrexpTest, PowersOfTwo) { testPowersOfTwo(&func); } \
- TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); }
+ TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); } \
+ static_assert(true, "Require semicolon.")
diff --git a/libc/test/src/math/smoke/fdim_test.cpp b/libc/test/src/math/smoke/fdim_test.cpp
index 2f00a30ad1ee..e1c150da90af 100644
--- a/libc/test/src/math/smoke/fdim_test.cpp
+++ b/libc/test/src/math/smoke/fdim_test.cpp
@@ -8,26 +8,6 @@
#include "FDimTest.h"
-#include "src/__support/FPUtil/FPBits.h"
#include "src/math/fdim.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
-using LlvmLibcFDimTest = FDimTestTemplate<double>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdim) { test_na_n_arg(&LIBC_NAMESPACE::fdim); }
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdim) { test_inf_arg(&LIBC_NAMESPACE::fdim); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdim) {
- test_neg_inf_arg(&LIBC_NAMESPACE::fdim);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdim) {
- test_both_zero(&LIBC_NAMESPACE::fdim);
-}
-
-TEST_F(LlvmLibcFDimTest, InDoubleRange_fdim) {
- test_in_range(&LIBC_NAMESPACE::fdim);
-}
+LIST_FDIM_TESTS(double, LIBC_NAMESPACE::fdim);
diff --git a/libc/test/src/math/smoke/fdimf128_test.cpp b/libc/test/src/math/smoke/fdimf128_test.cpp
new file mode 100644
index 000000000000..8e65c2b4a1ec
--- /dev/null
+++ b/libc/test/src/math/smoke/fdimf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fdimf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FDimTest.h"
+
+#include "src/math/fdimf128.h"
+
+LIST_FDIM_TESTS(float128, LIBC_NAMESPACE::fdimf128);
diff --git a/libc/test/src/math/smoke/fdimf_test.cpp b/libc/test/src/math/smoke/fdimf_test.cpp
index 27511baf25b6..9c27c1d2fc20 100644
--- a/libc/test/src/math/smoke/fdimf_test.cpp
+++ b/libc/test/src/math/smoke/fdimf_test.cpp
@@ -8,28 +8,6 @@
#include "FDimTest.h"
-#include "src/__support/FPUtil/FPBits.h"
#include "src/math/fdimf.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
-using LlvmLibcFDimTest = FDimTestTemplate<float>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdimf) {
- test_na_n_arg(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdimf) { test_inf_arg(&LIBC_NAMESPACE::fdimf); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdimf) {
- test_neg_inf_arg(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdimf) {
- test_both_zero(&LIBC_NAMESPACE::fdimf);
-}
-
-TEST_F(LlvmLibcFDimTest, InFloatRange_fdimf) {
- test_in_range(&LIBC_NAMESPACE::fdimf);
-}
+LIST_FDIM_TESTS(float, LIBC_NAMESPACE::fdimf);
diff --git a/libc/test/src/math/smoke/fdiml_test.cpp b/libc/test/src/math/smoke/fdiml_test.cpp
index 45aedb0a1cde..ed448a612ec7 100644
--- a/libc/test/src/math/smoke/fdiml_test.cpp
+++ b/libc/test/src/math/smoke/fdiml_test.cpp
@@ -8,28 +8,6 @@
#include "FDimTest.h"
-#include "src/__support/FPUtil/FPBits.h"
#include "src/math/fdiml.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
-using LlvmLibcFDimTest = FDimTestTemplate<long double>;
-
-TEST_F(LlvmLibcFDimTest, NaNArg_fdiml) {
- test_na_n_arg(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, InfArg_fdiml) { test_inf_arg(&LIBC_NAMESPACE::fdiml); }
-
-TEST_F(LlvmLibcFDimTest, NegInfArg_fdiml) {
- test_neg_inf_arg(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, BothZero_fdiml) {
- test_both_zero(&LIBC_NAMESPACE::fdiml);
-}
-
-TEST_F(LlvmLibcFDimTest, InLongDoubleRange_fdiml) {
- test_in_range(&LIBC_NAMESPACE::fdiml);
-}
+LIST_FDIM_TESTS(long double, LIBC_NAMESPACE::fdiml);
diff --git a/libc/test/src/math/smoke/frexp_test.cpp b/libc/test/src/math/smoke/frexp_test.cpp
index 4d078baffcb3..79aa9723d314 100644
--- a/libc/test/src/math/smoke/frexp_test.cpp
+++ b/libc/test/src/math/smoke/frexp_test.cpp
@@ -10,4 +10,4 @@
#include "src/math/frexp.h"
-LIST_FREXP_TESTS(double, LIBC_NAMESPACE::frexp)
+LIST_FREXP_TESTS(double, LIBC_NAMESPACE::frexp);
diff --git a/libc/test/src/math/smoke/frexpf128_test.cpp b/libc/test/src/math/smoke/frexpf128_test.cpp
new file mode 100644
index 000000000000..a0df32f5fbdd
--- /dev/null
+++ b/libc/test/src/math/smoke/frexpf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for frexpf128 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FrexpTest.h"
+
+#include "src/math/frexpf128.h"
+
+LIST_FREXP_TESTS(float128, LIBC_NAMESPACE::frexpf128);
diff --git a/libc/test/src/math/smoke/frexpf_test.cpp b/libc/test/src/math/smoke/frexpf_test.cpp
index 577eb9609cfc..f2ae637e9a6c 100644
--- a/libc/test/src/math/smoke/frexpf_test.cpp
+++ b/libc/test/src/math/smoke/frexpf_test.cpp
@@ -10,4 +10,4 @@
#include "src/math/frexpf.h"
-LIST_FREXP_TESTS(float, LIBC_NAMESPACE::frexpf)
+LIST_FREXP_TESTS(float, LIBC_NAMESPACE::frexpf);
diff --git a/libc/test/src/math/smoke/frexpl_test.cpp b/libc/test/src/math/smoke/frexpl_test.cpp
index e5184cd225bc..3e1f8b4204e6 100644
--- a/libc/test/src/math/smoke/frexpl_test.cpp
+++ b/libc/test/src/math/smoke/frexpl_test.cpp
@@ -10,4 +10,4 @@
#include "src/math/frexpl.h"
-LIST_FREXP_TESTS(long double, LIBC_NAMESPACE::frexpl)
+LIST_FREXP_TESTS(long double, LIBC_NAMESPACE::frexpl);
diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt
index fab5f67bbd9b..bc7e49d186a0 100644
--- a/libc/test/src/stdbit/CMakeLists.txt
+++ b/libc/test/src/stdbit/CMakeLists.txt
@@ -5,6 +5,7 @@ set(prefixes
leading_ones
trailing_zeros
trailing_ones
+ first_leading_zero
)
set(suffixes c s i l ll)
foreach(prefix IN LISTS prefixes)
diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp
new file mode 100644
index 000000000000..ac7e8c7d9e64
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_first_leading_zero_uc --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_first_leading_zero_uc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcFirstLeadingZeroUcTest, ALL) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_uc(UCHAR_MAX), 0U);
+}
+
+TEST(LlvmLibcStdcFirstLeadingZeroUcTest, ZeroHot) {
+ for (unsigned i = 0U; i != UCHAR_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_uc(~(1U << i)),
+ UCHAR_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_ui_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_ui_test.cpp
new file mode 100644
index 000000000000..79a4e5394349
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_first_leading_zero_ui_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_first_leading_zero_ui --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_first_leading_zero_ui.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcFirstLeadingZeroUiTest, ALL) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_ui(UINT_MAX), 0U);
+}
+
+TEST(LlvmLibcStdcFirstLeadingZeroUiTest, ZeroHot) {
+ for (unsigned i = 0U; i != UINT_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_ui(~(1U << i)),
+ UINT_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_ul_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_ul_test.cpp
new file mode 100644
index 000000000000..92cac6c8bffd
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_first_leading_zero_ul_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_first_leading_zero_ul --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_first_leading_zero_ul.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcFirstLeadingZeroUlTest, ALL) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_ul(ULONG_MAX), 0U);
+}
+
+TEST(LlvmLibcStdcFirstLeadingZeroUlTest, ZeroHot) {
+ for (unsigned i = 0U; i != ULONG_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_ul(~(1UL << i)),
+ ULONG_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_ull_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_ull_test.cpp
new file mode 100644
index 000000000000..a5afdad591ae
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_first_leading_zero_ull_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_first_leading_zero_ull -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_first_leading_zero_ull.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcFirstLeadingZeroUllTest, ALL) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_ull(ULLONG_MAX), 0U);
+}
+
+TEST(LlvmLibcStdcFirstLeadingZeroUllTest, ZeroHot) {
+ for (unsigned i = 0U; i != ULLONG_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_ull(~(1ULL << i)),
+ ULLONG_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp
new file mode 100644
index 000000000000..37f8612675a7
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_first_leading_zero_us --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_first_leading_zero_us.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcFirstLeadingZeroUsTest, ALL) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_us(USHRT_MAX), 0U);
+}
+
+TEST(LlvmLibcStdcFirstLeadingZeroUsTest, ZeroHot) {
+ for (unsigned i = 0U; i != USHRT_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_us(~(1U << i)),
+ USHRT_WIDTH - i);
+}
diff --git a/libc/test/src/sys/socket/linux/bind_test.cpp b/libc/test/src/sys/socket/linux/bind_test.cpp
index 305e4889f394..e70cbd578290 100644
--- a/libc/test/src/sys/socket/linux/bind_test.cpp
+++ b/libc/test/src/sys/socket/linux/bind_test.cpp
@@ -13,7 +13,6 @@
#include "src/unistd/close.h"
#include "src/errno/libc_errno.h"
-#include "test/UnitTest/LibcTest.h"
#include "test/UnitTest/Test.h"
#include <sys/socket.h> // For AF_UNIX and SOCK_DGRAM
diff --git a/libc/utils/HdrGen/PublicAPICommand.cpp b/libc/utils/HdrGen/PublicAPICommand.cpp
index b1c7a072658f..cf6984be74d1 100644
--- a/libc/utils/HdrGen/PublicAPICommand.cpp
+++ b/libc/utils/HdrGen/PublicAPICommand.cpp
@@ -102,6 +102,14 @@ void writeAPIFromIndex(APIIndexer &G,
llvm::Record *RetValSpec = FunctionSpec->getValueAsDef("Return");
llvm::Record *ReturnType = RetValSpec->getValueAsDef("ReturnType");
+ // TODO: https://github.com/llvm/llvm-project/issues/81208
+ // Ideally, we should group functions based on their guarding macros.
+ bool Guarded =
+ (FunctionSpec->getType()->getAsString() == "GuardedFunctionSpec");
+
+ if (Guarded)
+ OS << "#ifdef " << FunctionSpec->getValueAsString("Guard") << "\n";
+
OS << G.getTypeAsString(ReturnType) << " " << Name << "(";
auto ArgsList = FunctionSpec->getValueAsListOfDefs("Args");
@@ -112,7 +120,12 @@ void writeAPIFromIndex(APIIndexer &G,
OS << ", ";
}
- OS << ") __NOEXCEPT;\n\n";
+ OS << ") __NOEXCEPT;\n";
+
+ if (Guarded)
+ OS << "#endif // " << FunctionSpec->getValueAsString("Guard") << "\n";
+
+ OS << "\n";
}
// Make another pass over entrypoints to emit object declarations.
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index a2e5d0fd5a83..4e535a294a19 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -396,62 +396,42 @@ const void *rpc_get_client_buffer(uint32_t device_id) {
uint64_t rpc_get_client_size() { return sizeof(rpc::Client); }
-using ServerPort = std::variant<rpc::Server<1>::Port *, rpc::Server<32>::Port *,
- rpc::Server<64>::Port *>;
+using ServerPort = std::variant<rpc::Server<0>::Port *>;
ServerPort get_port(rpc_port_t ref) {
- if (ref.lane_size == 1)
- return reinterpret_cast<rpc::Server<1>::Port *>(ref.handle);
- else if (ref.lane_size == 32)
- return reinterpret_cast<rpc::Server<32>::Port *>(ref.handle);
- else if (ref.lane_size == 64)
- return reinterpret_cast<rpc::Server<64>::Port *>(ref.handle);
- else
- __builtin_unreachable();
+ return reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
}
void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
- auto port = get_port(ref);
- std::visit(
- [=](auto &port) {
- port->send([=](rpc::Buffer *buffer) {
- callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
- });
- },
- port);
+ auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+ port->send([=](rpc::Buffer *buffer) {
+ callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
+ });
}
void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) {
- auto port = get_port(ref);
- std::visit([=](auto &port) { port->send_n(src, size); }, port);
+ auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+ port->send_n(src, size);
}
void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
- auto port = get_port(ref);
- std::visit(
- [=](auto &port) {
- port->recv([=](rpc::Buffer *buffer) {
- callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
- });
- },
- port);
+ auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+ port->recv([=](rpc::Buffer *buffer) {
+ callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
+ });
}
void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc,
void *data) {
- auto port = get_port(ref);
+ auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
auto alloc_fn = [=](uint64_t size) { return alloc(size, data); };
- std::visit([=](auto &port) { port->recv_n(dst, size, alloc_fn); }, port);
+ port->recv_n(dst, size, alloc_fn);
}
void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
void *data) {
- auto port = get_port(ref);
- std::visit(
- [=](auto &port) {
- port->recv_and_send([=](rpc::Buffer *buffer) {
- callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
- });
- },
- port);
+ auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+ port->recv_and_send([=](rpc::Buffer *buffer) {
+ callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
+ });
}
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index a5c6fa22cec0..468226c0c2dd 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -24,7 +24,7 @@ Status
=================================================== =================
Macro Name Value
=================================================== =================
- **C++ 14**
+ **C++14**
---------------------------------------------------------------------
``__cpp_lib_chrono_udls`` ``201304L``
--------------------------------------------------- -----------------
@@ -66,7 +66,7 @@ Status
--------------------------------------------------- -----------------
``__cpp_lib_tuples_by_type`` ``201304L``
--------------------------------------------------- -----------------
- **C++ 17**
+ **C++17**
---------------------------------------------------------------------
``__cpp_lib_addressof_constexpr`` ``201603L``
--------------------------------------------------- -----------------
@@ -166,7 +166,7 @@ Status
--------------------------------------------------- -----------------
``__cpp_lib_void_t`` ``201411L``
--------------------------------------------------- -----------------
- **C++ 20**
+ **C++20**
---------------------------------------------------------------------
``__cpp_lib_array_constexpr`` ``201811L``
--------------------------------------------------- -----------------
@@ -300,7 +300,7 @@ Status
--------------------------------------------------- -----------------
``__cpp_lib_unwrap_ref`` ``201811L``
--------------------------------------------------- -----------------
- **C++ 23**
+ **C++23**
---------------------------------------------------------------------
``__cpp_lib_adaptor_iterator_pair_constructor`` ``202106L``
--------------------------------------------------- -----------------
@@ -388,7 +388,7 @@ Status
--------------------------------------------------- -----------------
``__cpp_lib_unreachable`` ``202202L``
--------------------------------------------------- -----------------
- **C++ 26**
+ **C++26**
---------------------------------------------------------------------
``__cpp_lib_associative_heterogeneous_insertion`` *unimplemented*
--------------------------------------------------- -----------------
diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst
index 533c3fbd2a1e..ee2b81d3b9e7 100644
--- a/libcxx/docs/Modules.rst
+++ b/libcxx/docs/Modules.rst
@@ -218,9 +218,13 @@ Building this project is done with the following steps, assuming the files
$ mkdir build
$ cmake -G Ninja -S . -B build -DCMAKE_CXX_COMPILER=<path-to-compiler> -DLIBCXX_BUILD=<build>
+ $ ninja -j1 std -C build
$ ninja -C build
$ build/main
+.. note:: The ``std`` dependencies of ``std.compat`` are not always resolved when
+ building the ``std`` target using multiple jobs.
+
.. warning:: ``<path-to-compiler>`` should point to the real binary and
not to a symlink.
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 316127f44e0d..f0e9c4090f9c 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -192,7 +192,7 @@
"`1203 <https://wg21.link/LWG1203>`__","More useful rvalue stream insertion","Prague","|Complete|","12.0"
"`2859 <https://wg21.link/LWG2859>`__","Definition of *reachable* in [ptr.launder] misses pointer arithmetic from pointer-interconvertible object","Prague","",""
"`3018 <https://wg21.link/LWG3018>`__","``shared_ptr``\ of function type","Prague","",""
-"`3050 <https://wg21.link/LWG3050>`__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","","","|chrono|"
+"`3050 <https://wg21.link/LWG3050>`__","Conversion specification problem in ``chrono::duration``\ constructor","Prague","|Complete|","19.0","|chrono|"
"`3141 <https://wg21.link/LWG3141>`__","``CopyConstructible``\ doesn't preserve source values","Prague","|Nothing to do|",""
"`3150 <https://wg21.link/LWG3150>`__","``UniformRandomBitGenerator``\ should validate ``min``\ and ``max``\ ","Prague","|Complete|","13.0","|ranges|"
"`3175 <https://wg21.link/LWG3175>`__","The ``CommonReference``\ requirement of concept ``SwappableWith``\ is not satisfied in the example","Prague","|Complete|","13.0"
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 6ded426640f0..d55dc66a91bc 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -427,7 +427,6 @@ set(files
__fwd/array.h
__fwd/bit_reference.h
__fwd/fstream.h
- __fwd/get.h
__fwd/hash.h
__fwd/ios.h
__fwd/istream.h
diff --git a/libcxx/include/__chrono/duration.h b/libcxx/include/__chrono/duration.h
index 5693ee644091..1e81420244b1 100644
--- a/libcxx/include/__chrono/duration.h
+++ b/libcxx/include/__chrono/duration.h
@@ -412,7 +412,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR
template <class _Rep1,
class _Period,
class _Rep2,
- __enable_if_t<is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value, int> = 0>
+ __enable_if_t<is_convertible<const _Rep2&, typename common_type<_Rep1, _Rep2>::type>::value, int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration<typename common_type<_Rep1, _Rep2>::type, _Period>
operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) {
typedef typename common_type<_Rep1, _Rep2>::type _Cr;
@@ -423,7 +423,7 @@ operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) {
template <class _Rep1,
class _Period,
class _Rep2,
- __enable_if_t<is_convertible<_Rep1, typename common_type<_Rep1, _Rep2>::type>::value, int> = 0>
+ __enable_if_t<is_convertible<const _Rep1&, typename common_type<_Rep1, _Rep2>::type>::value, int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration<typename common_type<_Rep1, _Rep2>::type, _Period>
operator*(const _Rep1& __s, const duration<_Rep2, _Period>& __d) {
return __d * __s;
@@ -435,7 +435,7 @@ template <class _Rep1,
class _Period,
class _Rep2,
__enable_if_t<!__is_duration<_Rep2>::value &&
- is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value,
+ is_convertible<const _Rep2&, typename common_type<_Rep1, _Rep2>::type>::value,
int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration<typename common_type<_Rep1, _Rep2>::type, _Period>
operator/(const duration<_Rep1, _Period>& __d, const _Rep2& __s) {
@@ -457,7 +457,7 @@ template <class _Rep1,
class _Period,
class _Rep2,
__enable_if_t<!__is_duration<_Rep2>::value &&
- is_convertible<_Rep2, typename common_type<_Rep1, _Rep2>::type>::value,
+ is_convertible<const _Rep2&, typename common_type<_Rep1, _Rep2>::type>::value,
int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration<typename common_type<_Rep1, _Rep2>::type, _Period>
operator%(const duration<_Rep1, _Period>& __d, const _Rep2& __s) {
diff --git a/libcxx/include/__compare/strong_order.h b/libcxx/include/__compare/strong_order.h
index 5f6ade5aef8e..3dc819e64251 100644
--- a/libcxx/include/__compare/strong_order.h
+++ b/libcxx/include/__compare/strong_order.h
@@ -13,11 +13,14 @@
#include <__compare/compare_three_way.h>
#include <__compare/ordering.h>
#include <__config>
+#include <__math/exponential_functions.h>
+#include <__math/traits.h>
#include <__type_traits/conditional.h>
#include <__type_traits/decay.h>
+#include <__type_traits/is_floating_point.h>
+#include <__type_traits/is_same.h>
#include <__utility/forward.h>
#include <__utility/priority_tag.h>
-#include <cmath>
#include <cstdint>
#include <limits>
@@ -66,27 +69,27 @@ struct __fn {
return strong_ordering::greater;
} else if (__t == __u) {
if constexpr (numeric_limits<_Dp>::radix == 2) {
- return std::signbit(__u) <=> std::signbit(__t);
+ return __math::signbit(__u) <=> __math::signbit(__t);
} else {
// This is bullet 3 of the IEEE754 algorithm, relevant
// only for decimal floating-point;
// see https://stackoverflow.com/questions/69068075/
- if (__t == 0 || std::isinf(__t)) {
- return std::signbit(__u) <=> std::signbit(__t);
+ if (__t == 0 || __math::isinf(__t)) {
+ return __math::signbit(__u) <=> __math::signbit(__t);
} else {
int __texp, __uexp;
- (void)std::frexp(__t, &__texp);
- (void)std::frexp(__u, &__uexp);
+ (void)__math::frexp(__t, &__texp);
+ (void)__math::frexp(__u, &__uexp);
return (__t < 0) ? (__texp <=> __uexp) : (__uexp <=> __texp);
}
}
} else {
// They're unordered, so one of them must be a NAN.
// The order is -QNAN, -SNAN, numbers, +SNAN, +QNAN.
- bool __t_is_nan = std::isnan(__t);
- bool __u_is_nan = std::isnan(__u);
- bool __t_is_negative = std::signbit(__t);
- bool __u_is_negative = std::signbit(__u);
+ bool __t_is_nan = __math::isnan(__t);
+ bool __u_is_nan = __math::isnan(__u);
+ bool __t_is_negative = __math::signbit(__t);
+ bool __u_is_negative = __math::signbit(__u);
using _IntType =
conditional_t< sizeof(__t) == sizeof(int32_t),
int32_t,
diff --git a/libcxx/include/__compare/weak_order.h b/libcxx/include/__compare/weak_order.h
index 9f719eb64bbc..b82a708c29a1 100644
--- a/libcxx/include/__compare/weak_order.h
+++ b/libcxx/include/__compare/weak_order.h
@@ -13,10 +13,12 @@
#include <__compare/ordering.h>
#include <__compare/strong_order.h>
#include <__config>
+#include <__math/traits.h>
#include <__type_traits/decay.h>
+#include <__type_traits/is_floating_point.h>
+#include <__type_traits/is_same.h>
#include <__utility/forward.h>
#include <__utility/priority_tag.h>
-#include <cmath>
#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER
# pragma GCC system_header
@@ -51,10 +53,10 @@ struct __fn {
return weak_ordering::greater;
} else {
// Otherwise, at least one of them is a NaN.
- bool __t_is_nan = std::isnan(__t);
- bool __u_is_nan = std::isnan(__u);
- bool __t_is_negative = std::signbit(__t);
- bool __u_is_negative = std::signbit(__u);
+ bool __t_is_nan = __math::isnan(__t);
+ bool __u_is_nan = __math::isnan(__u);
+ bool __t_is_negative = __math::signbit(__t);
+ bool __u_is_negative = __math::signbit(__u);
if (__t_is_nan && __u_is_nan) {
return (__u_is_negative <=> __t_is_negative);
} else if (__t_is_nan) {
diff --git a/libcxx/include/__fwd/array.h b/libcxx/include/__fwd/array.h
index 9a79effb617d..ff3a3eeeefc7 100644
--- a/libcxx/include/__fwd/array.h
+++ b/libcxx/include/__fwd/array.h
@@ -21,6 +21,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Tp, size_t _Size>
struct _LIBCPP_TEMPLATE_VIS array;
+template <size_t _Ip, class _Tp, size_t _Size>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& get(array<_Tp, _Size>&) _NOEXCEPT;
+
+template <size_t _Ip, class _Tp, size_t _Size>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& get(const array<_Tp, _Size>&) _NOEXCEPT;
+
+#ifndef _LIBCPP_CXX03_LANG
+template <size_t _Ip, class _Tp, size_t _Size>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& get(array<_Tp, _Size>&&) _NOEXCEPT;
+
+template <size_t _Ip, class _Tp, size_t _Size>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&&) _NOEXCEPT;
+#endif
+
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___FWD_ARRAY_H
diff --git a/libcxx/include/__fwd/get.h b/libcxx/include/__fwd/get.h
deleted file mode 100644
index e7261b826953..000000000000
--- a/libcxx/include/__fwd/get.h
+++ /dev/null
@@ -1,99 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___FWD_GET_H
-#define _LIBCPP___FWD_GET_H
-
-#include <__concepts/copyable.h>
-#include <__config>
-#include <__fwd/array.h>
-#include <__fwd/pair.h>
-#include <__fwd/subrange.h>
-#include <__fwd/tuple.h>
-#include <__tuple/tuple_element.h>
-#include <cstddef>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-# pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-#ifndef _LIBCPP_CXX03_LANG
-
-template <size_t _Ip, class... _Tp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, tuple<_Tp...> >::type&
-get(tuple<_Tp...>&) _NOEXCEPT;
-
-template <size_t _Ip, class... _Tp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, tuple<_Tp...> >::type&
-get(const tuple<_Tp...>&) _NOEXCEPT;
-
-template <size_t _Ip, class... _Tp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, tuple<_Tp...> >::type&&
-get(tuple<_Tp...>&&) _NOEXCEPT;
-
-template <size_t _Ip, class... _Tp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, tuple<_Tp...> >::type&&
-get(const tuple<_Tp...>&&) _NOEXCEPT;
-
-#endif //_LIBCPP_CXX03_LANG
-
-template <size_t _Ip, class _T1, class _T2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type&
-get(pair<_T1, _T2>&) _NOEXCEPT;
-
-template <size_t _Ip, class _T1, class _T2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&
-get(const pair<_T1, _T2>&) _NOEXCEPT;
-
-#ifndef _LIBCPP_CXX03_LANG
-template <size_t _Ip, class _T1, class _T2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type&&
-get(pair<_T1, _T2>&&) _NOEXCEPT;
-
-template <size_t _Ip, class _T1, class _T2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&&
-get(const pair<_T1, _T2>&&) _NOEXCEPT;
-#endif
-
-template <size_t _Ip, class _Tp, size_t _Size>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& get(array<_Tp, _Size>&) _NOEXCEPT;
-
-template <size_t _Ip, class _Tp, size_t _Size>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& get(const array<_Tp, _Size>&) _NOEXCEPT;
-
-#ifndef _LIBCPP_CXX03_LANG
-template <size_t _Ip, class _Tp, size_t _Size>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& get(array<_Tp, _Size>&&) _NOEXCEPT;
-
-template <size_t _Ip, class _Tp, size_t _Size>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&&) _NOEXCEPT;
-#endif
-
-#if _LIBCPP_STD_VER >= 20
-
-namespace ranges {
-
-template <size_t _Index, class _Iter, class _Sent, subrange_kind _Kind>
- requires((_Index == 0 && copyable<_Iter>) || _Index == 1)
-_LIBCPP_HIDE_FROM_ABI constexpr auto get(const subrange<_Iter, _Sent, _Kind>& __subrange);
-
-template <size_t _Index, class _Iter, class _Sent, subrange_kind _Kind>
- requires(_Index < 2)
-_LIBCPP_HIDE_FROM_ABI constexpr auto get(subrange<_Iter, _Sent, _Kind>&& __subrange);
-
-} // namespace ranges
-
-using ranges::get;
-
-#endif // _LIBCPP_STD_VER >= 20
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___FWD_GET_H
diff --git a/libcxx/include/__fwd/pair.h b/libcxx/include/__fwd/pair.h
index 3844014de3ad..af32628fe1e0 100644
--- a/libcxx/include/__fwd/pair.h
+++ b/libcxx/include/__fwd/pair.h
@@ -10,6 +10,8 @@
#define _LIBCPP___FWD_PAIR_H
#include <__config>
+#include <__fwd/tuple.h>
+#include <cstddef>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -20,6 +22,24 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class, class>
struct _LIBCPP_TEMPLATE_VIS pair;
+template <size_t _Ip, class _T1, class _T2>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type&
+get(pair<_T1, _T2>&) _NOEXCEPT;
+
+template <size_t _Ip, class _T1, class _T2>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&
+get(const pair<_T1, _T2>&) _NOEXCEPT;
+
+#ifndef _LIBCPP_CXX03_LANG
+template <size_t _Ip, class _T1, class _T2>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type&&
+get(pair<_T1, _T2>&&) _NOEXCEPT;
+
+template <size_t _Ip, class _T1, class _T2>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&&
+get(const pair<_T1, _T2>&&) _NOEXCEPT;
+#endif
+
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___FWD_PAIR_H
diff --git a/libcxx/include/__fwd/subrange.h b/libcxx/include/__fwd/subrange.h
index d09b9b1c5b97..60a41da23dd4 100644
--- a/libcxx/include/__fwd/subrange.h
+++ b/libcxx/include/__fwd/subrange.h
@@ -9,7 +9,10 @@
#ifndef _LIBCPP___FWD_SUBRANGE_H
#define _LIBCPP___FWD_SUBRANGE_H
+#include <__concepts/copyable.h>
#include <__config>
+#include <__iterator/concepts.h>
+#include <cstddef>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -17,8 +20,6 @@
#if _LIBCPP_STD_VER >= 20
-# include <__iterator/concepts.h>
-
_LIBCPP_BEGIN_NAMESPACE_STD
namespace ranges {
@@ -29,8 +30,18 @@ template <input_or_output_iterator _Iter, sentinel_for<_Iter> _Sent, subrange_ki
requires(_Kind == subrange_kind::sized || !sized_sentinel_for<_Sent, _Iter>)
class _LIBCPP_TEMPLATE_VIS subrange;
+template <size_t _Index, class _Iter, class _Sent, subrange_kind _Kind>
+ requires((_Index == 0 && copyable<_Iter>) || _Index == 1)
+_LIBCPP_HIDE_FROM_ABI constexpr auto get(const subrange<_Iter, _Sent, _Kind>&);
+
+template <size_t _Index, class _Iter, class _Sent, subrange_kind _Kind>
+ requires(_Index < 2)
+_LIBCPP_HIDE_FROM_ABI constexpr auto get(subrange<_Iter, _Sent, _Kind>&&);
+
} // namespace ranges
+using ranges::get;
+
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__fwd/tuple.h b/libcxx/include/__fwd/tuple.h
index 16b3fabbb995..902770c29555 100644
--- a/libcxx/include/__fwd/tuple.h
+++ b/libcxx/include/__fwd/tuple.h
@@ -10,6 +10,7 @@
#define _LIBCPP___FWD_TUPLE_H
#include <__config>
+#include <cstddef>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -17,11 +18,33 @@
_LIBCPP_BEGIN_NAMESPACE_STD
+template <size_t, class>
+struct _LIBCPP_TEMPLATE_VIS tuple_element;
+
#ifndef _LIBCPP_CXX03_LANG
template <class...>
class _LIBCPP_TEMPLATE_VIS tuple;
+template <class>
+struct _LIBCPP_TEMPLATE_VIS tuple_size;
+
+template <size_t _Ip, class... _Tp>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, tuple<_Tp...> >::type&
+get(tuple<_Tp...>&) _NOEXCEPT;
+
+template <size_t _Ip, class... _Tp>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, tuple<_Tp...> >::type&
+get(const tuple<_Tp...>&) _NOEXCEPT;
+
+template <size_t _Ip, class... _Tp>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, tuple<_Tp...> >::type&&
+get(tuple<_Tp...>&&) _NOEXCEPT;
+
+template <size_t _Ip, class... _Tp>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, tuple<_Tp...> >::type&&
+get(const tuple<_Tp...>&&) _NOEXCEPT;
+
#endif // _LIBCPP_CXX03_LANG
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__locale_dir/locale_base_api/ibm.h b/libcxx/include/__locale_dir/locale_base_api/ibm.h
index 498ea1ecfeda..c5d7f34186b1 100644
--- a/libcxx/include/__locale_dir/locale_base_api/ibm.h
+++ b/libcxx/include/__locale_dir/locale_base_api/ibm.h
@@ -100,7 +100,7 @@ inline _LIBCPP_HIDE_FROM_ABI int vasprintf(char** strp, const char* fmt, va_list
}
va_list ap_copy;
- // va_copy may not be provided by the C library in C++ 03 mode.
+ // va_copy may not be provided by the C library in C++03 mode.
#if defined(_LIBCPP_CXX03_LANG) && __has_builtin(__builtin_va_copy)
__builtin_va_copy(ap_copy, ap);
#else
diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h
index e9faada2f0b2..a28f8525f6ae 100644
--- a/libcxx/include/__memory/compressed_pair.h
+++ b/libcxx/include/__memory/compressed_pair.h
@@ -11,7 +11,6 @@
#define _LIBCPP___MEMORY_COMPRESSED_PAIR_H
#include <__config>
-#include <__fwd/get.h>
#include <__fwd/tuple.h>
#include <__tuple/tuple_indices.h>
#include <__type_traits/decay.h>
diff --git a/libcxx/include/__ranges/elements_view.h b/libcxx/include/__ranges/elements_view.h
index 989d36fbcaaa..3f35e93b3071 100644
--- a/libcxx/include/__ranges/elements_view.h
+++ b/libcxx/include/__ranges/elements_view.h
@@ -16,7 +16,6 @@
#include <__concepts/derived_from.h>
#include <__concepts/equality_comparable.h>
#include <__config>
-#include <__fwd/get.h>
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
#include <__ranges/access.h>
diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h
index a41978275b78..bb4411cf3554 100644
--- a/libcxx/include/__ranges/subrange.h
+++ b/libcxx/include/__ranges/subrange.h
@@ -17,7 +17,6 @@
#include <__concepts/derived_from.h>
#include <__concepts/different_from.h>
#include <__config>
-#include <__fwd/get.h>
#include <__fwd/subrange.h>
#include <__iterator/advance.h>
#include <__iterator/concepts.h>
diff --git a/libcxx/include/__tuple/tuple_like.h b/libcxx/include/__tuple/tuple_like.h
index dab395be616b..3548af971ce2 100644
--- a/libcxx/include/__tuple/tuple_like.h
+++ b/libcxx/include/__tuple/tuple_like.h
@@ -16,7 +16,6 @@
#include <__fwd/tuple.h>
#include <__type_traits/integral_constant.h>
#include <__type_traits/remove_cvref.h>
-#include <cstddef>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index 577e23198812..0db6f7f60213 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -14,7 +14,6 @@
#include <__concepts/different_from.h>
#include <__config>
#include <__fwd/array.h>
-#include <__fwd/get.h>
#include <__fwd/pair.h>
#include <__fwd/subrange.h>
#include <__fwd/tuple.h>
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index c80fa78a56ba..f8407419c954 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -58,7 +58,7 @@ public:
constexpr explicit duration(const Rep2& r,
typename enable_if
<
- is_convertible<Rep2, rep>::value &&
+ is_convertible<const Rep2&, rep>::value &&
(treat_as_floating_point<rep>::value ||
!treat_as_floating_point<rep>::value && !treat_as_floating_point<Rep2>::value)
>::type* = 0);
diff --git a/libcxx/include/compare b/libcxx/include/compare
index 626c7435f5fd..cc0cae8a544d 100644
--- a/libcxx/include/compare
+++ b/libcxx/include/compare
@@ -162,6 +162,7 @@ namespace std {
#endif
#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+# include <cmath>
# include <type_traits>
#endif
diff --git a/libcxx/include/istream b/libcxx/include/istream
index 1c7fb992dff4..0d05a26091a4 100644
--- a/libcxx/include/istream
+++ b/libcxx/include/istream
@@ -167,6 +167,7 @@ template <class Stream, class T>
#include <__type_traits/is_base_of.h>
#include <__utility/declval.h>
#include <__utility/forward.h>
+#include <bitset>
#include <ostream>
#include <version>
diff --git a/libcxx/include/limits b/libcxx/include/limits
index a240580c0132..c704b4dddaf8 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -436,8 +436,8 @@ protected:
};
template <class _Tp>
-class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<__remove_cv_t<_Tp> > {
- typedef __libcpp_numeric_limits<__remove_cv_t<_Tp> > __base;
+class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<_Tp> {
+ typedef __libcpp_numeric_limits<_Tp> __base;
typedef typename __base::type type;
public:
@@ -530,286 +530,13 @@ template <class _Tp>
_LIBCPP_CONSTEXPR const float_round_style numeric_limits<_Tp>::round_style;
template <class _Tp>
-class _LIBCPP_TEMPLATE_VIS numeric_limits<const _Tp> : private numeric_limits<_Tp> {
- typedef numeric_limits<_Tp> __base;
- typedef _Tp type;
-
-public:
- static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized;
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
-
- static _LIBCPP_CONSTEXPR const int digits = __base::digits;
- static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10;
- static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10;
- static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed;
- static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer;
- static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact;
- static _LIBCPP_CONSTEXPR const int radix = __base::radix;
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
+class _LIBCPP_TEMPLATE_VIS numeric_limits<const _Tp> : public numeric_limits<_Tp> {};
- static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent;
- static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10;
- static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent;
- static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10;
-
- static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
- static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
- static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
- _LIBCPP_SUPPRESS_DEPRECATED_PUSH
- static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
- static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
- _LIBCPP_SUPPRESS_DEPRECATED_POP
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
-
- static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559;
- static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded;
- static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo;
-
- static _LIBCPP_CONSTEXPR const bool traps = __base::traps;
- static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before;
- static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style;
-};
-
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_specialized;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::digits;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::digits10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::max_digits10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_signed;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_integer;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_exact;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::radix;
template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::min_exponent;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::min_exponent10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::max_exponent;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const _Tp>::max_exponent10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::has_infinity;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::has_quiet_NaN;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::has_signaling_NaN;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits<const _Tp>::has_denorm;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::has_denorm_loss;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_iec559;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_bounded;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::is_modulo;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::traps;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const _Tp>::tinyness_before;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const float_round_style numeric_limits<const _Tp>::round_style;
-
-template <class _Tp>
-class _LIBCPP_TEMPLATE_VIS numeric_limits<volatile _Tp> : private numeric_limits<_Tp> {
- typedef numeric_limits<_Tp> __base;
- typedef _Tp type;
-
-public:
- static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized;
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
-
- static _LIBCPP_CONSTEXPR const int digits = __base::digits;
- static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10;
- static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10;
- static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed;
- static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer;
- static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact;
- static _LIBCPP_CONSTEXPR const int radix = __base::radix;
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
-
- static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent;
- static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10;
- static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent;
- static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10;
-
- static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
- static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
- static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
- _LIBCPP_SUPPRESS_DEPRECATED_PUSH
- static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
- static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
- _LIBCPP_SUPPRESS_DEPRECATED_POP
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
+class _LIBCPP_TEMPLATE_VIS numeric_limits<volatile _Tp> : public numeric_limits<_Tp> {};
- static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559;
- static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded;
- static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo;
-
- static _LIBCPP_CONSTEXPR const bool traps = __base::traps;
- static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before;
- static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style;
-};
-
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_specialized;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::digits;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::digits10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::max_digits10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_signed;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_integer;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_exact;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::radix;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::min_exponent;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::min_exponent10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::max_exponent;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<volatile _Tp>::max_exponent10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::has_infinity;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::has_quiet_NaN;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::has_signaling_NaN;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits<volatile _Tp>::has_denorm;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::has_denorm_loss;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_iec559;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_bounded;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::is_modulo;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::traps;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<volatile _Tp>::tinyness_before;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const float_round_style numeric_limits<volatile _Tp>::round_style;
-
-template <class _Tp>
-class _LIBCPP_TEMPLATE_VIS numeric_limits<const volatile _Tp> : private numeric_limits<_Tp> {
- typedef numeric_limits<_Tp> __base;
- typedef _Tp type;
-
-public:
- static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized;
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
-
- static _LIBCPP_CONSTEXPR const int digits = __base::digits;
- static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10;
- static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10;
- static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed;
- static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer;
- static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact;
- static _LIBCPP_CONSTEXPR const int radix = __base::radix;
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
-
- static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent;
- static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10;
- static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent;
- static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10;
-
- static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
- static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
- static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
- _LIBCPP_SUPPRESS_DEPRECATED_PUSH
- static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
- static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
- _LIBCPP_SUPPRESS_DEPRECATED_POP
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
- _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
-
- static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559;
- static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded;
- static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo;
-
- static _LIBCPP_CONSTEXPR const bool traps = __base::traps;
- static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before;
- static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style;
-};
-
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_specialized;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::digits;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::digits10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::max_digits10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_signed;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_integer;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_exact;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::radix;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::min_exponent;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::min_exponent10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::max_exponent;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const int numeric_limits<const volatile _Tp>::max_exponent10;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::has_infinity;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::has_quiet_NaN;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::has_signaling_NaN;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits<const volatile _Tp>::has_denorm;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::has_denorm_loss;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_iec559;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_bounded;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::is_modulo;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::traps;
-template <class _Tp>
-_LIBCPP_CONSTEXPR const bool numeric_limits<const volatile _Tp>::tinyness_before;
template <class _Tp>
-_LIBCPP_CONSTEXPR const float_round_style numeric_limits<const volatile _Tp>::round_style;
+class _LIBCPP_TEMPLATE_VIS numeric_limits<const volatile _Tp> : public numeric_limits<_Tp> {};
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 9828c48e0587..f3246f808b9d 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -1790,7 +1790,6 @@ module std_private_thread_thread [system] {
}
module std_private_thread_timed_backoff_policy [system] { header "__thread/timed_backoff_policy.h" }
-module std_private_tuple_get_fwd [system] { header "__fwd/get.h" }
module std_private_tuple_make_tuple_types [system] { header "__tuple/make_tuple_types.h" }
module std_private_tuple_pair_like [system] {
header "__tuple/pair_like.h"
diff --git a/libcxx/include/ostream b/libcxx/include/ostream
index 180adda201d8..2e2607340a5d 100644
--- a/libcxx/include/ostream
+++ b/libcxx/include/ostream
@@ -1090,11 +1090,10 @@ _LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os);
# ifndef _LIBCPP_HAS_NO_UNICODE
template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI void
-__vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) {
-#if _LIBCPP_AVAILABILITY_HAS_PRINT == 0
+_LIBCPP_HIDE_FROM_ABI void __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) {
+# if _LIBCPP_AVAILABILITY_HAS_PRINT == 0
return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl);
-#else
+# else
FILE* __file = std::__get_ostream_file(__os);
if (!__file || !__print::__is_terminal(__file))
return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl);
@@ -1110,38 +1109,36 @@ __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __wr
// This is the path for the native API, start with flushing.
__os.flush();
-# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
try {
-# endif // _LIBCPP_HAS_NO_EXCEPTIONS
+# endif // _LIBCPP_HAS_NO_EXCEPTIONS
ostream::sentry __s(__os);
if (__s) {
-# ifndef _LIBCPP_WIN32API
+# ifndef _LIBCPP_WIN32API
__print::__vprint_unicode_posix(__file, __fmt, __args, __write_nl, true);
-# elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
+# elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
__print::__vprint_unicode_windows(__file, __fmt, __args, __write_nl, true);
-# else
-# error "Windows builds with wchar_t disabled are not supported."
-# endif
+# else
+# error "Windows builds with wchar_t disabled are not supported."
+# endif
}
-# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
} catch (...) {
__os.__set_badbit_and_consider_rethrow();
}
-# endif // _LIBCPP_HAS_NO_EXCEPTIONS
-#endif // _LIBCPP_AVAILABILITY_HAS_PRINT
+# endif // _LIBCPP_HAS_NO_EXCEPTIONS
+# endif // _LIBCPP_AVAILABILITY_HAS_PRINT
}
template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI inline void
-vprint_unicode(ostream& __os, string_view __fmt, format_args __args) {
+_LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(ostream& __os, string_view __fmt, format_args __args) {
std::__vprint_unicode(__os, __fmt, __args, false);
}
# endif // _LIBCPP_HAS_NO_UNICODE
template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI void
-print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
+_LIBCPP_HIDE_FROM_ABI void print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
# ifndef _LIBCPP_HAS_NO_UNICODE
if constexpr (__print::__use_unicode_execution_charset)
std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), false);
@@ -1153,8 +1150,7 @@ print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
}
template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI void
-println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
+_LIBCPP_HIDE_FROM_ABI void println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
# ifndef _LIBCPP_HAS_NO_UNICODE
// Note the wording in the Standard is inefficient. The output of
// std::format is a std::string which is then copied. This solution
diff --git a/libcxx/include/print b/libcxx/include/print
index 7f2b5bac3dcf..543a540ee4f2 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -32,6 +32,7 @@ namespace std {
*/
#include <__assert> // all public C++ headers provide the assertion handler
+#include <__availability>
#include <__concepts/same_as.h>
#include <__config>
#include <__system_error/system_error.h>
@@ -43,10 +44,6 @@ namespace std {
#include <string_view>
#include <version>
-#if __has_include(<unistd.h>)
-# include <unistd.h>
-#endif
-
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
@@ -68,7 +65,8 @@ _LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream);
// Note the function is only implemented on the Windows platform.
_LIBCPP_EXPORTED_FROM_ABI void __write_to_windows_console(FILE* __stream, wstring_view __view);
# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
-
+#elif __has_include(<unistd.h>)
+_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream);
#endif // _LIBCPP_WIN32API
#if _LIBCPP_STD_VER >= 23
@@ -195,15 +193,17 @@ inline constexpr bool __use_unicode_execution_charset = _MSVC_EXECUTION_CHARACTE
inline constexpr bool __use_unicode_execution_charset = true;
# endif
-_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal(FILE* __stream) {
+_LIBCPP_HIDE_FROM_ABI inline bool __is_terminal([[maybe_unused]] FILE* __stream) {
// The macro _LIBCPP_TESTING_PRINT_IS_TERMINAL is used to change
// the behavior in the test. This is not part of the public API.
# ifdef _LIBCPP_TESTING_PRINT_IS_TERMINAL
return _LIBCPP_TESTING_PRINT_IS_TERMINAL(__stream);
+# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0
+ return false;
# elif defined(_LIBCPP_WIN32API)
return std::__is_windows_terminal(__stream);
# elif __has_include(<unistd.h>)
- return isatty(fileno(__stream));
+ return std::__is_posix_terminal(__stream);
# else
# error "Provide a way to determine whether a FILE* is a terminal"
# endif
diff --git a/libcxx/include/ratio b/libcxx/include/ratio
index 3b11a2aa5bf6..de656f38e01d 100644
--- a/libcxx/include/ratio
+++ b/libcxx/include/ratio
@@ -289,6 +289,9 @@ private:
static const intmax_t __gcd_n1_d2 = __static_gcd<_R1::num, _R2::den>::value;
static const intmax_t __gcd_d1_n2 = __static_gcd<_R1::den, _R2::num>::value;
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+
public:
typedef typename ratio< __ll_mul<_R1::num / __gcd_n1_d2, _R2::num / __gcd_d1_n2>::value,
__ll_mul<_R2::den / __gcd_n1_d2, _R1::den / __gcd_d1_n2>::value >::type type;
@@ -312,6 +315,9 @@ private:
static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+
public:
typedef typename ratio< __ll_mul<_R1::num / __gcd_n1_n2, _R2::den / __gcd_d1_d2>::value,
__ll_mul<_R2::num / __gcd_n1_n2, _R1::den / __gcd_d1_d2>::value >::type type;
@@ -335,6 +341,9 @@ private:
static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+
public:
typedef typename ratio_multiply<
ratio<__gcd_n1_n2, _R1::den / __gcd_d1_d2>,
@@ -361,6 +370,9 @@ private:
static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+
public:
typedef typename ratio_multiply<
ratio<__gcd_n1_n2, _R1::den / __gcd_d1_d2>,
@@ -384,10 +396,16 @@ struct _LIBCPP_TEMPLATE_VIS ratio_subtract : public __ratio_subtract<_R1, _R2>::
// ratio_equal
template <class _R1, class _R2>
-struct _LIBCPP_TEMPLATE_VIS ratio_equal : _BoolConstant<(_R1::num == _R2::num && _R1::den == _R2::den)> {};
+struct _LIBCPP_TEMPLATE_VIS ratio_equal : _BoolConstant<(_R1::num == _R2::num && _R1::den == _R2::den)> {
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+};
template <class _R1, class _R2>
-struct _LIBCPP_TEMPLATE_VIS ratio_not_equal : _BoolConstant<!ratio_equal<_R1, _R2>::value> {};
+struct _LIBCPP_TEMPLATE_VIS ratio_not_equal : _BoolConstant<!ratio_equal<_R1, _R2>::value> {
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+};
// ratio_less
@@ -441,16 +459,28 @@ struct __ratio_less<_R1, _R2, -1LL, -1LL> {
};
template <class _R1, class _R2>
-struct _LIBCPP_TEMPLATE_VIS ratio_less : _BoolConstant<__ratio_less<_R1, _R2>::value> {};
+struct _LIBCPP_TEMPLATE_VIS ratio_less : _BoolConstant<__ratio_less<_R1, _R2>::value> {
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+};
template <class _R1, class _R2>
-struct _LIBCPP_TEMPLATE_VIS ratio_less_equal : _BoolConstant<!ratio_less<_R2, _R1>::value> {};
+struct _LIBCPP_TEMPLATE_VIS ratio_less_equal : _BoolConstant<!ratio_less<_R2, _R1>::value> {
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+};
template <class _R1, class _R2>
-struct _LIBCPP_TEMPLATE_VIS ratio_greater : _BoolConstant<ratio_less<_R2, _R1>::value> {};
+struct _LIBCPP_TEMPLATE_VIS ratio_greater : _BoolConstant<ratio_less<_R2, _R1>::value> {
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+};
template <class _R1, class _R2>
-struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant<!ratio_less<_R1, _R2>::value> {};
+struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant<!ratio_less<_R1, _R2>::value> {
+ static_assert(__is_ratio<_R1>::value, "[ratio.general]/2 requires R1 to be a specialisation of the ratio template");
+ static_assert(__is_ratio<_R2>::value, "[ratio.general]/2 requires R2 to be a specialisation of the ratio template");
+};
template <class _R1, class _R2>
struct __ratio_gcd {
diff --git a/libcxx/include/scoped_allocator b/libcxx/include/scoped_allocator
index 1626453a698f..eff6fbdf6edd 100644
--- a/libcxx/include/scoped_allocator
+++ b/libcxx/include/scoped_allocator
@@ -476,8 +476,8 @@ public:
}
private:
- _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor(outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT
- : base(std::move(__o), std::move(__i)) {}
+ _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor(
+ outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT : base(std::move(__o), std::move(__i)) {}
template <class _Tp, class... _Args>
_LIBCPP_HIDE_FROM_ABI void __construct(integral_constant<int, 0>, _Tp* __p, _Args&&... __args) {
diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex
index ac66b3a568bf..57f385b5435e 100644
--- a/libcxx/include/shared_mutex
+++ b/libcxx/include/shared_mutex
@@ -124,9 +124,9 @@ template <class Mutex>
#include <__config>
-# ifdef _LIBCPP_HAS_NO_THREADS
-# error "<shared_mutex> is not supported since libc++ has been configured without support for threads."
-# endif
+#ifdef _LIBCPP_HAS_NO_THREADS
+# error "<shared_mutex> is not supported since libc++ has been configured without support for threads."
+#endif
#include <__assert> // all public C++ headers provide the assertion handler
#include <__availability>
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 6c354cf0b397..8862e2ef99f8 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -48,10 +48,12 @@ public:
template <class SAlloc>
explicit basic_stringbuf(const basic_string<char_type, traits_type, SAlloc>& s,
ios_base::openmode which = ios_base::in | ios_base::out); // C++20
+ basic_stringbuf(const basic_stringbuf&) = delete;
basic_stringbuf(basic_stringbuf&& rhs);
basic_stringbuf(basic_stringbuf&& rhs, const allocator_type& a); // C++20
// [stringbuf.assign] Assign and swap:
+ basic_stringbuf& operator=(const basic_stringbuf&) = delete;
basic_stringbuf& operator=(basic_stringbuf&& rhs);
void swap(basic_stringbuf& rhs) noexcept(see below); // conditionally noexcept since C++20
@@ -119,9 +121,11 @@ public:
template <class SAlloc>
explicit basic_istringstream(const basic_string<char_type, traits_type, SAlloc>& s,
ios_base::openmode which = ios_base::in); // C++20
+ basic_istringstream(const basic_istringstream&) = delete;
basic_istringstream(basic_istringstream&& rhs);
// [istringstream.assign] Assign and swap:
+ basic_istringstream& operator=(const basic_istringstream&) = delete;
basic_istringstream& operator=(basic_istringstream&& rhs);
void swap(basic_istringstream& rhs);
@@ -178,9 +182,11 @@ public:
template <class SAlloc>
explicit basic_ostringstream(const basic_string<char_type, traits_type, SAlloc>& s,
ios_base::openmode which = ios_base::out); // C++20
+ basic_ostringstream(const basic_ostringstream&) = delete;
basic_ostringstream(basic_ostringstream&& rhs);
// [ostringstream.assign] Assign and swap:
+ basic_ostringstream& operator=(const basic_ostringstream&) = delete;
basic_ostringstream& operator=(basic_ostringstream&& rhs);
void swap(basic_ostringstream& rhs);
@@ -237,9 +243,11 @@ public:
template <class SAlloc>
explicit basic_stringstream(const basic_string<char_type, traits_type, SAlloc>& s,
ios_base::openmode which = ios_base::out | ios_base::in); // C++20
+ basic_stringstream(const basic_stringstream&) = delete;
basic_stringstream(basic_stringstream&& rhs);
// [stringstream.assign] Assign and swap:
+ basic_stringstream& operator=(const basic_stringstream&) = delete;
basic_stringstream& operator=(basic_stringstream&& rhs);
void swap(basic_stringstream& rhs);
@@ -364,6 +372,7 @@ public:
}
#endif // _LIBCPP_STD_VER >= 20
+ basic_stringbuf(const basic_stringbuf&) = delete;
basic_stringbuf(basic_stringbuf&& __rhs) : __mode_(__rhs.__mode_) { __move_init(std::move(__rhs)); }
#if _LIBCPP_STD_VER >= 20
@@ -374,6 +383,7 @@ public:
#endif
// [stringbuf.assign] Assign and swap:
+ basic_stringbuf& operator=(const basic_stringbuf&) = delete;
basic_stringbuf& operator=(basic_stringbuf&& __rhs);
void swap(basic_stringbuf& __rhs)
#if _LIBCPP_STD_VER >= 20
@@ -822,12 +832,14 @@ public:
: basic_istream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__s, __wch | ios_base::in) {}
#endif // _LIBCPP_STD_VER >= 20
+ basic_istringstream(const basic_istringstream&) = delete;
_LIBCPP_HIDE_FROM_ABI basic_istringstream(basic_istringstream&& __rhs)
: basic_istream<_CharT, _Traits>(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) {
basic_istream<_CharT, _Traits>::set_rdbuf(&__sb_);
}
// [istringstream.assign] Assign and swap:
+ basic_istringstream& operator=(const basic_istringstream&) = delete;
basic_istringstream& operator=(basic_istringstream&& __rhs) {
basic_istream<char_type, traits_type>::operator=(std::move(__rhs));
__sb_ = std::move(__rhs.__sb_);
@@ -929,12 +941,14 @@ public:
: basic_ostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__s, __wch | ios_base::out) {}
#endif // _LIBCPP_STD_VER >= 20
+ basic_ostringstream(const basic_ostringstream&) = delete;
_LIBCPP_HIDE_FROM_ABI basic_ostringstream(basic_ostringstream&& __rhs)
: basic_ostream<_CharT, _Traits>(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) {
basic_ostream<_CharT, _Traits>::set_rdbuf(&__sb_);
}
// [ostringstream.assign] Assign and swap:
+ basic_ostringstream& operator=(const basic_ostringstream&) = delete;
basic_ostringstream& operator=(basic_ostringstream&& __rhs) {
basic_ostream<char_type, traits_type>::operator=(std::move(__rhs));
__sb_ = std::move(__rhs.__sb_);
@@ -1040,12 +1054,14 @@ public:
: basic_iostream<_CharT, _Traits>(std::addressof(__sb_)), __sb_(__s, __wch) {}
#endif // _LIBCPP_STD_VER >= 20
+ basic_stringstream(const basic_stringstream&) = delete;
_LIBCPP_HIDE_FROM_ABI basic_stringstream(basic_stringstream&& __rhs)
: basic_iostream<_CharT, _Traits>(std::move(__rhs)), __sb_(std::move(__rhs.__sb_)) {
basic_istream<_CharT, _Traits>::set_rdbuf(&__sb_);
}
// [stringstream.assign] Assign and swap:
+ basic_stringstream& operator=(const basic_stringstream&) = delete;
basic_stringstream& operator=(basic_stringstream&& __rhs) {
basic_iostream<char_type, traits_type>::operator=(std::move(__rhs));
__sb_ = std::move(__rhs.__sb_);
diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf
index 577ad340d033..aad7686a435c 100644
--- a/libcxx/include/streambuf
+++ b/libcxx/include/streambuf
@@ -110,6 +110,7 @@ protected:
#include <__assert> // all public C++ headers provide the assertion handler
#include <__config>
#include <__fwd/streambuf.h>
+#include <__type_traits/is_same.h>
#include <climits>
#include <ios>
#include <iosfwd>
diff --git a/libcxx/include/string b/libcxx/include/string
index ed4fdbe6864c..530a22338604 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -938,7 +938,11 @@ public:
// Turning off ASan instrumentation for variable initialization with _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS
// does not work consistently during initialization of __r_, so we instead unpoison __str's memory manually first.
// __str's memory needs to be unpoisoned only in the case where it's a short string.
- : __r_([](basic_string &__s) -> decltype(__s.__r_)&& { if(!__s.__is_long()) __s.__annotate_delete(); return std::move(__s.__r_); }(__str)) {
+ : __r_([](basic_string& __s) -> decltype(__s.__r_)&& {
+ if (!__s.__is_long())
+ __s.__annotate_delete();
+ return std::move(__s.__r_);
+ }(__str)) {
__str.__r_.first() = __rep();
__str.__annotate_new(0);
if (!__is_long())
@@ -1918,7 +1922,7 @@ private:
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT {
- (void) __current_size;
+ (void)__current_size;
#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1);
@@ -1933,7 +1937,7 @@ private:
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT {
- (void) __n;
+ (void)__n;
#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n);
@@ -1941,7 +1945,7 @@ private:
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT {
- (void) __old_size;
+ (void)__old_size;
#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1);
@@ -1952,9 +1956,7 @@ private:
static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __align_it(size_type __s) _NOEXCEPT {
return (__s + (__a - 1)) & ~(__a - 1);
}
- enum {
- __alignment = 8
- };
+ enum { __alignment = 8 };
static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT {
if (__s < __min_cap) {
return static_cast<size_type>(__min_cap) - 1;
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index c41e9242a5a8..cb20d2743097 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -211,7 +211,6 @@ template <class... Types>
#include <__config>
#include <__functional/invoke.h>
#include <__fwd/array.h>
-#include <__fwd/get.h>
#include <__fwd/tuple.h>
#include <__memory/allocator_arg_t.h>
#include <__memory/uses_allocator.h>
diff --git a/libcxx/include/valarray b/libcxx/include/valarray
index 44adcd71ec61..88b161eccd33 100644
--- a/libcxx/include/valarray
+++ b/libcxx/include/valarray
@@ -2435,7 +2435,7 @@ template <class _Expr, __enable_if_t<__is_val_expr<_Expr>::value, int> >
inline valarray<_Tp>& valarray<_Tp>::operator*=(const _Expr& __v) {
size_t __i = 0;
for (value_type* __t = __begin_; __t != __end_; ++__t, ++__i)
- *__t *= std::__get(__v,__i);
+ *__t *= std::__get(__v, __i);
return *this;
}
@@ -2444,7 +2444,7 @@ template <class _Expr, __enable_if_t<__is_val_expr<_Expr>::value, int> >
inline valarray<_Tp>& valarray<_Tp>::operator/=(const _Expr& __v) {
size_t __i = 0;
for (value_type* __t = __begin_; __t != __end_; ++__t, ++__i)
- *__t /= std::__get(__v,__i);
+ *__t /= std::__get(__v, __i);
return *this;
}
diff --git a/libcxx/include/vector b/libcxx/include/vector
index e9615ab4c9a3..ce7df7a9f042 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -351,6 +351,7 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
#include <__type_traits/type_identity.h>
#include <__utility/exception_guard.h>
#include <__utility/forward.h>
+#include <__utility/is_pointer_in_range.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <__utility/swap.h>
@@ -831,8 +832,8 @@ private:
// For more details, see the "Using libc++" documentation page or
// the documentation for __sanitizer_annotate_contiguous_container.
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_contiguous_container(
- const void* __old_mid, const void* __new_mid) const {
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
+ __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const {
(void)__old_mid;
(void)__new_mid;
#ifndef _LIBCPP_HAS_NO_ASAN
@@ -1580,14 +1581,13 @@ template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator
vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) {
pointer __p = this->__begin_ + (__position - begin());
- // We can't compare unrelated pointers inside constant expressions
- if (!__libcpp_is_constant_evaluated() && this->__end_ < this->__end_cap()) {
+ if (this->__end_ < this->__end_cap()) {
if (__p == this->__end_) {
__construct_one_at_end(__x);
} else {
__move_range(__p, this->__end_, __p + 1);
const_pointer __xr = pointer_traits<const_pointer>::pointer_to(__x);
- if (__p <= __xr && __xr < this->__end_)
+ if (std::__is_pointer_in_range(std::__to_address(__p), std::__to_address(__end_), std::addressof(__x)))
++__xr;
*__p = *__xr;
}
diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT
index 1179c253f18c..7ff604959f4d 100644
--- a/libcxx/lib/abi/CHANGELOG.TXT
+++ b/libcxx/lib/abi/CHANGELOG.TXT
@@ -16,6 +16,14 @@ New entries should be added directly below the "Version" header.
Version 18.0
------------
+* [libc++] Moves is_terminal to the dylib
+
+ The patch moves the POSIX implementation of is_terminal to the dylib. This is
+ needed to avoid using <unistd.h> in public headers.
+
+ All platforms
+ Symbol added: _ZNSt6__ndk119__is_posix_terminalEP7__sFILE
+
* [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534)
This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI.
diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
index c2fea4d8adb4..2064f45bf8c0 100644
--- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -1495,6 +1495,7 @@
{'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist
index a60f099b5320..fec3a4505a0c 100644
--- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -1176,6 +1176,7 @@
{'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist
index a159ff522186..e52cf98dd4c4 100644
--- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -534,6 +534,7 @@
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
+{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist
index 5749a7520f9b..52a04706ddf2 100644
--- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -534,6 +534,7 @@
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
+{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP4FILE', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
index e827114f1691..bced6b2ea81b 100644
--- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -1495,6 +1495,7 @@
{'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '__ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '__ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist
index f4077adc074e..efa2189e9c92 100644
--- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -1176,6 +1176,7 @@
{'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '_ZNSt6__ndk119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt6__ndk119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist
index e3d3fcb35d84..ebda5b0dfba5 100644
--- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -1190,6 +1190,7 @@
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP7__sFILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist
index 16923301d254..6432ad3be358 100644
--- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -1188,6 +1188,7 @@
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist
index 2380ffb100de..1fe84e17b3f7 100644
--- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist
+++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist
@@ -1159,6 +1159,7 @@
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutex8try_lockEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC1Ev', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__118shared_timed_mutexC2Ev', 'type': 'FUNC'}
+{'is_defined': True, 'name': '_ZNSt3__119__is_posix_terminalEP8_IO_FILE', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base11lock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base13unlock_sharedEv', 'type': 'FUNC'}
{'is_defined': True, 'name': '_ZNSt3__119__shared_mutex_base15try_lock_sharedEv', 'type': 'FUNC'}
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt
index 0388c048dacb..0dea8cfca94a 100644
--- a/libcxx/modules/CMakeLists.txt
+++ b/libcxx/modules/CMakeLists.txt
@@ -137,6 +137,25 @@ set(LIBCXX_MODULE_STD_COMPAT_SOURCES
std.compat/cwctype.inc
)
+# TODO MODULES the CMakeLists.txt in the build directory is only temporary.
+# This allows using as available in the build directory. Once build systems
+# have proper support for the installed files this will be removed.
+if ("${LIBCXX_GENERATED_INCLUDE_DIR}" STREQUAL "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}")
+ # This typically happens when the target is not installed.
+ set(LIBCXX_CONFIGURED_INCLUDE_DIRS "${LIBCXX_GENERATED_INCLUDE_DIR}")
+else()
+ # It's important that the arch directory be included first so that its header files
+ # which interpose on the default include dir be included instead of the default ones.
+ set(LIBCXX_CONFIGURED_INCLUDE_DIRS
+ "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR};${LIBCXX_GENERATED_INCLUDE_DIR}"
+ )
+endif()
+configure_file(
+ "CMakeLists.txt.in"
+ "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt"
+ @ONLY
+)
+
set(LIBCXX_MODULE_STD_INCLUDE_SOURCES)
foreach(file ${LIBCXX_MODULE_STD_SOURCES})
set(
@@ -166,6 +185,7 @@ configure_file(
)
set(_all_modules)
+list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt")
list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.cppm")
list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.compat.cppm")
foreach(file ${LIBCXX_MODULE_STD_SOURCES} ${LIBCXX_MODULE_STD_COMPAT_SOURCES})
diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in
new file mode 100644
index 000000000000..e332d70cc163
--- /dev/null
+++ b/libcxx/modules/CMakeLists.txt.in
@@ -0,0 +1,88 @@
+cmake_minimum_required(VERSION 3.26)
+
+project(libc++-modules LANGUAGES CXX)
+
+# Enable CMake's module support
+if(CMAKE_VERSION VERSION_LESS "3.28.0")
+ if(CMAKE_VERSION VERSION_LESS "3.27.0")
+ set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "2182bf5c-ef0d-489a-91da-49dbc3090d2a")
+ else()
+ set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "aa1f7df0-828a-4fcd-9afc-2dc80491aca7")
+ endif()
+ set(CMAKE_EXPERIMENTAL_CXX_MODULE_DYNDEP 1)
+else()
+ cmake_policy(VERSION 3.28)
+endif()
+
+# Default to C++ extensions being off. Libc++'s modules support have trouble
+# with extensions right now.
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Propagates the CMake options to the modules.
+#
+# This uses the std module hard-coded since the std.compat module does not
+# depend on these flags.
+macro(compile_define_if_not condition def)
+ if (NOT ${condition})
+ target_compile_definitions(std PRIVATE ${def})
+ endif()
+endmacro()
+macro(compile_define_if condition def)
+ if (${condition})
+ target_compile_definitions(std PRIVATE ${def})
+ endif()
+endmacro()
+
+### STD
+
+add_library(std)
+target_sources(std
+ PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES
+ std.cppm
+)
+
+target_include_directories(std SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@)
+
+if (NOT @LIBCXX_ENABLE_EXCEPTIONS@)
+ target_compile_options(std PUBLIC -fno-exceptions)
+endif()
+
+target_compile_options(std
+ PUBLIC
+ -nostdinc++
+ -Wno-reserved-module-identifier
+ -Wno-reserved-user-defined-literal
+ @LIBCXX_COMPILE_FLAGS@
+)
+set_target_properties(std
+ PROPERTIES
+ OUTPUT_NAME "c++std"
+)
+
+### STD.COMPAT
+
+add_library(std.compat)
+target_sources(std.compat
+ PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES
+ std.compat.cppm
+)
+
+target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@)
+
+if (NOT @LIBCXX_ENABLE_EXCEPTIONS@)
+ target_compile_options(std.compat PUBLIC -fno-exceptions)
+endif()
+
+target_compile_options(std.compat
+ PUBLIC
+ -nostdinc++
+ -Wno-reserved-module-identifier
+ -Wno-reserved-user-defined-literal
+ -fmodule-file=std=${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/std.dir/std.pcm
+ @LIBCXX_COMPILE_FLAGS@
+)
+set_target_properties(std.compat
+ PROPERTIES
+ OUTPUT_NAME "c++std.compat"
+)
+add_dependencies(std.compat std)
diff --git a/libcxx/modules/std.compat.cppm.in b/libcxx/modules/std.compat.cppm.in
index 651d6ec7b9fe..b44dbab25c74 100644
--- a/libcxx/modules/std.compat.cppm.in
+++ b/libcxx/modules/std.compat.cppm.in
@@ -46,39 +46,48 @@ module;
#endif
// *** Headers not yet available ***
-#if __has_include(<debugging>)
-# error "please update the header information for <debugging> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<debugging>)
-#if __has_include(<flat_map>)
-# error "please update the header information for <flat_map> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<flat_map>)
-#if __has_include(<flat_set>)
-# error "please update the header information for <flat_set> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<flat_set>)
-#if __has_include(<generator>)
-# error "please update the header information for <generator> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<generator>)
-#if __has_include(<hazard_pointer>)
-# error "please update the header information for <hazard_pointer> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<hazard_pointer>)
-#if __has_include(<linalg>)
-# error "please update the header information for <linalg> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<linalg>)
-#if __has_include(<rcu>)
-# error "please update the header information for <rcu> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<rcu>)
-#if __has_include(<spanstream>)
-# error "please update the header information for <spanstream> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<spanstream>)
-#if __has_include(<stacktrace>)
-# error "please update the header information for <stacktrace> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<stacktrace>)
-#if __has_include(<stdfloat>)
-# error "please update the header information for <stdfloat> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<stdfloat>)
-#if __has_include(<text_encoding>)
-# error "please update the header information for <text_encoding> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<text_encoding>)
+//
+// This validation is mainly to catch when a new header is added but adding the
+// corresponding .inc file is forgotten. However, the check based on __has_include
+// alone doesn't work on Windows because the Windows SDK is on the include path,
+// and that means the MSVC STL headers can be found as well, tricking __has_include
+// into thinking that libc++ provides the header.
+//
+#ifndef _WIN32
+# if __has_include(<debugging>)
+# error "please update the header information for <debugging> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<debugging>)
+# if __has_include(<flat_map>)
+# error "please update the header information for <flat_map> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<flat_map>)
+# if __has_include(<flat_set>)
+# error "please update the header information for <flat_set> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<flat_set>)
+# if __has_include(<generator>)
+# error "please update the header information for <generator> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<generator>)
+# if __has_include(<hazard_pointer>)
+# error "please update the header information for <hazard_pointer> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<hazard_pointer>)
+# if __has_include(<linalg>)
+# error "please update the header information for <linalg> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<linalg>)
+# if __has_include(<rcu>)
+# error "please update the header information for <rcu> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<rcu>)
+# if __has_include(<spanstream>)
+# error "please update the header information for <spanstream> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<spanstream>)
+# if __has_include(<stacktrace>)
+# error "please update the header information for <stacktrace> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<stacktrace>)
+# if __has_include(<stdfloat>)
+# error "please update the header information for <stdfloat> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<stdfloat>)
+# if __has_include(<text_encoding>)
+# error "please update the header information for <text_encoding> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<text_encoding>)
+#endif // _WIN32
export module std.compat;
export import std;
diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in
index 6ce8e287737b..b8d89130aae9 100644
--- a/libcxx/modules/std.cppm.in
+++ b/libcxx/modules/std.cppm.in
@@ -168,39 +168,48 @@ module;
#include <version>
// *** Headers not yet available ***
-#if __has_include(<debugging>)
-# error "please update the header information for <debugging> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<debugging>)
-#if __has_include(<flat_map>)
-# error "please update the header information for <flat_map> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<flat_map>)
-#if __has_include(<flat_set>)
-# error "please update the header information for <flat_set> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<flat_set>)
-#if __has_include(<generator>)
-# error "please update the header information for <generator> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<generator>)
-#if __has_include(<hazard_pointer>)
-# error "please update the header information for <hazard_pointer> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<hazard_pointer>)
-#if __has_include(<linalg>)
-# error "please update the header information for <linalg> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<linalg>)
-#if __has_include(<rcu>)
-# error "please update the header information for <rcu> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<rcu>)
-#if __has_include(<spanstream>)
-# error "please update the header information for <spanstream> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<spanstream>)
-#if __has_include(<stacktrace>)
-# error "please update the header information for <stacktrace> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<stacktrace>)
-#if __has_include(<stdfloat>)
-# error "please update the header information for <stdfloat> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<stdfloat>)
-#if __has_include(<text_encoding>)
-# error "please update the header information for <text_encoding> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<text_encoding>)
+//
+// This validation is mainly to catch when a new header is added but adding the
+// corresponding .inc file is forgotten. However, the check based on __has_include
+// alone doesn't work on Windows because the Windows SDK is on the include path,
+// and that means the MSVC STL headers can be found as well, tricking __has_include
+// into thinking that libc++ provides the header.
+//
+#ifndef _WIN32
+# if __has_include(<debugging>)
+# error "please update the header information for <debugging> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<debugging>)
+# if __has_include(<flat_map>)
+# error "please update the header information for <flat_map> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<flat_map>)
+# if __has_include(<flat_set>)
+# error "please update the header information for <flat_set> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<flat_set>)
+# if __has_include(<generator>)
+# error "please update the header information for <generator> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<generator>)
+# if __has_include(<hazard_pointer>)
+# error "please update the header information for <hazard_pointer> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<hazard_pointer>)
+# if __has_include(<linalg>)
+# error "please update the header information for <linalg> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<linalg>)
+# if __has_include(<rcu>)
+# error "please update the header information for <rcu> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<rcu>)
+# if __has_include(<spanstream>)
+# error "please update the header information for <spanstream> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<spanstream>)
+# if __has_include(<stacktrace>)
+# error "please update the header information for <stacktrace> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<stacktrace>)
+# if __has_include(<stdfloat>)
+# error "please update the header information for <stdfloat> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<stdfloat>)
+# if __has_include(<text_encoding>)
+# error "please update the header information for <text_encoding> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<text_encoding>)
+#endif // _WIN32
export module std;
diff --git a/libcxx/modules/std/ostream.inc b/libcxx/modules/std/ostream.inc
index 8fcbfb4bdc18..0e0e2d54fe6b 100644
--- a/libcxx/modules/std/ostream.inc
+++ b/libcxx/modules/std/ostream.inc
@@ -33,8 +33,10 @@ export namespace std {
using std::println;
using std::vprint_nonunicode;
+# ifndef _LIBCPP_HAS_NO_UNICODE
using std::vprint_unicode;
-# endif // _LIBCPP_STD_VER >= 23
+# endif // _LIBCPP_HAS_NO_UNICODE
+# endif // _LIBCPP_STD_VER >= 23
#endif // _LIBCPP_HAS_NO_LOCALIZATION
} // namespace std
diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp
index 3692187a5954..8fa59fdd097b 100644
--- a/libcxx/src/print.cpp
+++ b/libcxx/src/print.cpp
@@ -8,22 +8,26 @@
#include <__config>
-#if defined(_LIBCPP_WIN32API)
+#include <cstdlib>
+#include <print>
+
+#include <__system_error/system_error.h>
-# include <cstdlib>
-# include <print>
+#include "filesystem/error.h"
+#if defined(_LIBCPP_WIN32API)
# define WIN32_LEAN_AND_MEAN
# define NOMINMAX
# include <io.h>
# include <windows.h>
-
-# include <__system_error/system_error.h>
-
-# include "filesystem/error.h"
+#elif __has_include(<unistd.h>)
+# include <unistd.h>
+#endif
_LIBCPP_BEGIN_NAMESPACE_STD
+#if defined(_LIBCPP_WIN32API)
+
_LIBCPP_EXPORTED_FROM_ABI bool __is_windows_terminal(FILE* __stream) {
// Note the Standard does this in one call, but it's unclear whether
// an invalid handle is allowed when calling GetConsoleMode.
@@ -52,6 +56,9 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst
}
# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
-_LIBCPP_END_NAMESPACE_STD
+#elif __has_include(<unistd.h>) // !_LIBCPP_WIN32API
-#endif // !_LIBCPP_WIN32API
+_LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); }
+#endif
+
+_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in
index b471c02709c0..42594461f10e 100644
--- a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in
+++ b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in
@@ -45,10 +45,10 @@ config.substitutions.append(('%{flags}',
'-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else ''
))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -lc++'
+ '-nostdlib++ -L %{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
'%{executor} --execdir %T --env DYLD_LIBRARY_PATH="%{cxx-runtime-root}:%{abi-runtime-root}:%{unwind-runtime-root}" -- '
diff --git a/libcxx/test/configs/apple-libc++-shared.cfg.in b/libcxx/test/configs/apple-libc++-shared.cfg.in
index af1926e3859b..2d0aee3ae905 100644
--- a/libcxx/test/configs/apple-libc++-shared.cfg.in
+++ b/libcxx/test/configs/apple-libc++-shared.cfg.in
@@ -13,13 +13,13 @@ config.substitutions.append(('%{flags}',
'-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else ''
))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -lc++'
+ '-nostdlib++ -L %{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib} -- '
+ '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib-dir} -- '
))
config.stdlib = 'apple-libc++'
diff --git a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in
index a39d43aec96a..8ca860300820 100644
--- a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in
+++ b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in
@@ -5,7 +5,7 @@ libc_linker_script = '@CMAKE_INSTALL_PREFIX@/lib/picolibcpp.ld'
config.substitutions.append(('%{flags}', '--sysroot=@CMAKE_INSTALL_PREFIX@'))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support'
# Disable warnings in cxx_atomic_impl.h:
# "large atomic operation may incur significant performance penalty; the
@@ -17,7 +17,7 @@ config.substitutions.append(('%{compile_flags}',
' -include picolibc.h'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib -nostdlib++ -L %{lib} -lc++ -lc++abi'
+ '-nostdlib -nostdlib++ -L %{lib-dir} -lc++ -lc++abi'
' -lc -lm -lclang_rt.builtins -lsemihost -lcrt0-semihost' +
' -T {}'.format(libc_linker_script) +
' -Wl,--defsym=__flash=0x0'
diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in
index 72b2ddf378bb..84b3270a8940 100644
--- a/libcxx/test/configs/cmake-bridge.cfg.in
+++ b/libcxx/test/configs/cmake-bridge.cfg.in
@@ -25,9 +25,9 @@ config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test')
# Add substitutions for bootstrapping the test suite configuration
import shlex
config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@')))
-config.substitutions.append(('%{libcxx}', '@LIBCXX_SOURCE_DIR@'))
-config.substitutions.append(('%{include}', '@LIBCXX_GENERATED_INCLUDE_DIR@'))
-config.substitutions.append(('%{target-include}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@'))
-config.substitutions.append(('%{lib}', '@LIBCXX_LIBRARY_DIR@'))
-config.substitutions.append(('%{module}', '@LIBCXX_GENERATED_MODULE_DIR@'))
-config.substitutions.append(('%{test-tools}', '@LIBCXX_TEST_TOOLS_PATH@'))
+config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@'))
+config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@'))
+config.substitutions.append(('%{target-include-dir}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@'))
+config.substitutions.append(('%{lib-dir}', '@LIBCXX_LIBRARY_DIR@'))
+config.substitutions.append(('%{module-dir}', '@LIBCXX_GENERATED_MODULE_DIR@'))
+config.substitutions.append(('%{test-tools-dir}', '@LIBCXX_TEST_TOOLS_PATH@'))
diff --git a/libcxx/test/configs/ibm-libc++-shared.cfg.in b/libcxx/test/configs/ibm-libc++-shared.cfg.in
index 50061e9717ce..0f86e741da10 100644
--- a/libcxx/test/configs/ibm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/ibm-libc++-shared.cfg.in
@@ -12,13 +12,13 @@ if lit.util.isAIXTriple(config.target_triple):
config.substitutions.append(('%{flags}', '-pthread'))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -D__LIBC_NO_CPP_MATH_OVERLOADS__ -I %{include} -I %{libcxx}/test/support'
+ '-nostdinc++ -D__LIBC_NO_CPP_MATH_OVERLOADS__ -I %{include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -lc++ -lc++abi -latomic -Wl,-bbigtoc'
+ '-nostdlib++ -L %{lib-dir} -lc++ -lc++abi -latomic -Wl,-bbigtoc'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env LIBPATH=%{lib} -- '
+ '%{executor} --execdir %T --env LIBPATH=%{lib-dir} -- '
))
# LIBCXX-AIX-FIXME is the feature name used to XFAIL the
diff --git a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in b/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
index 1be85279a120..d5f1cccba3b7 100644
--- a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
@@ -16,7 +16,7 @@ config.substitutions.append(('%{flags}',
'--sysroot @CMAKE_SYSROOT@' if '@CMAKE_SYSROOT@' else ''
))
-compile_flags = '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support'
+compile_flags = '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support'
if re.match(r'i686-linux-android(21|22|23)$', config.target_triple):
# 32-bit x86 Android has a bug where the stack is sometimes misaligned.
# The problem appears limited to versions before Android N (API 24) and only
@@ -31,7 +31,7 @@ config.substitutions.append(('%{compile_flags}', compile_flags))
# libc++_shared.so because older Bionic dynamic loaders don't support rpath
# lookup.
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -lc++_shared'
+ '-nostdlib++ -L %{lib-dir} -lc++_shared'
))
config.substitutions.append(('%{exec}',
'%{executor}' +
diff --git a/libcxx/test/configs/llvm-libc++-mingw.cfg.in b/libcxx/test/configs/llvm-libc++-mingw.cfg.in
index eb77f1142db6..8a0cc96763a1 100644
--- a/libcxx/test/configs/llvm-libc++-mingw.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-mingw.cfg.in
@@ -5,13 +5,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
config.substitutions.append(('%{flags}', ''))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -lc++'
+ '-nostdlib++ -L %{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{lib} -- '
+ '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in
index 50d28eb67c77..cca88c8151c6 100644
--- a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in
@@ -5,13 +5,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
config.substitutions.append(('%{flags}', '--driver-mode=g++'))
config.substitutions.append(('%{compile_flags}',
- '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include
+ '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib -L %{lib} -lc++ -l' + config.cxx_lib
+ '-nostdlib -L %{lib-dir} -lc++ -l' + config.cxx_lib
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{lib} -- '
+ '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in
index a75e90b19889..7d107c8ffc16 100644
--- a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in
@@ -6,10 +6,10 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
config.substitutions.append(('%{flags}', '-pthread'))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++ -lm'
+ '-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++ -lm'
))
config.substitutions.append(('%{exec}',
'%{executor} --execdir %T -- '
diff --git a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in
index 4c88af3dbb6e..a8ad920897e6 100644
--- a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in
@@ -6,13 +6,13 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
config.substitutions.append(('%{flags}', '--driver-mode=g++'))
config.substitutions.append(('%{compile_flags}',
- '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX -D_HAS_EXCEPTIONS=0' + config.dbg_include
+ '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX -D_HAS_EXCEPTIONS=0' + config.dbg_include
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib -L %{lib} -lc++ -l' + config.cxx_lib
+ '-nostdlib -L %{lib-dir} -lc++ -l' + config.cxx_lib
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{lib} -- '
+ '%{executor} --execdir %T --prepend_env PATH=%{lib-dir} -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in
index 143b3b3feae1..5199f64f551b 100644
--- a/libcxx/test/configs/llvm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in
@@ -7,10 +7,10 @@ config.substitutions.append(('%{flags}',
'-pthread' + (' -isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '')
))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++'
+ '-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
'%{executor} --execdir %T -- '
diff --git a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in
index 4baaad765a92..7c700bff32b9 100644
--- a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in
@@ -5,10 +5,10 @@ lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
config.substitutions.append(('%{flags}', '--driver-mode=g++'))
config.substitutions.append(('%{compile_flags}',
- '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include
+ '-fms-runtime-lib=' + config.fms_runtime_lib + ' -nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_STDIO_ISO_WIDE_SPECIFIERS -DNOMINMAX' + config.dbg_include
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib -L %{lib} -llibc++ -l' + config.cxx_lib
+ '-nostdlib -L %{lib-dir} -llibc++ -l' + config.cxx_lib
))
config.substitutions.append(('%{exec}',
'%{executor} --execdir %T -- '
diff --git a/libcxx/test/configs/llvm-libc++-static.cfg.in b/libcxx/test/configs/llvm-libc++-static.cfg.in
index e866d4f52a06..097cc4d49e4b 100644
--- a/libcxx/test/configs/llvm-libc++-static.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-static.cfg.in
@@ -7,10 +7,10 @@ config.substitutions.append(('%{flags}',
'-pthread' + (' -isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '')
))
config.substitutions.append(('%{compile_flags}',
- '-nostdinc++ -I %{include} -I %{target-include} -I %{libcxx}/test/support'
+ '-nostdinc++ -I %{include-dir} -I %{target-include-dir} -I %{libcxx-dir}/test/support'
))
config.substitutions.append(('%{link_flags}',
- '-nostdlib++ -L %{lib} -lc++ -lc++abi'
+ '-nostdlib++ -L %{lib-dir} -lc++ -lc++abi'
))
config.substitutions.append(('%{exec}',
'%{executor} --execdir %T -- '
diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
index a4e1c3c29c93..bd883aa0c145 100644
--- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
+++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
@@ -10,7 +10,7 @@
# is required for users to be able to include any public header and then override
# the function using a strong definition.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py
index 26ef2077449b..e3593eefad2f 100644
--- a/libcxx/test/libcxx/clang_modules_include.gen.py
+++ b/libcxx/test/libcxx/clang_modules_include.gen.py
@@ -10,7 +10,7 @@
# This is important notably because the LLDB data formatters use
# libc++ headers with modules enabled.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py
index b2f1a171507d..19b6a999df60 100644
--- a/libcxx/test/libcxx/clang_tidy.gen.py
+++ b/libcxx/test/libcxx/clang_tidy.gen.py
@@ -8,7 +8,7 @@
# Run our custom libc++ clang-tidy checks on all public headers.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
@@ -27,8 +27,8 @@ for header in public_headers:
{lit_header_restrictions.get(header, '')}
// TODO: run clang-tidy with modules enabled once they are supported
-// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules
-// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules
+// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules
+// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules
#include <{header}>
""")
diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py
index 85055dfc703d..2fcfa50db693 100644
--- a/libcxx/test/libcxx/double_include.gen.py
+++ b/libcxx/test/libcxx/double_include.gen.py
@@ -8,7 +8,7 @@
# Test that we can include each header in two TU's and link them together.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py
index cdbc5b34b515..faaa4cf8710c 100644
--- a/libcxx/test/libcxx/header_inclusions.gen.py
+++ b/libcxx/test/libcxx/header_inclusions.gen.py
@@ -9,7 +9,7 @@
# Test that all headers include all the other headers they're supposed to, as
# prescribed by the Standard.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/headers_in_modulemap.sh.py b/libcxx/test/libcxx/headers_in_modulemap.sh.py
index fe007f02d34e..237b006115cc 100644
--- a/libcxx/test/libcxx/headers_in_modulemap.sh.py
+++ b/libcxx/test/libcxx/headers_in_modulemap.sh.py
@@ -1,4 +1,4 @@
-# RUN: %{python} %s %{libcxx}/utils %{include}
+# RUN: %{python} %s %{libcxx-dir}/utils %{include-dir}
import sys
diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py
index 47439b08fe51..7d9519d58955 100644
--- a/libcxx/test/libcxx/libcpp_version.gen.py
+++ b/libcxx/test/libcxx/libcpp_version.gen.py
@@ -8,7 +8,7 @@
# Test that all headers define the _LIBCPP_VERSION macro.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py
index a9a05a0cd74e..fc23985caf30 100644
--- a/libcxx/test/libcxx/module_std.gen.py
+++ b/libcxx/test/libcxx/module_std.gen.py
@@ -16,7 +16,7 @@
# to be one monolitic test. Since the test doesn't take very long it's
# not a huge issue.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
@@ -25,9 +25,9 @@ from libcxx.test.modules import module_test_generator
generator = module_test_generator(
"%t",
- "%{module}",
+ "%{module-dir}",
"%{clang-tidy}",
- "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin",
+ "%{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin",
"%{cxx}",
"%{flags} %{compile_flags}",
"std",
diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py
index 270d131779e5..000aa2998612 100644
--- a/libcxx/test/libcxx/module_std_compat.gen.py
+++ b/libcxx/test/libcxx/module_std_compat.gen.py
@@ -16,7 +16,7 @@
# to be one monolitic test. Since the test doesn't take very long it's
# not a huge issue.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
@@ -26,9 +26,9 @@ from libcxx.test.modules import module_test_generator
generator = module_test_generator(
"%t",
- "%{module}",
+ "%{module-dir}",
"%{clang-tidy}",
- "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin",
+ "%{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin",
"%{cxx}",
"%{flags} %{compile_flags}",
"std.compat",
diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py
index a5e733d2b48a..dd8006df9376 100644
--- a/libcxx/test/libcxx/no_assert_include.gen.py
+++ b/libcxx/test/libcxx/no_assert_include.gen.py
@@ -9,7 +9,7 @@
# Ensure that none of the standard C++ headers implicitly include cassert or
# assert.h (because assert() is implemented as a macro).
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py
index 5b75dba544ef..0d935a18adde 100644
--- a/libcxx/test/libcxx/system_reserved_names.gen.py
+++ b/libcxx/test/libcxx/system_reserved_names.gen.py
@@ -10,7 +10,7 @@
# alphabetic macros. Also ensure that we don't swallow the definition of user
# provided macros (in other words, ensure that we push/pop correctly everywhere).
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py
index 43f92d9c0cfc..28f223c422a9 100644
--- a/libcxx/test/libcxx/transitive_includes.gen.py
+++ b/libcxx/test/libcxx/transitive_includes.gen.py
@@ -16,7 +16,7 @@
# forever, however we do try to group removals for a couple of releases
# to avoid breaking users at every release.
-# RUN: %{python} %s %{libcxx}/utils
+# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
@@ -48,7 +48,7 @@ if regenerate_expected_results:
all_traces.append(f'%t/trace-includes.{normalized_header}.txt')
print(f"""\
-// RUN{BLOCKLIT}: %{{python}} %{{libcxx}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
+// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
""")
else:
@@ -83,8 +83,8 @@ else:
// RUN{BLOCKLIT}: mkdir %t
// RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
-// RUN{BLOCKLIT}: %{{python}} %{{libcxx}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
-// RUN{BLOCKLIT}: cat %{{libcxx}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
+// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
+// RUN{BLOCKLIT}: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
// RUN{BLOCKLIT}: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv
#include <{header}>
""")
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index d4c22023db7a..c65c65449f4e 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -393,6 +393,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream concepts
istream cstddef
istream iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index 5eb839a3aa7a..e9eae06807e6 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -396,6 +396,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream concepts
istream cstddef
istream iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 7b372fb37dcd..08db2a0ea6b1 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -398,6 +398,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream concepts
istream cstddef
istream iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 7b372fb37dcd..08db2a0ea6b1 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -398,6 +398,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream concepts
istream cstddef
istream iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 9b4915a468d1..69805d6af3e9 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -403,6 +403,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream concepts
istream cstddef
istream iosfwd
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 7c7099d176f1..f9957897c743 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -105,7 +105,6 @@ codecvt string
codecvt tuple
codecvt typeinfo
codecvt version
-compare cmath
compare cstddef
compare cstdint
compare limits
@@ -283,6 +282,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream cstddef
istream ostream
istream version
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 7c7099d176f1..f9957897c743 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -105,7 +105,6 @@ codecvt string
codecvt tuple
codecvt typeinfo
codecvt version
-compare cmath
compare cstddef
compare cstdint
compare limits
@@ -283,6 +282,7 @@ iostream istream
iostream ostream
iostream streambuf
iostream version
+istream bitset
istream cstddef
istream ostream
istream version
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
index 4f4f5839361e..508b01a7bcea 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
@@ -56,7 +56,7 @@ void test() {
e.transform_error(return_unexpected<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
// expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
// expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected<T, E> with a E that is not a valid argument for unexpected<E> is ill-formed}}
- // expected-error-re@*:* {{call to deleted constructor of {{.*}}}}
+ // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}}
// expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}}
e.transform_error(return_no_object<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
diff --git a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp
index 6c84e0dfff2b..3e2e080368f4 100644
--- a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp
+++ b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp
@@ -13,17 +13,17 @@
// Make sure we install the libc++ headers in the right location.
//
-// RUN: stat "%{include}/__config"
+// RUN: stat "%{include-dir}/__config"
// Make sure we install libc++.1.dylib and libc++experimental.a in the right location.
//
-// RUN: stat "%{lib}/libc++.1.dylib"
-// RUN: stat "%{lib}/libc++experimental.a"
+// RUN: stat "%{lib-dir}/libc++.1.dylib"
+// RUN: stat "%{lib-dir}/libc++experimental.a"
// Make sure we install a symlink from libc++.dylib to libc++.1.dylib.
//
-// RUN: stat "%{lib}/libc++.dylib"
-// RUN: readlink "%{lib}/libc++.dylib" | grep "libc++.1.dylib"
+// RUN: stat "%{lib-dir}/libc++.dylib"
+// RUN: readlink "%{lib-dir}/libc++.dylib" | grep "libc++.1.dylib"
// Make sure the install_name is /usr/lib.
//
@@ -34,15 +34,15 @@
//
// TODO: We currently don't do that correctly in the CMake build.
//
-// XRUNX: otool -L "%{lib}/libc++.1.dylib" | grep '/usr/lib/libc++.1.dylib'
-// XRUNX: ! otool -l "%{lib}/libc++.1.dylib" | grep -E "LC_RPATH|@loader_path|@rpath"
+// XRUNX: otool -L "%{lib-dir}/libc++.1.dylib" | grep '/usr/lib/libc++.1.dylib'
+// XRUNX: ! otool -l "%{lib-dir}/libc++.1.dylib" | grep -E "LC_RPATH|@loader_path|@rpath"
// Make sure the compatibility_version of libc++ is 1.0.0.
// Failure to respect this can result in applications not being able to find libc++
// when they are loaded by dyld, if the compatibility version was bumped.
//
-// RUN: otool -L "%{lib}/libc++.1.dylib" | grep "libc++.1.dylib" | grep "compatibility version 1.0.0"
+// RUN: otool -L "%{lib-dir}/libc++.1.dylib" | grep "libc++.1.dylib" | grep "compatibility version 1.0.0"
// Make sure we use the libdispatch backend for the PSTL.
//
-// RUN: grep "%{include}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH'
+// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH'
diff --git a/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp b/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp
index 447454ee229c..7ed14921b106 100644
--- a/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp
+++ b/libcxx/test/libcxx/vendor/clang-cl/static-lib-exports.sh.cpp
@@ -11,6 +11,6 @@
// This file checks that the built static libraries don't contain dllexport
// directives in clang-cl builds.
-// RUN: llvm-readobj --coff-directives "%{lib}/libc++.lib" | not grep -i "export:" > /dev/null
+// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++.lib" | not grep -i "export:" > /dev/null
-// RUN: llvm-readobj --coff-directives "%{lib}/libc++experimental.lib" | not grep -i "export:" > /dev/null
+// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++experimental.lib" | not grep -i "export:" > /dev/null
diff --git a/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp b/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp
index 8f29f5ad051d..e20269f91e48 100644
--- a/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp
+++ b/libcxx/test/libcxx/vendor/mingw/static-lib-exports.sh.cpp
@@ -11,6 +11,6 @@
// This file checks that the built static libraries don't contain dllexport
// directives in MinGW builds.
-// RUN: llvm-readobj --coff-directives "%{lib}/libc++.a" | not grep -i "export:" > /dev/null
+// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++.a" | not grep -i "export:" > /dev/null
-// RUN: llvm-readobj --coff-directives "%{lib}/libc++experimental.a" | not grep -i "export:" > /dev/null
+// RUN: llvm-readobj --coff-directives "%{lib-dir}/libc++experimental.a" | not grep -i "export:" > /dev/null
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp
index b016babbb948..69d84f640d54 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp
@@ -17,7 +17,7 @@
//
// If the library was built in c++23 mode, this test would succeed.
//
-// Older CMake passed -std:c++latest to set C++ 20 mode on clang-cl, which
+// Older CMake passed -std:c++latest to set C++20 mode on clang-cl, which
// hid this issue. With newer CMake versions, it passes -std:c++20 which
// makes this fail.
//
diff --git a/libcxx/test/std/input.output/string.streams/istringstream/types.compile.pass.cpp b/libcxx/test/std/input.output/string.streams/istringstream/types.compile.pass.cpp
new file mode 100644
index 000000000000..7be260ff79fa
--- /dev/null
+++ b/libcxx/test/std/input.output/string.streams/istringstream/types.compile.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <sstream>
+
+// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
+// class basic_istringstream
+// : public basic_istream<charT, traits>
+// {
+// public:
+// typedef charT char_type;
+// typedef traits traits_type;
+// typedef typename traits_type::int_type int_type;
+// typedef typename traits_type::pos_type pos_type;
+// typedef typename traits_type::off_type off_type;
+// typedef Allocator allocator_type;
+//
+// basic_istringstream(const basic_istringstream&) = delete;
+// basic_istringstream& operator=(const basic_istringstream&) = delete;
+//
+// basic_istringstream(basic_istringstream&& rhs);
+// basic_istringstream& operator=(basic_istringstream&& rhs);
+
+#include <sstream>
+#include <type_traits>
+
+#include "test_macros.h"
+
+// Types
+
+static_assert(std::is_base_of<std::basic_istream<char>, std::basic_istringstream<char> >::value, "");
+static_assert(std::is_same<std::basic_istringstream<char>::char_type, char>::value, "");
+static_assert(std::is_same<std::basic_istringstream<char>::traits_type, std::char_traits<char> >::value, "");
+static_assert(std::is_same<std::basic_istringstream<char>::int_type, std::char_traits<char>::int_type>::value, "");
+static_assert(std::is_same<std::basic_istringstream<char>::pos_type, std::char_traits<char>::pos_type>::value, "");
+static_assert(std::is_same<std::basic_istringstream<char>::off_type, std::char_traits<char>::off_type>::value, "");
+static_assert(std::is_same<std::basic_istringstream<char>::allocator_type, std::allocator<char> >::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_base_of<std::basic_istream<wchar_t>, std::basic_istringstream<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_istringstream<wchar_t>::char_type, wchar_t>::value, "");
+static_assert(std::is_same<std::basic_istringstream<wchar_t>::traits_type, std::char_traits<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_istringstream<wchar_t>::int_type, std::char_traits<wchar_t>::int_type>::value,
+ "");
+static_assert(std::is_same<std::basic_istringstream<wchar_t>::pos_type, std::char_traits<wchar_t>::pos_type>::value,
+ "");
+static_assert(std::is_same<std::basic_istringstream<wchar_t>::off_type, std::char_traits<wchar_t>::off_type>::value,
+ "");
+static_assert(std::is_same<std::basic_istringstream<wchar_t>::allocator_type, std::allocator<wchar_t> >::value, "");
+#endif
+
+// Copy properties
+
+static_assert(!std::is_copy_constructible<std::istringstream>::value, "");
+static_assert(!std::is_copy_assignable<std::istringstream>::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(!std::is_copy_constructible<std::wistringstream>::value, "");
+static_assert(!std::is_copy_assignable<std::wistringstream>::value, "");
+#endif
+
+// Move properties
+
+static_assert(std::is_move_constructible<std::istringstream>::value, "");
+static_assert(std::is_move_assignable<std::istringstream>::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_move_constructible<std::wistringstream>::value, "");
+static_assert(std::is_move_assignable<std::wistringstream>::value, "");
+#endif
diff --git a/libcxx/test/std/input.output/string.streams/istringstream/types.pass.cpp b/libcxx/test/std/input.output/string.streams/istringstream/types.pass.cpp
deleted file mode 100644
index da1b1de7358d..000000000000
--- a/libcxx/test/std/input.output/string.streams/istringstream/types.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <sstream>
-
-// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
-// class basic_istringstream
-// : public basic_istream<charT, traits>
-// {
-// public:
-// typedef charT char_type;
-// typedef traits traits_type;
-// typedef typename traits_type::int_type int_type;
-// typedef typename traits_type::pos_type pos_type;
-// typedef typename traits_type::off_type off_type;
-// typedef Allocator allocator_type;
-
-#include <sstream>
-#include <type_traits>
-
-#include "test_macros.h"
-
-int main(int, char**)
-{
- static_assert((std::is_base_of<std::basic_istream<char>, std::basic_istringstream<char> >::value), "");
- static_assert((std::is_same<std::basic_istringstream<char>::char_type, char>::value), "");
- static_assert((std::is_same<std::basic_istringstream<char>::traits_type, std::char_traits<char> >::value), "");
- static_assert((std::is_same<std::basic_istringstream<char>::int_type, std::char_traits<char>::int_type>::value), "");
- static_assert((std::is_same<std::basic_istringstream<char>::pos_type, std::char_traits<char>::pos_type>::value), "");
- static_assert((std::is_same<std::basic_istringstream<char>::off_type, std::char_traits<char>::off_type>::value), "");
- static_assert((std::is_same<std::basic_istringstream<char>::allocator_type, std::allocator<char> >::value), "");
-
- return 0;
-}
diff --git a/libcxx/test/std/input.output/string.streams/ostringstream/types.compile.pass.cpp b/libcxx/test/std/input.output/string.streams/ostringstream/types.compile.pass.cpp
new file mode 100644
index 000000000000..df712bb0d097
--- /dev/null
+++ b/libcxx/test/std/input.output/string.streams/ostringstream/types.compile.pass.cpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <sstream>
+
+// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
+// class basic_ostringstream
+// : public basic_ostream<charT, traits>
+// {
+// public:
+// typedef charT char_type;
+// typedef traits traits_type;
+// typedef typename traits_type::int_type int_type;
+// typedef typename traits_type::pos_type pos_type;
+// typedef typename traits_type::off_type off_type;
+// typedef Allocator allocator_type;
+//
+// basic_ostringstream(const basic_ostringstream&) = delete;
+// basic_ostringstream& operator=(const basic_ostringstream&) = delete;
+//
+// basic_ostringstream(basic_ostringstream&& rhs);
+// basic_ostringstream& operator=(basic_ostringstream&& rhs);
+
+#include <sstream>
+#include <type_traits>
+
+#include "test_macros.h"
+
+// Types
+
+static_assert(std::is_base_of<std::basic_ostream<char>, std::basic_ostringstream<char> >::value, "");
+static_assert(std::is_same<std::basic_ostringstream<char>::char_type, char>::value, "");
+static_assert(std::is_same<std::basic_ostringstream<char>::traits_type, std::char_traits<char> >::value, "");
+static_assert(std::is_same<std::basic_ostringstream<char>::int_type, std::char_traits<char>::int_type>::value, "");
+static_assert(std::is_same<std::basic_ostringstream<char>::pos_type, std::char_traits<char>::pos_type>::value, "");
+static_assert(std::is_same<std::basic_ostringstream<char>::off_type, std::char_traits<char>::off_type>::value, "");
+static_assert(std::is_same<std::basic_ostringstream<char>::allocator_type, std::allocator<char> >::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_base_of<std::basic_ostream<wchar_t>, std::basic_ostringstream<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_ostringstream<wchar_t>::char_type, wchar_t>::value, "");
+static_assert(std::is_same<std::basic_ostringstream<wchar_t>::traits_type, std::char_traits<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_ostringstream<wchar_t>::int_type, std::char_traits<wchar_t>::int_type>::value,
+ "");
+static_assert(std::is_same<std::basic_ostringstream<wchar_t>::pos_type, std::char_traits<wchar_t>::pos_type>::value,
+ "");
+static_assert(std::is_same<std::basic_ostringstream<wchar_t>::off_type, std::char_traits<wchar_t>::off_type>::value,
+ "");
+static_assert(std::is_same<std::basic_ostringstream<wchar_t>::allocator_type, std::allocator<wchar_t> >::value, "");
+#endif
+
+// Copy properties
+
+static_assert(!std::is_copy_constructible<std::ostringstream>::value, "");
+static_assert(!std::is_copy_assignable<std::ostringstream>::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(!std::is_copy_constructible<std::wostringstream>::value, "");
+static_assert(!std::is_copy_assignable<std::wostringstream>::value, "");
+#endif
+
+// Move properties
+
+static_assert(std::is_move_constructible<std::ostringstream>::value, "");
+static_assert(std::is_move_assignable<std::ostringstream>::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_move_constructible<std::wostringstream>::value, "");
+static_assert(std::is_move_assignable<std::wostringstream>::value, "");
+#endif
diff --git a/libcxx/test/std/input.output/string.streams/ostringstream/types.pass.cpp b/libcxx/test/std/input.output/string.streams/ostringstream/types.pass.cpp
deleted file mode 100644
index b29419279a37..000000000000
--- a/libcxx/test/std/input.output/string.streams/ostringstream/types.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <sstream>
-
-// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
-// class basic_ostringstream
-// : public basic_ostream<charT, traits>
-// {
-// public:
-// typedef charT char_type;
-// typedef traits traits_type;
-// typedef typename traits_type::int_type int_type;
-// typedef typename traits_type::pos_type pos_type;
-// typedef typename traits_type::off_type off_type;
-// typedef Allocator allocator_type;
-
-#include <sstream>
-#include <type_traits>
-
-#include "test_macros.h"
-
-int main(int, char**)
-{
- static_assert((std::is_base_of<std::basic_ostream<char>, std::basic_ostringstream<char> >::value), "");
- static_assert((std::is_same<std::basic_ostringstream<char>::char_type, char>::value), "");
- static_assert((std::is_same<std::basic_ostringstream<char>::traits_type, std::char_traits<char> >::value), "");
- static_assert((std::is_same<std::basic_ostringstream<char>::int_type, std::char_traits<char>::int_type>::value), "");
- static_assert((std::is_same<std::basic_ostringstream<char>::pos_type, std::char_traits<char>::pos_type>::value), "");
- static_assert((std::is_same<std::basic_ostringstream<char>::off_type, std::char_traits<char>::off_type>::value), "");
- static_assert((std::is_same<std::basic_ostringstream<char>::allocator_type, std::allocator<char> >::value), "");
-
- return 0;
-}
diff --git a/libcxx/test/std/input.output/string.streams/stringbuf/types.compile.pass.cpp b/libcxx/test/std/input.output/string.streams/stringbuf/types.compile.pass.cpp
new file mode 100644
index 000000000000..7743d3a73b7a
--- /dev/null
+++ b/libcxx/test/std/input.output/string.streams/stringbuf/types.compile.pass.cpp
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <sstream>
+
+// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
+// class basic_stringbuf
+// : public basic_streambuf<charT, traits>
+// {
+// public:
+// typedef charT char_type;
+// typedef traits traits_type;
+// typedef typename traits_type::int_type int_type;
+// typedef typename traits_type::pos_type pos_type;
+// typedef typename traits_type::off_type off_type;
+// typedef Allocator allocator_type;
+//
+// basic_stringbuf(const basic_stringbuf&) = delete;
+// basic_stringbuf& operator=(const basic_stringbuf&) = delete;
+//
+// basic_stringbuf(basic_stringbuf&& rhs);
+// basic_stringbuf& operator=(basic_stringbuf&& rhs);
+
+#include <sstream>
+#include <type_traits>
+
+#include "test_macros.h"
+
+static_assert(std::is_base_of<std::basic_streambuf<char>, std::basic_stringbuf<char> >::value, "");
+static_assert(std::is_same<std::basic_stringbuf<char>::char_type, char>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<char>::traits_type, std::char_traits<char> >::value, "");
+static_assert(std::is_same<std::basic_stringbuf<char>::int_type, std::char_traits<char>::int_type>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<char>::pos_type, std::char_traits<char>::pos_type>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<char>::off_type, std::char_traits<char>::off_type>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<char>::allocator_type, std::allocator<char> >::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_base_of<std::basic_streambuf<wchar_t>, std::basic_stringbuf<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_stringbuf<wchar_t>::char_type, wchar_t>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<wchar_t>::traits_type, std::char_traits<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_stringbuf<wchar_t>::int_type, std::char_traits<wchar_t>::int_type>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<wchar_t>::pos_type, std::char_traits<wchar_t>::pos_type>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<wchar_t>::off_type, std::char_traits<wchar_t>::off_type>::value, "");
+static_assert(std::is_same<std::basic_stringbuf<wchar_t>::allocator_type, std::allocator<wchar_t> >::value, "");
+#endif
+
+// Copy properties
+
+static_assert(!std::is_copy_constructible<std::basic_stringbuf<char> >::value, "");
+static_assert(!std::is_copy_assignable<std::basic_stringbuf<char> >::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(!std::is_copy_constructible<std::basic_stringbuf<wchar_t> >::value, "");
+static_assert(!std::is_copy_assignable<std::basic_stringbuf<wchar_t> >::value, "");
+#endif
+
+// Move properties
+
+static_assert(std::is_move_constructible<std::basic_stringbuf<char> >::value, "");
+static_assert(std::is_move_assignable<std::basic_stringbuf<char> >::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_move_constructible<std::basic_stringbuf<wchar_t> >::value, "");
+static_assert(std::is_move_assignable<std::basic_stringbuf<wchar_t> >::value, "");
+#endif
diff --git a/libcxx/test/std/input.output/string.streams/stringbuf/types.pass.cpp b/libcxx/test/std/input.output/string.streams/stringbuf/types.pass.cpp
deleted file mode 100644
index cec616947a04..000000000000
--- a/libcxx/test/std/input.output/string.streams/stringbuf/types.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <sstream>
-
-// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
-// class basic_stringbuf
-// : public basic_streambuf<charT, traits>
-// {
-// public:
-// typedef charT char_type;
-// typedef traits traits_type;
-// typedef typename traits_type::int_type int_type;
-// typedef typename traits_type::pos_type pos_type;
-// typedef typename traits_type::off_type off_type;
-// typedef Allocator allocator_type;
-
-#include <sstream>
-#include <type_traits>
-
-#include "test_macros.h"
-
-int main(int, char**)
-{
- static_assert((std::is_base_of<std::basic_streambuf<char>, std::basic_stringbuf<char> >::value), "");
- static_assert((std::is_same<std::basic_stringbuf<char>::char_type, char>::value), "");
- static_assert((std::is_same<std::basic_stringbuf<char>::traits_type, std::char_traits<char> >::value), "");
- static_assert((std::is_same<std::basic_stringbuf<char>::int_type, std::char_traits<char>::int_type>::value), "");
- static_assert((std::is_same<std::basic_stringbuf<char>::pos_type, std::char_traits<char>::pos_type>::value), "");
- static_assert((std::is_same<std::basic_stringbuf<char>::off_type, std::char_traits<char>::off_type>::value), "");
- static_assert((std::is_same<std::basic_stringbuf<char>::allocator_type, std::allocator<char> >::value), "");
-
- return 0;
-}
diff --git a/libcxx/test/std/input.output/string.streams/stringstream/types.compile.pass.cpp b/libcxx/test/std/input.output/string.streams/stringstream/types.compile.pass.cpp
new file mode 100644
index 000000000000..20486c85102a
--- /dev/null
+++ b/libcxx/test/std/input.output/string.streams/stringstream/types.compile.pass.cpp
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <sstream>
+
+// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
+// class basic_stringstream
+// : public basic_iostream<charT, traits>
+// {
+// public:
+// typedef charT char_type;
+// typedef traits traits_type;
+// typedef typename traits_type::int_type int_type;
+// typedef typename traits_type::pos_type pos_type;
+// typedef typename traits_type::off_type off_type;
+// typedef Allocator allocator_type;
+//
+// basic_stringstream(const basic_stringstream&) = delete;
+// basic_stringstream& operator=(const basic_stringstream&) = delete;
+//
+// basic_stringstream(basic_stringstream&& rhs);
+// basic_stringstream& operator=(basic_stringstream&& rhs);
+
+#include <sstream>
+#include <type_traits>
+
+#include "test_macros.h"
+
+// Types
+
+static_assert(std::is_base_of<std::basic_iostream<char>, std::basic_stringstream<char> >::value, "");
+static_assert(std::is_same<std::basic_stringstream<char>::char_type, char>::value, "");
+static_assert(std::is_same<std::basic_stringstream<char>::traits_type, std::char_traits<char> >::value, "");
+static_assert(std::is_same<std::basic_stringstream<char>::int_type, std::char_traits<char>::int_type>::value, "");
+static_assert(std::is_same<std::basic_stringstream<char>::pos_type, std::char_traits<char>::pos_type>::value, "");
+static_assert(std::is_same<std::basic_stringstream<char>::off_type, std::char_traits<char>::off_type>::value, "");
+static_assert(std::is_same<std::basic_stringstream<char>::allocator_type, std::allocator<char> >::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_base_of<std::basic_iostream<wchar_t>, std::basic_stringstream<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_stringstream<wchar_t>::char_type, wchar_t>::value, "");
+static_assert(std::is_same<std::basic_stringstream<wchar_t>::traits_type, std::char_traits<wchar_t> >::value, "");
+static_assert(std::is_same<std::basic_stringstream<wchar_t>::int_type, std::char_traits<wchar_t>::int_type>::value, "");
+static_assert(std::is_same<std::basic_stringstream<wchar_t>::pos_type, std::char_traits<wchar_t>::pos_type>::value, "");
+static_assert(std::is_same<std::basic_stringstream<wchar_t>::off_type, std::char_traits<wchar_t>::off_type>::value, "");
+static_assert(std::is_same<std::basic_stringstream<wchar_t>::allocator_type, std::allocator<wchar_t> >::value, "");
+#endif
+
+// Copy properties
+
+static_assert(!std::is_copy_constructible<std::stringstream>::value, "");
+static_assert(!std::is_copy_assignable<std::stringstream>::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(!std::is_copy_constructible<std::wstringstream>::value, "");
+static_assert(!std::is_copy_assignable<std::wstringstream>::value, "");
+#endif
+
+// Move properties
+
+static_assert(std::is_move_constructible<std::stringstream>::value, "");
+static_assert(std::is_move_assignable<std::stringstream>::value, "");
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+static_assert(std::is_move_constructible<std::wstringstream>::value, "");
+static_assert(std::is_move_assignable<std::wstringstream>::value, "");
+#endif
diff --git a/libcxx/test/std/input.output/string.streams/stringstream/types.pass.cpp b/libcxx/test/std/input.output/string.streams/stringstream/types.pass.cpp
deleted file mode 100644
index 11990e6a57f8..000000000000
--- a/libcxx/test/std/input.output/string.streams/stringstream/types.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <sstream>
-
-// template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
-// class basic_stringstream
-// : public basic_iostream<charT, traits>
-// {
-// public:
-// typedef charT char_type;
-// typedef traits traits_type;
-// typedef typename traits_type::int_type int_type;
-// typedef typename traits_type::pos_type pos_type;
-// typedef typename traits_type::off_type off_type;
-// typedef Allocator allocator_type;
-
-#include <sstream>
-#include <type_traits>
-
-#include "test_macros.h"
-
-int main(int, char**)
-{
- static_assert((std::is_base_of<std::basic_iostream<char>, std::basic_stringstream<char> >::value), "");
- static_assert((std::is_same<std::basic_stringstream<char>::char_type, char>::value), "");
- static_assert((std::is_same<std::basic_stringstream<char>::traits_type, std::char_traits<char> >::value), "");
- static_assert((std::is_same<std::basic_stringstream<char>::int_type, std::char_traits<char>::int_type>::value), "");
- static_assert((std::is_same<std::basic_stringstream<char>::pos_type, std::char_traits<char>::pos_type>::value), "");
- static_assert((std::is_same<std::basic_stringstream<char>::off_type, std::char_traits<char>::off_type>::value), "");
- static_assert((std::is_same<std::basic_stringstream<char>::allocator_type, std::allocator<char> >::value), "");
-
- return 0;
-}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
index cde7dc53be5b..4e34ebcb46c7 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
@@ -9,6 +9,10 @@
// UNSUPPORTED: no-exceptions
// UNSUPPORTED: sanitizer-new-delete
+// Libc++ when built for z/OS doesn't contain the aligned allocation functions,
+// nor does the dynamic library shipped with z/OS.
+// XFAIL: target={{.+}}-zos{{.*}}
+
#include <new>
#include <cassert>
#include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp
index 251ba0f49599..07ec84bdefc2 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp
@@ -9,6 +9,10 @@
// UNSUPPORTED: no-exceptions
// UNSUPPORTED: sanitizer-new-delete
+// Libc++ when built for z/OS doesn't contain the aligned allocation functions,
+// nor does the dynamic library shipped with z/OS.
+// XFAIL: target={{.+}}-zos{{.*}}
+
#include <new>
#include <cassert>
#include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
index cb83fb26ab1e..7694314c87bf 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
@@ -9,6 +9,10 @@
// UNSUPPORTED: no-exceptions
// UNSUPPORTED: sanitizer-new-delete
+// Libc++ when built for z/OS doesn't contain the aligned allocation functions,
+// nor does the dynamic library shipped with z/OS.
+// XFAIL: target={{.+}}-zos{{.*}}
+
#include <new>
#include <cassert>
#include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp
index d95e78e24e1a..bf61d0b857a0 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp
@@ -9,6 +9,10 @@
// UNSUPPORTED: no-exceptions
// UNSUPPORTED: sanitizer-new-delete
+// Libc++ when built for z/OS doesn't contain the aligned allocation functions,
+// nor does the dynamic library shipped with z/OS.
+// XFAIL: target={{.+}}-zos{{.*}}
+
#include <new>
#include <cassert>
#include <limits>
diff --git a/libcxx/test/std/time/rep.h b/libcxx/test/std/time/rep.h
index 80a0e3c54571..ddb5c0b24931 100644
--- a/libcxx/test/std/time/rep.h
+++ b/libcxx/test/std/time/rep.h
@@ -10,6 +10,7 @@
#define REP_H
#include "test_macros.h"
+#include <type_traits>
class Rep
{
@@ -29,6 +30,28 @@ public:
struct NotARep {};
+#if TEST_STD_VER >= 11
+// Several duration operators take a Rep parameter. Before LWG3050 this
+// parameter was constrained to be convertible from a non-const object,
+// but the code always uses a const object. So the function was SFINAE'd
+// away for this type. LWG3050 fixes the constraint to use a const
+// object.
+struct RepConstConvertibleLWG3050 {
+ operator long() = delete;
+ operator long() const { return 2; }
+};
+namespace std {
+template <>
+struct common_type<RepConstConvertibleLWG3050, int> {
+ using type = long;
+};
+template <>
+struct common_type<int, RepConstConvertibleLWG3050> {
+ using type = long;
+};
+} // namespace std
+#endif // TEST_STD_VER >= 11
+
// std::chrono:::duration has only '*', '/' and '%' taking a "Rep" parameter
// Multiplication is commutative, division is not.
diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp
index d580f4edf2fe..6cedd13a13cf 100644
--- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp
+++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_divide_duration.pass.cpp
@@ -21,6 +21,7 @@
#include "test_macros.h"
#include "truncate_fp.h"
+#include "../../rep.h"
int main(int, char**)
{
@@ -65,7 +66,17 @@ int main(int, char**)
constexpr std::chrono::duration<double, std::ratio<3, 5> > s2(5);
static_assert(s1 / s2 == 20./3, "");
}
-#endif
+ {
+ std::chrono::duration<int> d(5);
+ RepConstConvertibleLWG3050 x;
+
+ {
+ auto r = d / x;
+ assert(r.count() == 2);
+ ASSERT_SAME_TYPE(std::chrono::duration<long>, decltype(r));
+ }
+ }
+#endif // TEST_STD_VER >= 11
- return 0;
+ return 0;
}
diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp
index 8b8b50d940b8..df637e1c23e3 100644
--- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp
+++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_mod_duration.pass.cpp
@@ -18,6 +18,7 @@
#include <chrono>
#include <cassert>
#include <ratio>
+#include "../../rep.h"
#include "test_macros.h"
@@ -60,7 +61,17 @@ int main(int, char**)
constexpr std::chrono::duration<int, std::ratio<1, 15> > r = s1 % s2;
static_assert(r.count() == 24, "");
}
-#endif
+ {
+ std::chrono::duration<int> d(5);
+ RepConstConvertibleLWG3050 x;
+
+ {
+ auto r = d % x;
+ assert(r.count() == 1);
+ ASSERT_SAME_TYPE(std::chrono::duration<long>, decltype(r));
+ }
+ }
+#endif // TEST_STD_VER >= 11
- return 0;
+ return 0;
}
diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp
index c33103231282..d7c8c2da3c32 100644
--- a/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp
+++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/op_times_rep.pass.cpp
@@ -26,28 +26,27 @@
#include "test_macros.h"
#include "../../rep.h"
-int main(int, char**)
-{
- {
+int main(int, char**) {
+ {
std::chrono::nanoseconds ns(3);
ns = ns * 5;
assert(ns.count() == 15);
ns = 6 * ns;
assert(ns.count() == 90);
- }
+ }
#if TEST_STD_VER >= 11
- {
+ {
constexpr std::chrono::nanoseconds ns(3);
constexpr std::chrono::nanoseconds ns2 = ns * 5;
static_assert(ns2.count() == 15, "");
constexpr std::chrono::nanoseconds ns3 = 6 * ns;
static_assert(ns3.count() == 18, "");
- }
+ }
#endif
#if TEST_STD_VER >= 11
- { // This is related to PR#41130
+ { // This is related to PR#41130
typedef std::chrono::nanoseconds Duration;
Duration d(5);
NotARep n;
@@ -57,8 +56,23 @@ int main(int, char**)
assert(d.count() == 5);
d = n * d;
assert(d.count() == 5);
+ }
+ {
+ std::chrono::duration<int> d(8);
+ RepConstConvertibleLWG3050 x;
+
+ {
+ auto r = d * x;
+ assert(r.count() == 16);
+ ASSERT_SAME_TYPE(std::chrono::duration<long>, decltype(r));
}
-#endif
+ {
+ auto r = x * d;
+ assert(r.count() == 16);
+ ASSERT_SAME_TYPE(std::chrono::duration<long>, decltype(r));
+ }
+ }
+#endif // TEST_STD_VER >= 11
return 0;
}
diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp
index b628a2c76acf..a6aa590ee944 100644
--- a/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp
+++ b/libcxx/test/std/utilities/charconv/charconv.syn/from_chars_result.operator_bool.pass.cpp
@@ -28,13 +28,13 @@ constexpr bool test() {
{
std::from_chars_result value{nullptr, std::errc{}};
assert(bool(value) == true);
- static_assert(noexcept(bool(true)) == true);
+ static_assert(noexcept(bool(value)) == true);
}
// False
{
std::from_chars_result value{nullptr, std::errc::value_too_large};
assert(bool(value) == false);
- static_assert(noexcept(bool(true)) == true);
+ static_assert(noexcept(bool(value)) == true);
}
return true;
diff --git a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp
index ef9364d3a647..621eb8a493fd 100644
--- a/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp
+++ b/libcxx/test/std/utilities/charconv/charconv.syn/to_chars_result.operator_bool.pass.cpp
@@ -28,13 +28,13 @@ constexpr bool test() {
{
std::to_chars_result value{nullptr, std::errc{}};
assert(bool(value) == true);
- static_assert(noexcept(bool(true)) == true);
+ static_assert(noexcept(bool(value)) == true);
}
// False
{
std::to_chars_result value{nullptr, std::errc::value_too_large};
assert(bool(value) == false);
- static_assert(noexcept(bool(true)) == true);
+ static_assert(noexcept(bool(value)) == true);
}
return true;
diff --git a/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp
new file mode 100644
index 000000000000..9fc91e1f2ce2
--- /dev/null
+++ b/libcxx/test/std/utilities/ratio/ratio.arithmetic/R1_R2_requirement.verify.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <ratio>
+//
+// [ratio.general]/2
+// Throughout subclause [ratio], the names of template parameters are
+// used to express type requirements. If a template parameter is named
+// R1 or R2, and the template argument is not a specialization of the
+// ratio template, the program is ill-formed.
+
+#include <ratio>
+
+struct invalid {
+ static const int num = 1;
+ static const int den = 1;
+};
+
+using valid = std::ratio<1, 1>;
+
+namespace add {
+using valid_valid = std::ratio_add<valid, valid>::type;
+using invalid_valid =
+ std::ratio_add<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_add<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace add
+
+namespace subtract {
+using valid_valid = std::ratio_subtract<valid, valid>::type;
+using invalid_valid =
+ std::ratio_subtract<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_subtract<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace subtract
+
+namespace multiply {
+using valid_valid = std::ratio_multiply<valid, valid>::type;
+using invalid_valid =
+ std::ratio_multiply<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_multiply<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace multiply
+
+namespace divide {
+using valid_valid = std::ratio_divide<valid, valid>::type;
+using invalid_valid =
+ std::ratio_divide<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_divide<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace divide
diff --git a/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp
new file mode 100644
index 000000000000..03bb266b6074
--- /dev/null
+++ b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement.verify.cpp
@@ -0,0 +1,81 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <ratio>
+//
+// [ratio.general]/2
+// Throughout subclause [ratio], the names of template parameters are
+// used to express type requirements. If a template parameter is named
+// R1 or R2, and the template argument is not a specialization of the
+// ratio template, the program is ill-formed.
+//
+// Since all std::ratio_xxx_v variables use the same instantiation, only one
+// error will be generated. These values are tested in a separate test.
+
+#include <ratio>
+
+struct invalid {
+ static const int num = 1;
+ static const int den = 1;
+};
+
+using valid = std::ratio<1, 1>;
+
+namespace equal {
+using valid_valid = std::ratio_equal<valid, valid>::type;
+using invalid_valid =
+ std::ratio_equal<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_equal<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace equal
+
+namespace not_equal {
+using valid_valid = std::ratio_not_equal<valid, valid>::type;
+using invalid_valid =
+ std::ratio_not_equal<invalid,
+ valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_not_equal<valid,
+ invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace not_equal
+
+namespace less {
+using valid_valid = std::ratio_less<valid, valid>::type;
+using invalid_valid =
+ std::ratio_less<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_less<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace less
+
+namespace less_equal {
+using valid_valid = std::ratio_less_equal<valid, valid>::type;
+using invalid_valid =
+ std::ratio_less_equal<invalid,
+ valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_less_equal<valid,
+ invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace less_equal
+
+namespace greater {
+using valid_valid = std::ratio_greater<valid, valid>::type;
+using invalid_valid =
+ std::ratio_greater<invalid, valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_greater<valid, invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace greater
+
+namespace greater_equal {
+using valid_valid = std::ratio_greater_equal<valid, valid>::type;
+using invalid_valid =
+ std::ratio_greater_equal<invalid,
+ valid>::type; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+using valid_invalid =
+ std::ratio_greater_equal<valid,
+ invalid>::type; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+} // namespace greater_equal
diff --git a/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp
new file mode 100644
index 000000000000..fbcf358894cf
--- /dev/null
+++ b/libcxx/test/std/utilities/ratio/ratio.comparison/R1_R2_requirement_v.verify.cpp
@@ -0,0 +1,69 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <ratio>
+//
+// [ratio.general]/2
+// Throughout subclause [ratio], the names of template parameters are
+// used to express type requirements. If a template parameter is named
+// R1 or R2, and the template argument is not a specialization of the
+// ratio template, the program is ill-formed.
+//
+// Since all std::ratio_xxx_v variables use the same instantiation, only one
+// error will be generated. These values are tested in a separate test.
+
+#include <ratio>
+
+struct invalid {
+ constexpr static int num = 1;
+ constexpr static int den = 1;
+};
+
+using valid = std::ratio<1, 1>;
+
+void test() {
+ // equal
+ (void)std::ratio_equal_v<valid, valid>;
+ (void)std::ratio_equal_v<invalid, valid>; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+ (void)std::ratio_equal_v<valid, invalid>; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+
+ // not_equal
+ (void)std::ratio_not_equal_v<valid, valid>;
+ (void)std::ratio_not_equal_v<invalid,
+ valid>; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+ (void)std::ratio_not_equal_v<valid,
+ invalid>; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+
+ // less
+ (void)std::ratio_less_v<valid, valid>;
+ (void)std::ratio_less_v<invalid, valid>; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+ (void)std::ratio_less_v<valid, invalid>; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+
+ // less_equal
+ (void)std::ratio_less_equal_v<valid, valid>;
+ (void)std::ratio_less_equal_v<invalid,
+ valid>; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+ (void)std::ratio_less_equal_v<valid,
+ invalid>; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+
+ // greater
+ (void)std::ratio_greater_v<valid, valid>;
+ (void)std::ratio_greater_v<invalid, valid>; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+ (void)std::ratio_greater_v<valid, invalid>; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+
+ // greater_equal
+ (void)std::ratio_greater_equal_v<valid, valid>;
+
+ (void)std::ratio_greater_equal_v<invalid,
+ valid>; // expected-error@*:* {{R1 to be a specialisation of the ratio template}}
+
+ (void)std::ratio_greater_equal_v<valid,
+ invalid>; // expected-error@*:* {{R2 to be a specialisation of the ratio template}}
+}
diff --git a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp
index 5157a45452fa..0a48f855fba0 100644
--- a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp
@@ -69,50 +69,18 @@ header_exportable_declarations::header_exportable_declarations(
}
std::optional<llvm::StringRef> list = Options.get("SkipDeclarations");
- // TODO(LLVM-17) Remove clang 15 work-around.
-#if defined(__clang_major__) && __clang_major__ < 16
- if (list) {
- std::string_view s = *list;
- auto b = s.begin();
- auto e = std::find(b, s.end(), ' ');
- while (b != e) {
- skip_decls_.emplace(b, e);
- if (e == s.end())
- break;
- b = e + 1;
- e = std::find(b, s.end(), ' ');
- }
- }
-#else // defined(__clang_major__) && __clang_major__ < 16
if (list)
for (auto decl : std::views::split(*list, ' ')) {
std::string s;
std::ranges::copy(decl, std::back_inserter(s)); // use range based constructor
skip_decls_.emplace(std::move(s));
}
-#endif // defined(__clang_major__) && __clang_major__ < 16
decls_ = skip_decls_;
list = Options.get("ExtraDeclarations");
- // TODO(LLVM-17) Remove clang 15 work-around.
-#if defined(__clang_major__) && __clang_major__ < 16
- if (list) {
- std::string_view s = *list;
- auto b = s.begin();
- auto e = std::find(b, s.end(), ' ');
- while (b != e) {
- std::cout << "using ::" << std::string_view{b, e} << ";\n";
- if (e == s.end())
- break;
- b = e + 1;
- e = std::find(b, s.end(), ' ');
- }
- }
-#else // defined(__clang_major__) && __clang_major__ < 16
if (list)
for (auto decl : std::views::split(*list, ' '))
std::cout << "using ::" << std::string_view{decl.data(), decl.size()} << ";\n";
-#endif // defined(__clang_major__) && __clang_major__ < 16
}
header_exportable_declarations::~header_exportable_declarations() {
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index cc1fc50ce412..9e7ea86f2daa 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -1769,7 +1769,7 @@ def pad_cell(s, length, left_align=True):
def get_status_table():
table = [["Macro Name", "Value"]]
for std in get_std_dialects():
- table += [["**" + std.replace("c++", "C++ ") + "**", ""]]
+ table += [["**" + std.replace("c++", "C++") + "**", ""]]
for tc in feature_test_macros:
if std not in tc["values"].keys():
continue
diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py
index 2d3f829847fb..e98ac1b24331 100644
--- a/libcxx/utils/generate_libcxx_cppm_in.py
+++ b/libcxx/utils/generate_libcxx_cppm_in.py
@@ -57,18 +57,31 @@ module;
else:
module_cpp_in.write(f"#include <{header}>\n")
- module_cpp_in.write("\n// *** Headers not yet available ***\n")
+ module_cpp_in.write(
+ """
+// *** Headers not yet available ***
+//
+// This validation is mainly to catch when a new header is added but adding the
+// corresponding .inc file is forgotten. However, the check based on __has_include
+// alone doesn't work on Windows because the Windows SDK is on the include path,
+// and that means the MSVC STL headers can be found as well, tricking __has_include
+// into thinking that libc++ provides the header.
+//
+#ifndef _WIN32
+"""
+ )
for header in sorted(headers_not_available):
module_cpp_in.write(
f"""\
-#if __has_include(<{header}>)
-# error "please update the header information for <{header}> in headers_not_available in utils/libcxx/header_information.py"
-#endif // __has_include(<{header}>)
+# if __has_include(<{header}>)
+# error "please update the header information for <{header}> in headers_not_available in utils/libcxx/header_information.py"
+# endif // __has_include(<{header}>)
"""
)
module_cpp_in.write(
- f"""
+ f"""#endif // _WIN32
+
export module {module};
{'export import std;' if module == 'std.compat' else ''}
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index ae719a1d4745..6ef40755c59d 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -27,7 +27,7 @@ _msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(com
def _getSuitableClangTidy(cfg):
try:
# If we didn't build the libcxx-tidy plugin via CMake, we can't run the clang-tidy tests.
- if runScriptExitCode(cfg, ["stat %{test-tools}/clang_tidy_checks/libcxx-tidy.plugin"]) != 0:
+ if runScriptExitCode(cfg, ["stat %{test-tools-dir}/clang_tidy_checks/libcxx-tidy.plugin"]) != 0:
return None
# TODO MODULES require ToT due module specific fixes.
@@ -526,12 +526,94 @@ DEFAULT_FEATURES += [
# target that doesn't support it will fail at compile time, not at runtime. This can
# be achieved by creating a `.verify.cpp` test that checks for the right errors, and
# mark that test as requiring `stdlib=<vendor>-libc++ && target=<target>`.
+#
+# Since it is not always known which deployment target to pick there are
+# short-hands based on the LLVM version like using-built-library-before-llvm-xx.
+# These short-hands make it easy for libc++ developers to select the proper
+# version the feature will be available in and allows vendors to set the proper
+# target information.
DEFAULT_FEATURES += [
+ # Backdeployment short-hands
+ Feature(
+ name="using-built-library-before-llvm-11",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0)(.0)?}}",
+ cfg.available_features,
+ ),
+ ),
+ Feature(
+ name="using-built-library-before-llvm-12",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-11 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx12.{{(0|1|2)}}.0)",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-13",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-12 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx{{((12.(3|4|5|6|7))|(13.(0|1|2|3)))}}.0)",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-14",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-13",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-15",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-14 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx13.{{(4|5|6)}}.0)",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-16",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-15 || (stdlib=apple-libc++ && target={{.+}}-apple-macosx14.{{(0|1|2|3)}}.0)",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-17",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-16",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-18",
+ when=lambda cfg: BooleanExpression.evaluate(
+ # For now, no released version of macOS contains LLVM 18
+ # TODO(ldionne) Please provide the correct value.
+ "using-built-library-before-llvm-17 || stdlib=apple-libc++ && target={{.+}}-apple-macosx{{.+}}",
+ cfg.available_features,
+ ),
+ ),
+
+ Feature(
+ name="using-built-library-before-llvm-19",
+ when=lambda cfg: BooleanExpression.evaluate(
+ # For now, no released version of macOS contains LLVM 19
+ # TODO(ldionne) Please provide the correct value.
+ "using-built-library-before-llvm-18 || stdlib=apple-libc++ && target={{.+}}-apple-macosx{{.+}}",
+ cfg.available_features,
+ ),
+ ),
+
# Tests that require std::to_chars(floating-point) in the built library
Feature(
name="availability-fp_to_chars-missing",
when=lambda cfg: BooleanExpression.evaluate(
- "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}",
+ "using-built-library-before-llvm-13",
cfg.available_features,
),
),
@@ -539,7 +621,7 @@ DEFAULT_FEATURES += [
Feature(
name="availability-char8_t_support-missing",
when=lambda cfg: BooleanExpression.evaluate(
- "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0)(.0)?}}",
+ "using-built-library-before-llvm-11",
cfg.available_features,
),
),
@@ -547,7 +629,7 @@ DEFAULT_FEATURES += [
Feature(
name="availability-verbose_abort-missing",
when=lambda cfg: BooleanExpression.evaluate(
- "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}",
+ "using-built-library-before-llvm-13",
cfg.available_features,
),
),
@@ -555,7 +637,7 @@ DEFAULT_FEATURES += [
Feature(
name="availability-pmr-missing",
when=lambda cfg: BooleanExpression.evaluate(
- "stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}}",
+ "using-built-library-before-llvm-13",
cfg.available_features,
),
),
@@ -579,8 +661,15 @@ DEFAULT_FEATURES += [
Feature(
name="availability-tzdb-missing",
when=lambda cfg: BooleanExpression.evaluate(
- # TODO(ldionne) Please provide the correct value.
- "(stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(10.13|10.14|10.15|11.0|12.0|13.0)(.0)?}})",
+ "using-built-library-before-llvm-19",
+ cfg.available_features,
+ ),
+ ),
+ # Tests that require support for <print> and std::print in <ostream> in the built library.
+ Feature(
+ name="availability-print-missing",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "using-built-library-before-llvm-18",
cfg.available_features,
),
),
diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py
index 13175214879c..229da224994f 100644
--- a/libcxx/utils/libcxx/test/format.py
+++ b/libcxx/utils/libcxx/test/format.py
@@ -172,7 +172,7 @@ def parseScript(test, preamble):
f"{compileFlags} "
"-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal "
"-fmodule-file=std=%T/std.pcm " # The std.compat module imports std.
- "--precompile -o %T/std.compat.pcm -c %{module}/std.compat.cppm",
+ "--precompile -o %T/std.compat.pcm -c %{module-dir}/std.compat.cppm",
)
moduleCompileFlags.extend(
["-fmodule-file=std.compat=%T/std.compat.pcm", "%T/std.compat.pcm"]
@@ -188,7 +188,7 @@ def parseScript(test, preamble):
"%dbg(MODULE std) %{cxx} %{flags} "
f"{compileFlags} "
"-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal "
- "--precompile -o %T/std.pcm -c %{module}/std.cppm",
+ "--precompile -o %T/std.pcm -c %{module-dir}/std.cppm",
)
moduleCompileFlags.extend(["-fmodule-file=std=%T/std.pcm", "%T/std.pcm"])
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index e0afb6b18805..22ee2f133be9 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1825,7 +1825,15 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
}
} else {
config->repro = false;
- config->timestamp = time(nullptr);
+ if (std::optional<std::string> epoch =
+ Process::GetEnv("SOURCE_DATE_EPOCH")) {
+ StringRef value(*epoch);
+ if (value.getAsInteger(0, config->timestamp))
+ fatal(Twine("invalid SOURCE_DATE_EPOCH timestamp: ") + value +
+ ". Expected 32-bit integer");
+ } else {
+ config->timestamp = time(nullptr);
+ }
}
// Handle /alternatename
diff --git a/lld/ELF/Arch/SystemZ.cpp b/lld/ELF/Arch/SystemZ.cpp
new file mode 100644
index 000000000000..d37db6877559
--- /dev/null
+++ b/lld/ELF/Arch/SystemZ.cpp
@@ -0,0 +1,607 @@
+//===- SystemZ.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "OutputSections.h"
+#include "Symbols.h"
+#include "SyntheticSections.h"
+#include "Target.h"
+#include "lld/Common/ErrorHandler.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::support::endian;
+using namespace llvm::ELF;
+using namespace lld;
+using namespace lld::elf;
+
+namespace {
+class SystemZ : public TargetInfo {
+public:
+ SystemZ();
+ int getTlsGdRelaxSkip(RelType type) const override;
+ RelExpr getRelExpr(RelType type, const Symbol &s,
+ const uint8_t *loc) const override;
+ RelType getDynRel(RelType type) const override;
+ void writeGotHeader(uint8_t *buf) const override;
+ void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
+ void writeIgotPlt(uint8_t *buf, const Symbol &s) const override;
+ void writePltHeader(uint8_t *buf) const override;
+ void addPltHeaderSymbols(InputSection &isd) const override;
+ void writePlt(uint8_t *buf, const Symbol &sym,
+ uint64_t pltEntryAddr) const override;
+ RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
+ RelExpr adjustGotPcExpr(RelType type, int64_t addend,
+ const uint8_t *loc) const override;
+ bool relaxOnce(int pass) const override;
+ void relocate(uint8_t *loc, const Relocation &rel,
+ uint64_t val) const override;
+ int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
+
+private:
+ void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+ void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+ void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+ void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+};
+} // namespace
+
+SystemZ::SystemZ() {
+ copyRel = R_390_COPY;
+ gotRel = R_390_GLOB_DAT;
+ pltRel = R_390_JMP_SLOT;
+ relativeRel = R_390_RELATIVE;
+ iRelativeRel = R_390_IRELATIVE;
+ symbolicRel = R_390_64;
+ tlsGotRel = R_390_TLS_TPOFF;
+ tlsModuleIndexRel = R_390_TLS_DTPMOD;
+ tlsOffsetRel = R_390_TLS_DTPOFF;
+ gotHeaderEntriesNum = 3;
+ gotPltHeaderEntriesNum = 0;
+ gotEntrySize = 8;
+ pltHeaderSize = 32;
+ pltEntrySize = 32;
+ ipltEntrySize = 32;
+
+ // This "trap instruction" is used to fill gaps between sections.
+ // On SystemZ, the behavior of the GNU ld is to fill those gaps
+ // with nop instructions instead - and unfortunately the default
+ // glibc crt object files (used to) rely on that behavior since
+ // they use an alignment on the .init section fragments that causes
+ // gaps which must be filled with nops as they are being executed.
+ // Therefore, we provide a nop instruction as "trapInstr" here.
+ trapInstr = {0x07, 0x07, 0x07, 0x07};
+
+ defaultImageBase = 0x1000000;
+}
+
+RelExpr SystemZ::getRelExpr(RelType type, const Symbol &s,
+ const uint8_t *loc) const {
+ switch (type) {
+ case R_390_NONE:
+ return R_NONE;
+ // Relocations targeting the symbol value.
+ case R_390_8:
+ case R_390_12:
+ case R_390_16:
+ case R_390_20:
+ case R_390_32:
+ case R_390_64:
+ return R_ABS;
+ case R_390_PC16:
+ case R_390_PC32:
+ case R_390_PC64:
+ case R_390_PC12DBL:
+ case R_390_PC16DBL:
+ case R_390_PC24DBL:
+ case R_390_PC32DBL:
+ return R_PC;
+ case R_390_GOTOFF16:
+ case R_390_GOTOFF: // a.k.a. R_390_GOTOFF32
+ case R_390_GOTOFF64:
+ return R_GOTREL;
+ // Relocations targeting the PLT associated with the symbol.
+ case R_390_PLT32:
+ case R_390_PLT64:
+ case R_390_PLT12DBL:
+ case R_390_PLT16DBL:
+ case R_390_PLT24DBL:
+ case R_390_PLT32DBL:
+ return R_PLT_PC;
+ case R_390_PLTOFF16:
+ case R_390_PLTOFF32:
+ case R_390_PLTOFF64:
+ return R_PLT_GOTREL;
+ // Relocations targeting the GOT entry associated with the symbol.
+ case R_390_GOTENT:
+ return R_GOT_PC;
+ case R_390_GOT12:
+ case R_390_GOT16:
+ case R_390_GOT20:
+ case R_390_GOT32:
+ case R_390_GOT64:
+ return R_GOT_OFF;
+ // Relocations targeting the GOTPLT entry associated with the symbol.
+ case R_390_GOTPLTENT:
+ return R_GOTPLT_PC;
+ case R_390_GOTPLT12:
+ case R_390_GOTPLT16:
+ case R_390_GOTPLT20:
+ case R_390_GOTPLT32:
+ case R_390_GOTPLT64:
+ return R_GOTPLT_GOTREL;
+ // Relocations targeting _GLOBAL_OFFSET_TABLE_.
+ case R_390_GOTPC:
+ case R_390_GOTPCDBL:
+ return R_GOTONLY_PC;
+ // TLS-related relocations.
+ case R_390_TLS_LOAD:
+ return R_NONE;
+ case R_390_TLS_GDCALL:
+ return R_TLSGD_PC;
+ case R_390_TLS_LDCALL:
+ return R_TLSLD_PC;
+ case R_390_TLS_GD32:
+ case R_390_TLS_GD64:
+ return R_TLSGD_GOT;
+ case R_390_TLS_LDM32:
+ case R_390_TLS_LDM64:
+ return R_TLSLD_GOT;
+ case R_390_TLS_LDO32:
+ case R_390_TLS_LDO64:
+ return R_DTPREL;
+ case R_390_TLS_LE32:
+ case R_390_TLS_LE64:
+ return R_TPREL;
+ case R_390_TLS_IE32:
+ case R_390_TLS_IE64:
+ return R_GOT;
+ case R_390_TLS_GOTIE12:
+ case R_390_TLS_GOTIE20:
+ case R_390_TLS_GOTIE32:
+ case R_390_TLS_GOTIE64:
+ return R_GOT_OFF;
+ case R_390_TLS_IEENT:
+ return R_GOT_PC;
+
+ default:
+ error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) +
+ ") against symbol " + toString(s));
+ return R_NONE;
+ }
+}
+
+void SystemZ::writeGotHeader(uint8_t *buf) const {
+ // _GLOBAL_OFFSET_TABLE_[0] holds the value of _DYNAMIC.
+ // _GLOBAL_OFFSET_TABLE_[1] and [2] are reserved.
+ write64be(buf, mainPart->dynamic->getVA());
+}
+
+void SystemZ::writeGotPlt(uint8_t *buf, const Symbol &s) const {
+ write64be(buf, s.getPltVA() + 14);
+}
+
+void SystemZ::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
+ if (config->writeAddends)
+ write64be(buf, s.getVA());
+}
+
+void SystemZ::writePltHeader(uint8_t *buf) const {
+ const uint8_t pltData[] = {
+ 0xe3, 0x10, 0xf0, 0x38, 0x00, 0x24, // stg %r1,56(%r15)
+ 0xc0, 0x10, 0x00, 0x00, 0x00, 0x00, // larl %r1,_GLOBAL_OFFSET_TABLE_
+ 0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8,%r15),8(%r1)
+ 0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1,16(%r1)
+ 0x07, 0xf1, // br %r1
+ 0x07, 0x00, // nopr
+ 0x07, 0x00, // nopr
+ 0x07, 0x00, // nopr
+ };
+ memcpy(buf, pltData, sizeof(pltData));
+ uint64_t got = in.got->getVA();
+ uint64_t plt = in.plt->getVA();
+ write32be(buf + 8, (got - plt - 6) >> 1);
+}
+
+void SystemZ::addPltHeaderSymbols(InputSection &isec) const {
+ // The PLT header needs a reference to _GLOBAL_OFFSET_TABLE_, so we
+ // must ensure the .got section is created even if otherwise unused.
+ in.got->hasGotOffRel.store(true, std::memory_order_relaxed);
+}
+
+void SystemZ::writePlt(uint8_t *buf, const Symbol &sym,
+ uint64_t pltEntryAddr) const {
+ const uint8_t inst[] = {
+ 0xc0, 0x10, 0x00, 0x00, 0x00, 0x00, // larl %r1,<.got.plt slot>
+ 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1,0(%r1)
+ 0x07, 0xf1, // br %r1
+ 0x0d, 0x10, // basr %r1,%r0
+ 0xe3, 0x10, 0x10, 0x0c, 0x00, 0x14, // lgf %r1,12(%r1)
+ 0xc0, 0xf4, 0x00, 0x00, 0x00, 0x00, // jg <plt header>
+ 0x00, 0x00, 0x00, 0x00, // <relocation offset>
+ };
+ memcpy(buf, inst, sizeof(inst));
+
+ write32be(buf + 2, (sym.getGotPltVA() - pltEntryAddr) >> 1);
+ write32be(buf + 24, (in.plt->getVA() - pltEntryAddr - 22) >> 1);
+ write32be(buf + 28, in.relaPlt->entsize * sym.getPltIdx());
+}
+
+int64_t SystemZ::getImplicitAddend(const uint8_t *buf, RelType type) const {
+ switch (type) {
+ case R_390_8:
+ return SignExtend64<8>(*buf);
+ case R_390_16:
+ case R_390_PC16:
+ return SignExtend64<16>(read16be(buf));
+ case R_390_PC16DBL:
+ return SignExtend64<16>(read16be(buf)) << 1;
+ case R_390_32:
+ case R_390_PC32:
+ return SignExtend64<32>(read32be(buf));
+ case R_390_PC32DBL:
+ return SignExtend64<32>(read32be(buf)) << 1;
+ case R_390_64:
+ case R_390_PC64:
+ case R_390_TLS_DTPMOD:
+ case R_390_TLS_DTPOFF:
+ case R_390_TLS_TPOFF:
+ case R_390_GLOB_DAT:
+ case R_390_RELATIVE:
+ case R_390_IRELATIVE:
+ return read64be(buf);
+ case R_390_COPY:
+ case R_390_JMP_SLOT:
+ case R_390_NONE:
+ // These relocations are defined as not having an implicit addend.
+ return 0;
+ default:
+ internalLinkerError(getErrorLocation(buf),
+ "cannot read addend for relocation " + toString(type));
+ return 0;
+ }
+}
+
+RelType SystemZ::getDynRel(RelType type) const {
+ if (type == R_390_64 || type == R_390_PC64)
+ return type;
+ return R_390_NONE;
+}
+
+RelExpr SystemZ::adjustTlsExpr(RelType type, RelExpr expr) const {
+ if (expr == R_RELAX_TLS_GD_TO_IE)
+ return R_RELAX_TLS_GD_TO_IE_GOT_OFF;
+ return expr;
+}
+
+int SystemZ::getTlsGdRelaxSkip(RelType type) const {
+ // A __tls_get_offset call instruction is marked with 2 relocations:
+ //
+ // R_390_TLS_GDCALL / R_390_TLS_LDCALL: marker relocation
+ // R_390_PLT32DBL: __tls_get_offset
+ //
+ // After the relaxation we no longer call __tls_get_offset and should skip
+ // both relocations to not create a false dependence on __tls_get_offset
+ // being defined.
+ //
+ // Note that this mechanism only works correctly if the R_390_TLS_[GL]DCALL
+ // is seen immediately *before* the R_390_PLT32DBL. Unfortunately, current
+ // compilers on the platform will typically generate the inverse sequence.
+ // To fix this, we sort relocations by offset in RelocationScanner::scan;
+ // this ensures the correct sequence as the R_390_TLS_[GL]DCALL applies to
+ // the first byte of the brasl instruction, while the R_390_PLT32DBL applies
+ // to its third byte (the relative displacement).
+
+ if (type == R_390_TLS_GDCALL || type == R_390_TLS_LDCALL)
+ return 2;
+ return 1;
+}
+
+void SystemZ::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel,
+ uint64_t val) const {
+ // The general-dynamic code sequence for a global `x`:
+ //
+ // Instruction Relocation Symbol
+ // ear %rX,%a0
+ // sllg %rX,%rX,32
+ // ear %rX,%a1
+ // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_
+ // lgrl %r2,.LC0 R_390_PC32DBL .LC0
+ // brasl %r14,__tls_get_offset@plt R_390_TLS_GDCALL x
+ // :tls_gdcall:x R_390_PLT32DBL __tls_get_offset
+ // la %r2,0(%r2,%rX)
+ //
+ // .LC0:
+ // .quad x@TLSGD R_390_TLS_GD64 x
+ //
+ // Relaxing to initial-exec entails:
+ // 1) Replacing the call by a load from the GOT.
+ // 2) Replacing the relocation on the constant LC0 by R_390_TLS_GOTIE64.
+
+ switch (rel.type) {
+ case R_390_TLS_GDCALL:
+ // brasl %r14,__tls_get_offset@plt -> lg %r2,0(%r2,%r12)
+ write16be(loc, 0xe322);
+ write32be(loc + 2, 0xc0000004);
+ break;
+ case R_390_TLS_GD64:
+ relocateNoSym(loc, R_390_TLS_GOTIE64, val);
+ break;
+ default:
+ llvm_unreachable("unsupported relocation for TLS GD to IE relaxation");
+ }
+}
+
+void SystemZ::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel,
+ uint64_t val) const {
+ // The general-dynamic code sequence for a global `x`:
+ //
+ // Instruction Relocation Symbol
+ // ear %rX,%a0
+ // sllg %rX,%rX,32
+ // ear %rX,%a1
+ // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_
+ // lgrl %r2,.LC0 R_390_PC32DBL .LC0
+ // brasl %r14,__tls_get_offset@plt R_390_TLS_GDCALL x
+ // :tls_gdcall:x R_390_PLT32DBL __tls_get_offset
+ // la %r2,0(%r2,%rX)
+ //
+ // .LC0:
+ // .quad x@tlsgd R_390_TLS_GD64 x
+ //
+ // Relaxing to local-exec entails:
+ // 1) Replacing the call by a nop.
+ // 2) Replacing the relocation on the constant LC0 by R_390_TLS_LE64.
+
+ switch (rel.type) {
+ case R_390_TLS_GDCALL:
+ // brasl %r14,__tls_get_offset@plt -> brcl 0,.
+ write16be(loc, 0xc004);
+ write32be(loc + 2, 0x00000000);
+ break;
+ case R_390_TLS_GD64:
+ relocateNoSym(loc, R_390_TLS_LE64, val);
+ break;
+ default:
+ llvm_unreachable("unsupported relocation for TLS GD to LE relaxation");
+ }
+}
+
+void SystemZ::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
+ uint64_t val) const {
+ // The local-dynamic code sequence for a global `x`:
+ //
+ // Instruction Relocation Symbol
+ // ear %rX,%a0
+ // sllg %rX,%rX,32
+ // ear %rX,%a1
+ // larl %r12,_GLOBAL_OFFSET_TABLE_ R_390_GOTPCDBL _GLOBAL_OFFSET_TABLE_
+ // lgrl %r2,.LC0 R_390_PC32DBL .LC0
+ // brasl %r14,__tls_get_offset@plt R_390_TLS_LDCALL <sym>
+ // :tls_ldcall:<sym> R_390_PLT32DBL __tls_get_offset
+ // la %r2,0(%r2,%rX)
+ // lgrl %rY,.LC1 R_390_PC32DBL .LC1
+ // la %r2,0(%r2,%rY)
+ //
+ // .LC0:
+ // .quad <sym>@tlsldm R_390_TLS_LDM64 <sym>
+ // .LC1:
+ // .quad x@dtpoff R_390_TLS_LDO64 x
+ //
+ // Relaxing to local-exec entails:
+ // 1) Replacing the call by a nop.
+ // 2) Replacing the constant LC0 by 0 (i.e. ignoring the relocation).
+ // 3) Replacing the relocation on the constant LC1 by R_390_TLS_LE64.
+
+ switch (rel.type) {
+ case R_390_TLS_LDCALL:
+ // brasl %r14,__tls_get_offset@plt -> brcl 0,.
+ write16be(loc, 0xc004);
+ write32be(loc + 2, 0x00000000);
+ break;
+ case R_390_TLS_LDM64:
+ break;
+ case R_390_TLS_LDO64:
+ relocateNoSym(loc, R_390_TLS_LE64, val);
+ break;
+ default:
+ llvm_unreachable("unsupported relocation for TLS LD to LE relaxation");
+ }
+}
+
+RelExpr SystemZ::adjustGotPcExpr(RelType type, int64_t addend,
+ const uint8_t *loc) const {
+ // Only R_390_GOTENT with addend 2 can be relaxed.
+ if (!config->relax || addend != 2 || type != R_390_GOTENT)
+ return R_GOT_PC;
+ const uint16_t op = read16be(loc - 2);
+
+ // lgrl rx,sym@GOTENT -> larl rx, sym
+ // This relaxation is legal if "sym" binds locally (which was already
+ // verified by our caller) and is in-range and properly aligned for a
+ // LARL instruction. We cannot verify the latter constraint here, so
+ // we assume it is true and revert the decision later on in relaxOnce
+ // if necessary.
+ if ((op & 0xff0f) == 0xc408)
+ return R_RELAX_GOT_PC;
+
+ return R_GOT_PC;
+}
+
+bool SystemZ::relaxOnce(int pass) const {
+ // If we decided in adjustGotPcExpr to relax a R_390_GOTENT,
+ // we need to validate the target symbol is in-range and aligned.
+ SmallVector<InputSection *, 0> storage;
+ bool changed = false;
+ for (OutputSection *osec : outputSections) {
+ if (!(osec->flags & SHF_EXECINSTR))
+ continue;
+ for (InputSection *sec : getInputSections(*osec, storage)) {
+ for (Relocation &rel : sec->relocs()) {
+ if (rel.expr != R_RELAX_GOT_PC)
+ continue;
+
+ uint64_t v = sec->getRelocTargetVA(
+ sec->file, rel.type, rel.addend,
+ sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr);
+ if (isInt<33>(v) && !(v & 1))
+ continue;
+ if (rel.sym->auxIdx == 0) {
+ rel.sym->allocateAux();
+ addGotEntry(*rel.sym);
+ changed = true;
+ }
+ rel.expr = R_GOT_PC;
+ }
+ }
+ }
+ return changed;
+}
+
+void SystemZ::relaxGot(uint8_t *loc, const Relocation &rel,
+ uint64_t val) const {
+ assert(isInt<33>(val) &&
+ "R_390_GOTENT should not have been relaxed if it overflows");
+ assert(!(val & 1) &&
+ "R_390_GOTENT should not have been relaxed if it is misaligned");
+ const uint16_t op = read16be(loc - 2);
+
+ // lgrl rx,sym@GOTENT -> larl rx, sym
+ if ((op & 0xff0f) == 0xc408) {
+ write16be(loc - 2, 0xc000 | (op & 0x00f0));
+ write32be(loc, val >> 1);
+ }
+}
+
+void SystemZ::relocate(uint8_t *loc, const Relocation &rel,
+ uint64_t val) const {
+ switch (rel.expr) {
+ case R_RELAX_GOT_PC:
+ return relaxGot(loc, rel, val);
+ case R_RELAX_TLS_GD_TO_IE_GOT_OFF:
+ return relaxTlsGdToIe(loc, rel, val);
+ case R_RELAX_TLS_GD_TO_LE:
+ return relaxTlsGdToLe(loc, rel, val);
+ case R_RELAX_TLS_LD_TO_LE:
+ return relaxTlsLdToLe(loc, rel, val);
+ default:
+ break;
+ }
+ switch (rel.type) {
+ case R_390_8:
+ checkIntUInt(loc, val, 8, rel);
+ *loc = val;
+ break;
+ case R_390_12:
+ case R_390_GOT12:
+ case R_390_GOTPLT12:
+ case R_390_TLS_GOTIE12:
+ checkUInt(loc, val, 12, rel);
+ write16be(loc, (read16be(loc) & 0xF000) | val);
+ break;
+ case R_390_PC12DBL:
+ case R_390_PLT12DBL:
+ checkInt(loc, val, 13, rel);
+ checkAlignment(loc, val, 2, rel);
+ write16be(loc, (read16be(loc) & 0xF000) | ((val >> 1) & 0x0FFF));
+ break;
+ case R_390_16:
+ case R_390_GOT16:
+ case R_390_GOTPLT16:
+ case R_390_GOTOFF16:
+ case R_390_PLTOFF16:
+ checkIntUInt(loc, val, 16, rel);
+ write16be(loc, val);
+ break;
+ case R_390_PC16:
+ checkInt(loc, val, 16, rel);
+ write16be(loc, val);
+ break;
+ case R_390_PC16DBL:
+ case R_390_PLT16DBL:
+ checkInt(loc, val, 17, rel);
+ checkAlignment(loc, val, 2, rel);
+ write16be(loc, val >> 1);
+ break;
+ case R_390_20:
+ case R_390_GOT20:
+ case R_390_GOTPLT20:
+ case R_390_TLS_GOTIE20:
+ checkInt(loc, val, 20, rel);
+ write32be(loc, (read32be(loc) & 0xF00000FF) | ((val & 0xFFF) << 16) |
+ ((val & 0xFF000) >> 4));
+ break;
+ case R_390_PC24DBL:
+ case R_390_PLT24DBL:
+ checkInt(loc, val, 25, rel);
+ checkAlignment(loc, val, 2, rel);
+ loc[0] = val >> 17;
+ loc[1] = val >> 9;
+ loc[2] = val >> 1;
+ break;
+ case R_390_32:
+ case R_390_GOT32:
+ case R_390_GOTPLT32:
+ case R_390_GOTOFF:
+ case R_390_PLTOFF32:
+ case R_390_TLS_IE32:
+ case R_390_TLS_GOTIE32:
+ case R_390_TLS_GD32:
+ case R_390_TLS_LDM32:
+ case R_390_TLS_LDO32:
+ case R_390_TLS_LE32:
+ checkIntUInt(loc, val, 32, rel);
+ write32be(loc, val);
+ break;
+ case R_390_PC32:
+ case R_390_PLT32:
+ checkInt(loc, val, 32, rel);
+ write32be(loc, val);
+ break;
+ case R_390_PC32DBL:
+ case R_390_PLT32DBL:
+ case R_390_GOTPCDBL:
+ case R_390_GOTENT:
+ case R_390_GOTPLTENT:
+ case R_390_TLS_IEENT:
+ checkInt(loc, val, 33, rel);
+ checkAlignment(loc, val, 2, rel);
+ write32be(loc, val >> 1);
+ break;
+ case R_390_64:
+ case R_390_PC64:
+ case R_390_PLT64:
+ case R_390_GOT64:
+ case R_390_GOTPLT64:
+ case R_390_GOTOFF64:
+ case R_390_PLTOFF64:
+ case R_390_GOTPC:
+ case R_390_TLS_IE64:
+ case R_390_TLS_GOTIE64:
+ case R_390_TLS_GD64:
+ case R_390_TLS_LDM64:
+ case R_390_TLS_LDO64:
+ case R_390_TLS_LE64:
+ case R_390_TLS_DTPMOD:
+ case R_390_TLS_DTPOFF:
+ case R_390_TLS_TPOFF:
+ write64be(loc, val);
+ break;
+ case R_390_TLS_LOAD:
+ case R_390_TLS_GDCALL:
+ case R_390_TLS_LDCALL:
+ break;
+ default:
+ llvm_unreachable("unknown relocation");
+ }
+}
+
+TargetInfo *elf::getSystemZTargetInfo() {
+ static SystemZ t;
+ return &t;
+}
diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
index 475f7dea1dd7..83d816ddb060 100644
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -33,6 +33,7 @@ add_lld_library(lldELF
Arch/PPC64.cpp
Arch/RISCV.cpp
Arch/SPARCV9.cpp
+ Arch/SystemZ.cpp
Arch/X86.cpp
Arch/X86_64.cpp
ARMErrataFix.cpp
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index c19fba6598a8..4bb9b7a0b2a9 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -200,6 +200,7 @@ static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(StringRef emul) {
.Case("msp430elf", {ELF32LEKind, EM_MSP430})
.Case("elf64_amdgpu", {ELF64LEKind, EM_AMDGPU})
.Case("elf64loongarch", {ELF64LEKind, EM_LOONGARCH})
+ .Case("elf64_s390", {ELF64BEKind, EM_S390})
.Default({ELFNoneKind, EM_NONE});
if (ret.first == ELFNoneKind)
@@ -1137,7 +1138,7 @@ static SmallVector<StringRef, 0> getSymbolOrderingFile(MemoryBufferRef mb) {
static bool getIsRela(opt::InputArgList &args) {
// The psABI specifies the default relocation entry format.
bool rela = is_contained({EM_AARCH64, EM_AMDGPU, EM_HEXAGON, EM_LOONGARCH,
- EM_PPC, EM_PPC64, EM_RISCV, EM_X86_64},
+ EM_PPC, EM_PPC64, EM_RISCV, EM_S390, EM_X86_64},
config->emachine);
// If -z rel or -z rela is specified, use the last option.
for (auto *arg : args.filtered(OPT_z)) {
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index a292e873e72f..6c7ef27cbd49 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -1614,6 +1614,8 @@ static uint16_t getBitcodeMachineKind(StringRef path, const Triple &t) {
return EM_RISCV;
case Triple::sparcv9:
return EM_SPARCV9;
+ case Triple::systemz:
+ return EM_S390;
case Triple::x86:
return t.isOSIAMCU() ? EM_IAMCU : EM_386;
case Triple::x86_64:
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 3d726b4c4b77..e033a715b592 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -655,6 +655,7 @@ static int64_t getTlsTpOffset(const Symbol &s) {
// Variant 2.
case EM_HEXAGON:
+ case EM_S390:
case EM_SPARCV9:
case EM_386:
case EM_X86_64:
@@ -717,6 +718,10 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
case R_GOT_PC:
case R_RELAX_TLS_GD_TO_IE:
return sym.getGotVA() + a - p;
+ case R_GOTPLT_GOTREL:
+ return sym.getGotPltVA() + a - in.got->getVA();
+ case R_GOTPLT_PC:
+ return sym.getGotPltVA() + a - p;
case R_LOONGARCH_GOT_PAGE_PC:
if (sym.hasFlag(NEEDS_TLSGD))
return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type);
@@ -808,6 +813,8 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
return getLoongArchPageDelta(sym.getPltVA() + a, p, type);
case R_PLT_GOTPLT:
return sym.getPltVA() + a - in.gotPlt->getVA();
+ case R_PLT_GOTREL:
+ return sym.getPltVA() + a - in.got->getVA();
case R_PPC32_PLTREL:
// R_PPC_PLTREL24 uses the addend (usually 0 or 0x8000) to indicate r30
// stores _GLOBAL_OFFSET_TABLE_ or .got2+0x8000. The addend is ignored for
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 79c8230724ad..f64b4219e0ac 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -203,8 +203,9 @@ static bool isAbsoluteValue(const Symbol &sym) {
// Returns true if Expr refers a PLT entry.
static bool needsPlt(RelExpr expr) {
- return oneof<R_PLT, R_PLT_PC, R_PLT_GOTPLT, R_LOONGARCH_PLT_PAGE_PC,
- R_PPC32_PLTREL, R_PPC64_CALL_PLT>(expr);
+ return oneof<R_PLT, R_PLT_PC, R_PLT_GOTREL, R_PLT_GOTPLT, R_GOTPLT_GOTREL,
+ R_GOTPLT_PC, R_LOONGARCH_PLT_PAGE_PC, R_PPC32_PLTREL,
+ R_PPC64_CALL_PLT>(expr);
}
bool lld::elf::needsGot(RelExpr expr) {
@@ -233,6 +234,8 @@ static RelExpr toPlt(RelExpr expr) {
return R_PLT_PC;
case R_ABS:
return R_PLT;
+ case R_GOTREL:
+ return R_PLT_GOTREL;
default:
return expr;
}
@@ -253,6 +256,8 @@ static RelExpr fromPlt(RelExpr expr) {
return R_ABS;
case R_PLT_GOTPLT:
return R_GOTPLTREL;
+ case R_PLT_GOTREL:
+ return R_GOTREL;
default:
return expr;
}
@@ -979,10 +984,10 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type,
if (oneof<R_GOTPLT, R_GOT_OFF, R_RELAX_HINT, R_MIPS_GOT_LOCAL_PAGE,
R_MIPS_GOTREL, R_MIPS_GOT_OFF, R_MIPS_GOT_OFF32, R_MIPS_GOT_GP_PC,
R_AARCH64_GOT_PAGE_PC, R_GOT_PC, R_GOTONLY_PC, R_GOTPLTONLY_PC,
- R_PLT_PC, R_PLT_GOTPLT, R_PPC32_PLTREL, R_PPC64_CALL_PLT,
- R_PPC64_RELAX_TOC, R_RISCV_ADD, R_AARCH64_GOT_PAGE,
- R_LOONGARCH_PLT_PAGE_PC, R_LOONGARCH_GOT, R_LOONGARCH_GOT_PAGE_PC>(
- e))
+ R_PLT_PC, R_PLT_GOTREL, R_PLT_GOTPLT, R_GOTPLT_GOTREL, R_GOTPLT_PC,
+ R_PPC32_PLTREL, R_PPC64_CALL_PLT, R_PPC64_RELAX_TOC, R_RISCV_ADD,
+ R_AARCH64_GOT_PAGE, R_LOONGARCH_PLT_PAGE_PC, R_LOONGARCH_GOT,
+ R_LOONGARCH_GOT_PAGE_PC>(e))
return true;
// These never do, except if the entire file is position dependent or if
@@ -1374,8 +1379,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
R_LOONGARCH_GOT_PAGE_PC, R_GOT_OFF, R_TLSIE_HINT>(expr)) {
ctx.hasTlsIe.store(true, std::memory_order_relaxed);
// Initial-Exec relocs can be optimized to Local-Exec if the symbol is
- // locally defined.
- if (execOptimize && isLocalInExecutable) {
+ // locally defined. This is not supported on SystemZ.
+ if (execOptimize && isLocalInExecutable && config->emachine != EM_S390) {
c.addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym});
} else if (expr != R_TLSIE_HINT) {
sym.setFlags(NEEDS_TLSIE);
@@ -1534,8 +1539,10 @@ void RelocationScanner::scan(ArrayRef<RelTy> rels) {
// For EhInputSection, OffsetGetter expects the relocations to be sorted by
// r_offset. In rare cases (.eh_frame pieces are reordered by a linker
// script), the relocations may be unordered.
+ // On SystemZ, all sections need to be sorted by r_offset, to allow TLS
+ // relaxation to be handled correctly - see SystemZ::getTlsGdRelaxSkip.
SmallVector<RelTy, 0> storage;
- if (isa<EhInputSection>(sec))
+ if (isa<EhInputSection>(sec) || config->emachine == EM_S390)
rels = sortRels(rels, storage);
end = static_cast<const void *>(rels.end());
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index cfb9092149f3..7eb8a811e693 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -40,11 +40,14 @@ enum RelExpr {
R_GOTPLT,
R_GOTPLTREL,
R_GOTREL,
+ R_GOTPLT_GOTREL,
+ R_GOTPLT_PC,
R_NONE,
R_PC,
R_PLT,
R_PLT_PC,
R_PLT_GOTPLT,
+ R_PLT_GOTREL,
R_RELAX_HINT,
R_RELAX_GOT_PC,
R_RELAX_GOT_PC_NOPIC,
diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp
index dd69916d6b05..f0ede1f43bbd 100644
--- a/lld/ELF/ScriptParser.cpp
+++ b/lld/ELF/ScriptParser.cpp
@@ -445,6 +445,7 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
.Case("elf32-msp430", {ELF32LEKind, EM_MSP430})
.Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH})
.Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH})
+ .Case("elf64-s390", {ELF64BEKind, EM_S390})
.Default({ELFNoneKind, EM_NONE});
}
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 4b413163314b..bada394aa30d 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1419,6 +1419,9 @@ DynamicSection<ELFT>::computeContents() {
case EM_MIPS:
addInSec(DT_MIPS_PLTGOT, *in.gotPlt);
break;
+ case EM_S390:
+ addInSec(DT_PLTGOT, *in.got);
+ break;
case EM_SPARCV9:
addInSec(DT_PLTGOT, *in.plt);
break;
diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp
index 671d22cc66a0..d879a427e9c0 100644
--- a/lld/ELF/Target.cpp
+++ b/lld/ELF/Target.cpp
@@ -87,10 +87,13 @@ TargetInfo *elf::getTarget() {
return getRISCVTargetInfo();
case EM_SPARCV9:
return getSPARCV9TargetInfo();
+ case EM_S390:
+ return getSystemZTargetInfo();
case EM_X86_64:
return getX86_64TargetInfo();
+ default:
+ fatal("unsupported e_machine value: " + Twine(config->emachine));
}
- llvm_unreachable("unknown target machine");
}
ErrorPlace elf::getErrorPlace(const uint8_t *loc) {
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index ed00e81c0e6c..0cefa3181356 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -188,6 +188,7 @@ TargetInfo *getPPC64TargetInfo();
TargetInfo *getPPCTargetInfo();
TargetInfo *getRISCVTargetInfo();
TargetInfo *getSPARCV9TargetInfo();
+TargetInfo *getSystemZTargetInfo();
TargetInfo *getX86TargetInfo();
TargetInfo *getX86_64TargetInfo();
template <class ELFT> TargetInfo *getMipsTargetInfo();
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 6df43a34be01..5b7dfd358e76 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1104,7 +1104,9 @@ template <class ELFT> void Writer<ELFT>::setReservedSymbolSections() {
PhdrEntry *last = nullptr;
PhdrEntry *lastRO = nullptr;
-
+ auto isLarge = [](OutputSection *osec) {
+ return config->emachine == EM_X86_64 && osec->flags & SHF_X86_64_LARGE;
+ };
for (Partition &part : partitions) {
for (PhdrEntry *p : part.phdrs) {
if (p->p_type != PT_LOAD)
@@ -1124,10 +1126,11 @@ template <class ELFT> void Writer<ELFT>::setReservedSymbolSections() {
}
if (last) {
- // _edata points to the end of the last mapped initialized section.
+ // _edata points to the end of the last non-large mapped initialized
+ // section.
OutputSection *edata = nullptr;
for (OutputSection *os : outputSections) {
- if (os->type != SHT_NOBITS)
+ if (os->type != SHT_NOBITS && !isLarge(os))
edata = os;
if (os == last->lastSec)
break;
@@ -2353,17 +2356,12 @@ static bool needsPtLoad(OutputSection *sec) {
return true;
}
-// Linker scripts are responsible for aligning addresses. Unfortunately, most
-// linker scripts are designed for creating two PT_LOADs only, one RX and one
-// RW. This means that there is no alignment in the RO to RX transition and we
-// cannot create a PT_LOAD there.
+// Adjust phdr flags according to certain options.
static uint64_t computeFlags(uint64_t flags) {
if (config->omagic)
return PF_R | PF_W | PF_X;
if (config->executeOnly && (flags & PF_X))
return flags & ~PF_R;
- if (config->singleRoRx && !(flags & PF_W))
- return flags | PF_X;
return flags;
}
@@ -2451,8 +2449,8 @@ SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
// Segments are contiguous memory regions that has the same attributes
// (e.g. executable or writable). There is one phdr for each segment.
// Therefore, we need to create a new phdr when the next section has
- // different flags or is loaded at a discontiguous address or memory region
- // using AT or AT> linker script command, respectively.
+ // incompatible flags or is loaded at a discontiguous address or memory
+ // region using AT or AT> linker script command, respectively.
//
// As an exception, we don't create a separate load segment for the ELF
// headers, even if the first "real" output has an AT or AT> attribute.
@@ -2465,13 +2463,22 @@ SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
// so when hasSectionsCommand, since we cannot introduce the extra alignment
// needed to create a new LOAD)
uint64_t newFlags = computeFlags(sec->getPhdrFlags());
+ // When --no-rosegment is specified, RO and RX sections are compatible.
+ uint32_t incompatible = flags ^ newFlags;
+ if (config->singleRoRx && !(newFlags & PF_W))
+ incompatible &= ~PF_X;
+ if (incompatible)
+ load = nullptr;
+
bool sameLMARegion =
load && !sec->lmaExpr && sec->lmaRegion == load->firstSec->lmaRegion;
- if (!(load && newFlags == flags && sec != relroEnd &&
- sec->memRegion == load->firstSec->memRegion &&
- (sameLMARegion || load->lastSec == Out::programHeaders) &&
- (script->hasSectionsCommand || sec->type == SHT_NOBITS ||
- load->lastSec->type != SHT_NOBITS))) {
+ if (load && sec != relroEnd &&
+ sec->memRegion == load->firstSec->memRegion &&
+ (sameLMARegion || load->lastSec == Out::programHeaders) &&
+ (script->hasSectionsCommand || sec->type == SHT_NOBITS ||
+ load->lastSec->type != SHT_NOBITS)) {
+ load->p_flags |= newFlags;
+ } else {
load = addHdr(PT_LOAD, newFlags);
flags = newFlags;
}
diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp
index 290eecaacda7..efd643f9a322 100644
--- a/lld/MinGW/Driver.cpp
+++ b/lld/MinGW/Driver.cpp
@@ -451,6 +451,10 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
add("-lto-cs-profile-generate");
if (auto *arg = args.getLastArg(OPT_lto_cs_profile_file))
add("-lto-cs-profile-file:" + StringRef(arg->getValue()));
+ if (args.hasArg(OPT_plugin_opt_emit_llvm))
+ add("-lldemit:llvm");
+ if (args.hasArg(OPT_lto_emit_asm))
+ add("-lldemit:asm");
if (auto *a = args.getLastArg(OPT_thinlto_cache_dir))
add("-lldltocache:" + StringRef(a->getValue()));
diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td
index 02f00f27406c..9a0a96aac7f1 100644
--- a/lld/MinGW/Options.td
+++ b/lld/MinGW/Options.td
@@ -158,6 +158,8 @@ def lto_cs_profile_generate: FF<"lto-cs-profile-generate">,
HelpText<"Perform context sensitive PGO instrumentation">;
def lto_cs_profile_file: JJ<"lto-cs-profile-file=">,
HelpText<"Context sensitive profile file path">;
+def lto_emit_asm: FF<"lto-emit-asm">,
+ HelpText<"Emit assembly code">;
def thinlto_cache_dir: JJ<"thinlto-cache-dir=">,
HelpText<"Path to ThinLTO cached object file directory">;
@@ -181,6 +183,9 @@ def: J<"plugin-opt=cs-profile-path=">,
Alias<lto_cs_profile_file>, HelpText<"Alias for --lto-cs-profile-file">;
def plugin_opt_dwo_dir_eq: J<"plugin-opt=dwo_dir=">,
HelpText<"Directory to store .dwo files when LTO and debug fission are used">;
+def plugin_opt_emit_asm: F<"plugin-opt=emit-asm">,
+ Alias<lto_emit_asm>, HelpText<"Alias for --lto-emit-asm">;
+def plugin_opt_emit_llvm: F<"plugin-opt=emit-llvm">;
def: J<"plugin-opt=jobs=">, Alias<thinlto_jobs_eq>, HelpText<"Alias for --thinlto-jobs=">;
def plugin_opt_mcpu_eq: J<"plugin-opt=mcpu=">;
diff --git a/lld/test/COFF/lto-cache-errors.ll b/lld/test/COFF/lto-cache-errors.ll
index 55244e5690dc..a46190a81b62 100644
--- a/lld/test/COFF/lto-cache-errors.ll
+++ b/lld/test/COFF/lto-cache-errors.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86, non-root-user
;; Not supported on windows since we use permissions to deny the creation
; UNSUPPORTED: system-windows
diff --git a/lld/test/COFF/thinlto-emit-imports.ll b/lld/test/COFF/thinlto-emit-imports.ll
index a9f22c1dc2dc..b47a6cea4eb7 100644
--- a/lld/test/COFF/thinlto-emit-imports.ll
+++ b/lld/test/COFF/thinlto-emit-imports.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86, non-root-user
; Generate summary sections and test lld handling.
; RUN: opt -module-summary %s -o %t1.obj
diff --git a/lld/test/COFF/timestamp.test b/lld/test/COFF/timestamp.test
index fbdc5788a33a..cc73af13c38c 100644
--- a/lld/test/COFF/timestamp.test
+++ b/lld/test/COFF/timestamp.test
@@ -3,9 +3,28 @@ RUN: yaml2obj %p/Inputs/generic.yaml -o %t.obj
RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.1.exe
RUN: lld-link %t.obj /debug /Brepro /entry:main /nodefaultlib /out:%t.2.exe
RUN: lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.3.exe
+RUN: env SOURCE_DATE_EPOCH=0 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.4.exe
+# Test timestamps corresponding to INT32_TMAX
+RUN: lld-link %t.obj /debug /timestamp:2147483647 /entry:main /nodefaultlib /out:%t.5.exe
+RUN: env SOURCE_DATE_EPOCH=2147483647 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.6.exe
+# Test that the command line option /timestamp has precedence over SOURCE_DATE_EPOCH
+RUN: env SOURCE_DATE_EPOCH=12345 lld-link %t.obj /debug /timestamp:0 /entry:main /nodefaultlib /out:%t.7.exe
+# Test timestamps corresponding to UINT32_TMAX
+RUN: lld-link %t.obj /debug /timestamp:4294967295 /entry:main /nodefaultlib /out:%t.8.exe
+RUN: env SOURCE_DATE_EPOCH=4294967295 lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.9.exe
+# Test that setting UINT32_MAX+1 as timestamp fails.
+RUN: env LLD_IN_TEST=1 not lld-link %t.obj /debug /timestamp:4294967296 /entry:main /nodefaultlib /out:%t.10.exe 2>&1 | FileCheck %s --check-prefix=ERROR
+RUN: env SOURCE_DATE_EPOCH=4294967296 env LLD_IN_TEST=1 not lld-link %t.obj /debug /entry:main /nodefaultlib /out:%t.11.exe 2>&1 | FileCheck %s --check-prefix=ERROR2
RUN: llvm-readobj --file-headers --coff-debug-directory %t.1.exe | FileCheck %s --check-prefix=HASH
RUN: llvm-readobj --file-headers --coff-debug-directory %t.2.exe | FileCheck %s --check-prefix=HASH
RUN: llvm-readobj --file-headers --coff-debug-directory %t.3.exe | FileCheck %s --check-prefix=ZERO
+RUN: llvm-readobj --file-headers --coff-debug-directory %t.4.exe | FileCheck %s --check-prefix=ZERO
+RUN: llvm-readobj --file-headers --coff-debug-directory %t.5.exe | FileCheck %s --check-prefix=LARGE
+RUN: llvm-readobj --file-headers --coff-debug-directory %t.6.exe | FileCheck %s --check-prefix=LARGE
+RUN: llvm-readobj --file-headers --coff-debug-directory %t.7.exe | FileCheck %s --check-prefix=ZERO
+
+# Not inspecting %t.8.exe and %t.9.exe; llvm-readobj with a 32 bit time_t fails to print dates
+# past INT32_MAX correctly.
HASH: ImageFileHeader {
HASH: TimeDateStamp: [[STAMP:.*]]
@@ -16,3 +35,11 @@ ZERO: ImageFileHeader {
ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0)
ZERO: DebugDirectory [
ZERO: TimeDateStamp: 1970-01-01 00:00:00 (0x0)
+
+LARGE: ImageFileHeader {
+LARGE: TimeDateStamp: 2038-01-19 03:14:07 (0x7FFFFFFF)
+LARGE: DebugDirectory [
+LARGE: TimeDateStamp: 2038-01-19 03:14:07 (0x7FFFFFFF)
+
+ERROR: error: invalid timestamp: 4294967296. Expected 32-bit integer
+ERROR2: error: invalid SOURCE_DATE_EPOCH timestamp: 4294967296. Expected 32-bit integer
diff --git a/lld/test/ELF/Inputs/systemz-init.s b/lld/test/ELF/Inputs/systemz-init.s
new file mode 100644
index 000000000000..1611b69b4419
--- /dev/null
+++ b/lld/test/ELF/Inputs/systemz-init.s
@@ -0,0 +1,5 @@
+// glibc < 2.39 used to align .init and .fini code at a 4-byte boundary.
+// This file aims to recreate that behavior.
+ .section .init,"ax",@progbits
+ .align 4
+ lg %r4, 272(%r15)
diff --git a/lld/test/ELF/basic-systemz.s b/lld/test/ELF/basic-systemz.s
new file mode 100644
index 000000000000..f7bb0e8cbd02
--- /dev/null
+++ b/lld/test/ELF/basic-systemz.s
@@ -0,0 +1,63 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld --hash-style=sysv -discard-all -shared %t.o -o %t.so
+# RUN: llvm-readelf --file-header --program-headers --section-headers --dynamic-table %t.so | FileCheck %s
+
+# Exits with return code 55 on linux.
+.text
+ lghi 2,55
+ svc 1
+
+# CHECK: ELF Header:
+# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00
+# CHECK-NEXT: Class: ELF64
+# CHECK-NEXT: Data: 2's complement, big endian
+# CHECK-NEXT: Version: 1 (current)
+# CHECK-NEXT: OS/ABI: UNIX - System V
+# CHECK-NEXT: ABI Version: 0
+# CHECK-NEXT: Type: DYN (Shared object file)
+# CHECK-NEXT: Machine: IBM S/390
+# CHECK-NEXT: Version: 0x1
+# CHECK-NEXT: Entry point address: 0x0
+# CHECK-NEXT: Start of program headers: 64 (bytes into file)
+# CHECK-NEXT: Start of section headers: 768 (bytes into file)
+# CHECK-NEXT: Flags: 0x0
+# CHECK-NEXT: Size of this header: 64 (bytes)
+# CHECK-NEXT: Size of program headers: 56 (bytes)
+# CHECK-NEXT: Number of program headers: 7
+# CHECK-NEXT: Size of section headers: 64 (bytes)
+# CHECK-NEXT: Number of section headers: 11
+# CHECK-NEXT: Section header string table index: 9
+
+# CHECK: Section Headers:
+# CHECK-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0
+# CHECK-NEXT: [ 1] .dynsym DYNSYM 00000000000001c8 0001c8 000018 18 A 3 1 8
+# CHECK-NEXT: [ 2] .hash HASH 00000000000001e0 0001e0 000010 04 A 1 0 4
+# CHECK-NEXT: [ 3] .dynstr STRTAB 00000000000001f0 0001f0 000001 00 A 0 0 1
+# CHECK-NEXT: [ 4] .text PROGBITS 00000000000011f4 0001f4 000006 00 AX 0 0 4
+# CHECK-NEXT: [ 5] .dynamic DYNAMIC 0000000000002200 000200 000060 10 WA 3 0 8
+# CHECK-NEXT: [ 6] .relro_padding NOBITS 0000000000002260 000260 000da0 00 WA 0 0 1
+# CHECK-NEXT: [ 7] .comment PROGBITS 0000000000000000 000260 000008 01 MS 0 0 1
+# CHECK-NEXT: [ 8] .symtab SYMTAB 0000000000000000 000268 000030 18 10 2 8
+# CHECK-NEXT: [ 9] .shstrtab STRTAB 0000000000000000 000298 000058 00 0 0 1
+# CHECK-NEXT: [10] .strtab STRTAB 0000000000000000 0002f0 00000a 00 0 0 1
+
+# CHECK: Program Headers:
+# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# CHECK-NEXT: PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x000188 0x000188 R 0x8
+# CHECK-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001f1 0x0001f1 R 0x1000
+# CHECK-NEXT: LOAD 0x0001f4 0x00000000000011f4 0x00000000000011f4 0x000006 0x000006 R E 0x1000
+# CHECK-NEXT: LOAD 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000e00 RW 0x1000
+# CHECK-NEXT: DYNAMIC 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000060 RW 0x8
+# CHECK-NEXT: GNU_RELRO 0x000200 0x0000000000002200 0x0000000000002200 0x000060 0x000e00 R 0x1
+# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0x0
+
+# CHECK: Dynamic section at offset 0x200 contains 6 entries:
+# CHECK-NEXT: Tag Type Name/Value
+# CHECK-NEXT: 0x0000000000000006 (SYMTAB) 0x1c8
+# CHECK-NEXT: 0x000000000000000b (SYMENT) 24 (bytes)
+# CHECK-NEXT: 0x0000000000000005 (STRTAB) 0x1f0
+# CHECK-NEXT: 0x000000000000000a (STRSZ) 1 (bytes)
+# CHECK-NEXT: 0x0000000000000004 (HASH) 0x1e0
+# CHECK-NEXT: 0x0000000000000000 (NULL) 0x0
diff --git a/lld/test/ELF/emulation-systemz.s b/lld/test/ELF/emulation-systemz.s
new file mode 100644
index 000000000000..dfdb4620954c
--- /dev/null
+++ b/lld/test/ELF/emulation-systemz.s
@@ -0,0 +1,29 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld -m elf64_s390 %t.o -o %t1
+# RUN: llvm-readelf --file-header %t1 | FileCheck %s
+# RUN: ld.lld %t.o -o %t2
+# RUN: llvm-readelf --file-header %t2 | FileCheck %s
+# RUN: echo 'OUTPUT_FORMAT(elf64-s390)' > %t.script
+# RUN: ld.lld %t.script %t.o -o %t3
+# RUN: llvm-readelf --file-header %t3 | FileCheck %s
+
+# CHECK: ELF Header:
+# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00
+# CHECK-NEXT: Class: ELF64
+# CHECK-NEXT: Data: 2's complement, big endian
+# CHECK-NEXT: Version: 1 (current)
+# CHECK-NEXT: OS/ABI: UNIX - System V
+# CHECK-NEXT: ABI Version: 0
+# CHECK-NEXT: Type: EXEC (Executable file)
+# CHECK-NEXT: Machine: IBM S/390
+# CHECK-NEXT: Version: 0x1
+# CHECK-NEXT: Entry point address:
+# CHECK-NEXT: Start of program headers: 64 (bytes into file)
+# CHECK-NEXT: Start of section headers:
+# CHECK-NEXT: Flags: 0x0
+# CHECK-NEXT: Size of this header: 64 (bytes)
+# CHECK-NEXT: Size of program headers: 56 (bytes)
+
+.globl _start
+_start:
diff --git a/lld/test/ELF/lto/resolution-err.ll b/lld/test/ELF/lto/resolution-err.ll
index 6dfa64b1b8b9..f9855abaff32 100644
--- a/lld/test/ELF/lto/resolution-err.ll
+++ b/lld/test/ELF/lto/resolution-err.ll
@@ -1,5 +1,5 @@
; UNSUPPORTED: system-windows
-; REQUIRES: shell
+; REQUIRES: shell, non-root-user
; RUN: llvm-as %s -o %t.bc
; RUN: touch %t.resolution.txt
; RUN: chmod u-w %t.resolution.txt
diff --git a/lld/test/ELF/lto/systemz.ll b/lld/test/ELF/lto/systemz.ll
new file mode 100644
index 000000000000..42bf4e32fb6d
--- /dev/null
+++ b/lld/test/ELF/lto/systemz.ll
@@ -0,0 +1,18 @@
+; REQUIRES: systemz
+;; Test we can infer the e_machine value EM_S390 from a bitcode file.
+
+; RUN: llvm-as %s -o %t.o
+; RUN: ld.lld %t.o -o %t
+; RUN: llvm-readobj -h %t | FileCheck %s
+
+; CHECK: Class: 64-bit
+; CHECK: DataEncoding: BigEndian
+; CHECK: Machine: EM_S390
+
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+target triple = "s390x-unknown-linux-gnu"
+
+define void @_start() {
+entry:
+ ret void
+}
diff --git a/lld/test/ELF/lto/thinlto-cant-write-index.ll b/lld/test/ELF/lto/thinlto-cant-write-index.ll
index e664acbb17de..286fcddd4238 100644
--- a/lld/test/ELF/lto/thinlto-cant-write-index.ll
+++ b/lld/test/ELF/lto/thinlto-cant-write-index.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86, non-root-user
; Basic ThinLTO tests.
; RUN: opt -module-summary %s -o %t1.o
diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll
index 6d0e1e65047d..253ec08619c9 100644
--- a/lld/test/ELF/lto/thinlto-emit-imports.ll
+++ b/lld/test/ELF/lto/thinlto-emit-imports.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86, non-root-user
;; Test a few properties not tested by thinlto-index-only.ll
; RUN: opt -module-summary %s -o %t1.o
diff --git a/lld/test/ELF/riscv-section-layout.s b/lld/test/ELF/riscv-section-layout.s
index 56ac95d01fdf..10e0febf7c42 100644
--- a/lld/test/ELF/riscv-section-layout.s
+++ b/lld/test/ELF/riscv-section-layout.s
@@ -3,20 +3,20 @@
# RUN: llvm-mc -filetype=obj -triple=riscv32 %s -o %t.32.o
# RUN: ld.lld -pie %t.32.o -o %t.32
-# RUN: llvm-readelf -S -s %t.32 | FileCheck %s --check-prefix=NOSDATA
+# RUN: llvm-readelf -S -sX %t.32 | FileCheck %s --check-prefix=NOSDATA
# RUN: llvm-mc -filetype=obj -triple=riscv32 --defsym=SDATA=1 %s -o %t.32s.o
# RUN: ld.lld -pie %t.32s.o -o %t.32s
-# RUN: llvm-readelf -S -s %t.32s | FileCheck %s
+# RUN: llvm-readelf -S -sX %t.32s | FileCheck %s
# RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t.64.o
# RUN: ld.lld -pie %t.64.o -o %t.64
-# RUN: llvm-readelf -S -s %t.64 | FileCheck %s --check-prefix=NOSDATA
+# RUN: llvm-readelf -S -sX %t.64 | FileCheck %s --check-prefix=NOSDATA
# RUN: llvm-mc -filetype=obj -triple=riscv64 --defsym=SDATA=1 %s -o %t.64s.o
# RUN: ld.lld -pie %t.64s.o -o %t.64s
-# RUN: llvm-readelf -S -s %t.64s | FileCheck %s
+# RUN: llvm-readelf -S -sX %t.64s | FileCheck %s
# NOSDATA: .text
-# NOSDATA-NEXT: .tdata
+# NOSDATA-NEXT: .tdata PROGBITS [[#%x,TDATA:]]
# NOSDATA-NEXT: .tbss
# NOSDATA-NEXT: .dynamic
# NOSDATA-NEXT: .got
@@ -28,9 +28,10 @@
## exist, define __global_pointer$ and set its st_shndx arbitrarily to 1.
## The symbol value should not be used by the program.
-# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] _edata
-# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] __bss_start
-# NOSDATA-DAG: [[#]]: {{0*}}800 0 NOTYPE GLOBAL DEFAULT 1 __global_pointer$
+# NOSDATA-DAG: [[#]]: {{.*}} 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext
+# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.data) _edata
+# NOSDATA-DAG: [[#]]: {{0*}}[[#BSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.bss) __bss_start
+# NOSDATA-DAG: [[#]]: {{0*}}800 0 NOTYPE GLOBAL DEFAULT 1 (.dynsym) __global_pointer$
# CHECK: .text
# CHECK-NEXT: .tdata
@@ -43,11 +44,11 @@
# CHECK-NEXT: .sbss NOBITS [[#%x,SBSS:]]
# CHECK-NEXT: .bss
-# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] _edata
-# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] __bss_start
-# CHECK-DAG: [[#]]: {{0*}}[[#SDATA+0x800]] 0 NOTYPE GLOBAL DEFAULT [[#]] __global_pointer$
+# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sdata) _edata
+# CHECK-DAG: [[#]]: {{0*}}[[#SBSS]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sbss) __bss_start
+# CHECK-DAG: [[#]]: {{0*}}[[#SDATA+0x800]] 0 NOTYPE GLOBAL DEFAULT [[#]] (.sdata) __global_pointer$
-.globl _edata, __bss_start
+.globl _etext, _edata, __bss_start
lla gp, __global_pointer$
.section .data,"aw",@progbits; .long _GLOBAL_OFFSET_TABLE_ - .
diff --git a/lld/test/ELF/segments.s b/lld/test/ELF/segments.s
index d9af9a3ae5e7..1fe248afa884 100644
--- a/lld/test/ELF/segments.s
+++ b/lld/test/ELF/segments.s
@@ -1,157 +1,93 @@
# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
-
-# RUN: ld.lld %t -o %t1
-# RUN: llvm-readobj --program-headers %t1 | FileCheck --check-prefix=ROSEGMENT %s
-# RUN: ld.lld --no-rosegment --rosegment %t -o - | cmp - %t1
-# RUN: ld.lld --omagic --no-omagic %t -o - | cmp - %t1
-
-# ROSEGMENT: ProgramHeader {
-# ROSEGMENT: Type: PT_LOAD
-# ROSEGMENT-NEXT: Offset: 0x0
-# ROSEGMENT-NEXT: VirtualAddress:
-# ROSEGMENT-NEXT: PhysicalAddress:
-# ROSEGMENT-NEXT: FileSize:
-# ROSEGMENT-NEXT: MemSize:
-# ROSEGMENT-NEXT: Flags [
-# ROSEGMENT-NEXT: PF_R
-# ROSEGMENT-NEXT: ]
-# ROSEGMENT-NEXT: Alignment: 4096
-# ROSEGMENT-NEXT: }
-# ROSEGMENT-NEXT: ProgramHeader {
-# ROSEGMENT-NEXT: Type: PT_LOAD
-# ROSEGMENT-NEXT: Offset: 0x15C
-# ROSEGMENT-NEXT: VirtualAddress:
-# ROSEGMENT-NEXT: PhysicalAddress:
-# ROSEGMENT-NEXT: FileSize:
-# ROSEGMENT-NEXT: MemSize:
-# ROSEGMENT-NEXT: Flags [
-# ROSEGMENT-NEXT: PF_R
-# ROSEGMENT-NEXT: PF_X
-# ROSEGMENT-NEXT: ]
-# ROSEGMENT-NEXT: Alignment: 4096
-# ROSEGMENT-NEXT: }
-# ROSEGMENT-NEXT: ProgramHeader {
-# ROSEGMENT-NEXT: Type: PT_LOAD
-# ROSEGMENT-NEXT: Offset: 0x15E
-# ROSEGMENT-NEXT: VirtualAddress:
-# ROSEGMENT-NEXT: PhysicalAddress:
-# ROSEGMENT-NEXT: FileSize: 1
-# ROSEGMENT-NEXT: MemSize: 1
-# ROSEGMENT-NEXT: Flags [
-# ROSEGMENT-NEXT: PF_R
-# ROSEGMENT-NEXT: PF_W
-# ROSEGMENT-NEXT: ]
-# ROSEGMENT-NEXT: Alignment: 4096
-# ROSEGMENT-NEXT: }
-
-# RUN: ld.lld --no-rosegment %t -o %t2
-# RUN: llvm-readobj --program-headers %t2 | FileCheck --check-prefix=NOROSEGMENT %s
-
-# NOROSEGMENT: ProgramHeader {
-# NOROSEGMENT: Type: PT_LOAD
-# NOROSEGMENT-NEXT: Offset: 0x0
-# NOROSEGMENT-NEXT: VirtualAddress:
-# NOROSEGMENT-NEXT: PhysicalAddress:
-# NOROSEGMENT-NEXT: FileSize:
-# NOROSEGMENT-NEXT: MemSize:
-# NOROSEGMENT-NEXT: Flags [
-# NOROSEGMENT-NEXT: PF_R
-# NOROSEGMENT-NEXT: PF_X
-# NOROSEGMENT-NEXT: ]
-# NOROSEGMENT-NEXT: Alignment: 4096
-# NOROSEGMENT-NEXT: }
-# NOROSEGMENT-NEXT: ProgramHeader {
-# NOROSEGMENT-NEXT: Type: PT_LOAD
-# NOROSEGMENT-NEXT: Offset: 0x126
-# NOROSEGMENT-NEXT: VirtualAddress:
-# NOROSEGMENT-NEXT: PhysicalAddress:
-# NOROSEGMENT-NEXT: FileSize:
-# NOROSEGMENT-NEXT: MemSize:
-# NOROSEGMENT-NEXT: Flags [
-# NOROSEGMENT-NEXT: PF_R
-# NOROSEGMENT-NEXT: PF_W
-# NOROSEGMENT-NEXT: ]
-# NOROSEGMENT-NEXT: Alignment: 4096
-# NOROSEGMENT-NEXT: }
-# NOROSEGMENT-NEXT: ProgramHeader {
-# NOROSEGMENT-NEXT: Type: PT_GNU_STACK
-
-# RUN: ld.lld -N %t -o %t3
-# RUN: llvm-readobj --program-headers %t3 | FileCheck --check-prefix=OMAGIC %s
-# RUN: ld.lld --omagic %t -o %t3
-# RUN: llvm-readobj --program-headers %t3 | FileCheck --check-prefix=OMAGIC %s
-
-# OMAGIC: ProgramHeader {
-# OMAGIC: Type: PT_LOAD
-# OMAGIC-NEXT: Offset: 0xB0
-# OMAGIC-NEXT: VirtualAddress:
-# OMAGIC-NEXT: PhysicalAddress:
-# OMAGIC-NEXT: FileSize:
-# OMAGIC-NEXT: MemSize:
-# OMAGIC-NEXT: Flags [
-# OMAGIC-NEXT: PF_R
-# OMAGIC-NEXT: PF_W
-# OMAGIC-NEXT: PF_X
-# OMAGIC-NEXT: ]
-# OMAGIC-NEXT: Alignment: 4
-# OMAGIC-NEXT: }
-# OMAGIC-NEXT: ProgramHeader {
-# OMAGIC-NEXT: Type: PT_GNU_STACK
-
-# RUN: ld.lld -n %t -o %t4
-# RUN: llvm-readobj --program-headers %t4 | FileCheck --check-prefix=NMAGIC %s
-# RUN: ld.lld --nmagic %t -o %t4
-# RUN: llvm-readobj --program-headers %t4 | FileCheck --check-prefix=NMAGIC %s
-
-# NMAGIC: ProgramHeader {
-# NMAGIC-NEXT: Type: PT_LOAD
-# NMAGIC-NEXT: Offset: 0x120
-# NMAGIC-NEXT: VirtualAddress:
-# NMAGIC-NEXT: PhysicalAddress:
-# NMAGIC-NEXT: FileSize: 1
-# NMAGIC-NEXT: MemSize: 1
-# NMAGIC-NEXT: Flags [
-# NMAGIC-NEXT: PF_R
-# NMAGIC-NEXT: ]
-# NMAGIC-NEXT: Alignment: 1
-# NMAGIC-NEXT: }
-# NMAGIC-NEXT: ProgramHeader {
-# NMAGIC-NEXT: Type: PT_LOAD
-# NMAGIC-NEXT: Offset: 0x124
-# NMAGIC-NEXT: VirtualAddress:
-# NMAGIC-NEXT: PhysicalAddress:
-# NMAGIC-NEXT: FileSize: 2
-# NMAGIC-NEXT: MemSize: 2
-# NMAGIC-NEXT: Flags [
-# NMAGIC-NEXT: PF_R
-# NMAGIC-NEXT: PF_X
-# NMAGIC-NEXT: ]
-# NMAGIC-NEXT: Alignment: 4
-# NMAGIC-NEXT: }
-# NMAGIC-NEXT: ProgramHeader {
-# NMAGIC-NEXT: Type: PT_LOAD (0x1)
-# NMAGIC-NEXT: Offset: 0x126
-# NMAGIC-NEXT: VirtualAddress:
-# NMAGIC-NEXT: PhysicalAddress:
-# NMAGIC-NEXT: FileSize: 1
-# NMAGIC-NEXT: MemSize: 1
-# NMAGIC-NEXT: Flags [
-# NMAGIC-NEXT: PF_R
-# NMAGIC-NEXT: PF_W
-# NMAGIC-NEXT: ]
-# NMAGIC-NEXT: Alignment: 1
-# NMAGIC-NEXT: }
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+# RUN: ld.lld a.o -o a
+# RUN: llvm-readelf -l a | FileCheck --check-prefix=ROSEGMENT %s
+# RUN: ld.lld --no-rosegment --rosegment a.o -o - | cmp - a
+# RUN: ld.lld --omagic --no-omagic a.o -o - | cmp - a
+
+# ROSEGMENT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# ROSEGMENT-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 0x000118 0x000118 R 0x8
+# ROSEGMENT-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x00015b 0x00015b R 0x1000
+# ROSEGMENT-NEXT: LOAD 0x00015c 0x000000000020115c 0x000000000020115c 0x000003 0x000003 R E 0x1000
+# ROSEGMENT-NEXT: LOAD 0x00015f 0x000000000020215f 0x000000000020215f 0x000002 0x000002 RW 0x1000
+# ROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
+# RUN: ld.lld --rosegment a.o -T a.lds -o ro1
+# RUN: llvm-readelf -l ro1 | FileCheck --check-prefix=ROSEGMENT1 %s
+
+# ROSEGMENT1: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# ROSEGMENT1-NEXT: LOAD 0x001000 0x0000000000000000 0x0000000000000000 0x000001 0x000001 R 0x1000
+# ROSEGMENT1-NEXT: LOAD 0x001004 0x0000000000000004 0x0000000000000004 0x000002 0x000002 R E 0x1000
+# ROSEGMENT1-NEXT: LOAD 0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW 0x1000
+# ROSEGMENT1-NEXT: LOAD 0x001007 0x0000000000000007 0x0000000000000007 0x000001 0x000001 R E 0x1000
+# ROSEGMENT1-NEXT: LOAD 0x001008 0x0000000000000008 0x0000000000000008 0x000001 0x000001 R 0x1000
+# ROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000
+# ROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R 0x1000
+# ROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
+# RUN: ld.lld --no-rosegment a.o -o noro
+# RUN: llvm-readelf -l noro | FileCheck --check-prefix=NOROSEGMENT %s
+
+# NOROSEGMENT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# NOROSEGMENT-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 0x0000e0 0x0000e0 R 0x8
+# NOROSEGMENT-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000127 0x000127 R E 0x1000
+# NOROSEGMENT-NEXT: LOAD 0x000127 0x0000000000201127 0x0000000000201127 0x000002 0x000002 RW 0x1000
+# NOROSEGMENT-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
+# RUN: ld.lld --no-rosegment a.o -T a.lds -o noro1
+# RUN: llvm-readelf -l noro1 | FileCheck --check-prefix=NOROSEGMENT1 %s
+
+# NOROSEGMENT1: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# NOROSEGMENT1-NEXT: LOAD 0x001000 0x0000000000000000 0x0000000000000000 0x000006 0x000006 R E 0x1000
+# NOROSEGMENT1-NEXT: LOAD 0x001006 0x0000000000000006 0x0000000000000006 0x000001 0x000001 RW 0x1000
+# NOROSEGMENT1-NEXT: LOAD 0x001007 0x0000000000000007 0x0000000000000007 0x000002 0x000002 R E 0x1000
+# NOROSEGMENT1-NEXT: LOAD 0x001009 0x0000000000000009 0x0000000000000009 0x000001 0x000001 RW 0x1000
+# NOROSEGMENT1-NEXT: LOAD 0x00100a 0x000000000000000a 0x000000000000000a 0x000001 0x000001 R 0x1000
+# NOROSEGMENT1-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
+# RUN: ld.lld -N a.o -o omagic
+# RUN: llvm-readelf -l omagic | FileCheck --check-prefix=OMAGIC %s
+# RUN: ld.lld --omagic a.o -o - | cmp - omagic
+
+# OMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# OMAGIC-NEXT: LOAD 0x0000b0 0x00000000002000b0 0x00000000002000b0 0x000009 0x000009 RWE 0x4
+# OMAGIC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
+# RUN: ld.lld -n a.o -o nmagic
+# RUN: llvm-readelf -l nmagic | FileCheck --check-prefix=NMAGIC %s
+# RUN: ld.lld --nmagic a.o -o - | cmp nmagic -
+
+# NMAGIC: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# NMAGIC-NEXT: LOAD 0x000120 0x0000000000200120 0x0000000000200120 0x000003 0x000003 R 0x1
+# NMAGIC-NEXT: LOAD 0x000124 0x0000000000200124 0x0000000000200124 0x000003 0x000003 R E 0x4
+# NMAGIC-NEXT: LOAD 0x000127 0x0000000000200127 0x0000000000200127 0x000002 0x000002 RW 0x1
+# NMAGIC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+
+#--- a.s
.global _start
_start:
nop
-.section .ro,"a"
-nop
+.section .ro1,"a"; .byte 1
+.section .rw1,"aw"; .byte 3
+.section .rx1,"ax"; .byte 2
+
+.section .ro2,"a"; .byte 1
+.section .rw2,"aw"; .byte 3
+.section .rx2,"ax"; .byte 2
-.section .rw,"aw"
-nop
+.section .ro3,"a"; .byte 1
-.section .rx,"ax"
-nop
+#--- a.lds
+SECTIONS {
+ .ro1 : {}
+ .text : {}
+ .rx1 : {}
+ .rw1 : {}
+ .rx2 : {}
+ .ro2 : {}
+ .rw2 : {}
+ .ro3 : {}
+}
diff --git a/lld/test/ELF/systemz-got.s b/lld/test/ELF/systemz-got.s
new file mode 100644
index 000000000000..1d558aa3b029
--- /dev/null
+++ b/lld/test/ELF/systemz-got.s
@@ -0,0 +1,16 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %p/Inputs/shared.s -o %t2.o
+# RUN: ld.lld -shared %t2.o -soname=%t2.so -o %t2.so
+
+# RUN: ld.lld -dynamic-linker /lib/ld64.so.1 %t.o %t2.so -o %t
+# RUN: llvm-readelf -S -r %t | FileCheck %s
+
+# CHECK: .got PROGBITS {{.*}} {{.*}} 000020 00 WA 0 0 8
+
+# CHECK: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries:
+# CHECK: {{.*}} 000000010000000a R_390_GLOB_DAT 0000000000000000 bar + 0
+
+.global _start
+_start:
+ lgrl %r1,bar@GOT
diff --git a/lld/test/ELF/systemz-gotent-relax-align.s b/lld/test/ELF/systemz-gotent-relax-align.s
new file mode 100644
index 000000000000..c6326086f56d
--- /dev/null
+++ b/lld/test/ELF/systemz-gotent-relax-align.s
@@ -0,0 +1,48 @@
+# REQUIRES: systemz
+## Verify that R_390_GOTENT optimization is not performed on misaligned symbols.
+
+# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t1
+# RUN: llvm-readelf -S -r -x .got -x .got.plt %t1 | FileCheck --check-prefixes=CHECK %s
+# RUN: llvm-objdump --no-print-imm-hex -d %t1 | FileCheck --check-prefix=DISASM %s
+
+## We retain one .got entry for the unaligned symbol.
+# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK: .got PROGBITS 00000000010021e0 0001e0 000020 00 WA 0 0 8
+# CHECK-NEXT: .relro_padding NOBITS 0000000001002200 000200 000e00 00 WA 0 0 1
+# CHECK-NEXT: .data PROGBITS 0000000001003200 000200 000006 00 WA 0 0 2
+
+# CHECK-LABEL: Hex dump of section '.got':
+# CHECK-NEXT: 0x010021e0 00000000 00000000 00000000 00000000
+# CHECK-NEXT: 0x010021f0 00000000 00000000 00000000 01003205
+
+# DISASM: Disassembly of section .text:
+# DISASM: <_start>:
+# DISASM-NEXT: larl %r1, 0x1003200
+# DISASM-NEXT: larl %r1, 0x1003200
+# DISASM-NEXT: lgrl %r1, 0x10021f8
+# DISASM-NEXT: lgrl %r1, 0x10021f8
+
+.data
+.globl var_align
+.hidden var_align
+ .align 2
+var_align:
+ .long 0
+
+.data
+.globl var_unalign
+.hidden var_unalign
+ .align 2
+ .byte 0
+var_unalign:
+ .byte 0
+
+.text
+.globl _start
+.type _start, @function
+_start:
+ lgrl %r1, var_align@GOT
+ lgrl %r1, var_align@GOT
+ lgrl %r1, var_unalign@GOT
+ lgrl %r1, var_unalign@GOT
diff --git a/lld/test/ELF/systemz-gotent-relax-und-dso.s b/lld/test/ELF/systemz-gotent-relax-und-dso.s
new file mode 100644
index 000000000000..57369a417fd4
--- /dev/null
+++ b/lld/test/ELF/systemz-gotent-relax-und-dso.s
@@ -0,0 +1,68 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o
+# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %S/Inputs/gotpc-relax-und-dso.s -o %tdso.o
+# RUN: ld.lld -shared %tdso.o -soname=t.so -o %t.so
+# RUN: ld.lld --hash-style=sysv -shared %t.o %t.so -o %t
+# RUN: llvm-readelf -r %t | FileCheck --check-prefix=RELOC %s
+# RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck --check-prefix=DISASM %s
+
+# RELOC-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 3 entries:
+# RELOC: 00000000000023f8 000000010000000a R_390_GLOB_DAT 00000000000012d8 foo + 0
+# RELOC: 0000000000002400 000000030000000a R_390_GLOB_DAT 0000000000000000 und + 0
+# RELOC: 0000000000002408 000000040000000a R_390_GLOB_DAT 0000000000000000 dsofoo + 0
+
+# DISASM: Disassembly of section .text:
+# DISASM-EMPTY:
+# DISASM-NEXT: <foo>:
+# DISASM-NEXT: bc 0, 0
+# DISASM: <hid>:
+# DISASM-NEXT: bc 0, 0
+# DISASM: <_start>:
+# DISASM-NEXT: lgrl %r1, 0x2400
+# DISASM-NEXT: lgrl %r1, 0x2400
+# DISASM-NEXT: lgrl %r1, 0x2408
+# DISASM-NEXT: lgrl %r1, 0x2408
+# DISASM-NEXT: larl %r1, 0x12dc
+# DISASM-NEXT: larl %r1, 0x12dc
+# DISASM-NEXT: lgrl %r1, 0x23f8
+# DISASM-NEXT: lgrl %r1, 0x23f8
+# DISASM-NEXT: lgrl %r1, 0x2400
+# DISASM-NEXT: lgrl %r1, 0x2400
+# DISASM-NEXT: lgrl %r1, 0x2408
+# DISASM-NEXT: lgrl %r1, 0x2408
+# DISASM-NEXT: larl %r1, 0x12dc
+# DISASM-NEXT: larl %r1, 0x12dc
+# DISASM-NEXT: lgrl %r1, 0x23f8
+# DISASM-NEXT: lgrl %r1, 0x23f8
+
+.text
+.globl foo
+.type foo, @function
+foo:
+ nop
+
+.globl hid
+.hidden hid
+.type hid, @function
+hid:
+ nop
+
+.globl _start
+.type _start, @function
+_start:
+ lgrl %r1, und@GOT
+ lgrl %r1, und@GOT
+ lgrl %r1, dsofoo@GOT
+ lgrl %r1, dsofoo@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, foo@GOT
+ lgrl %r1, foo@GOT
+ lgrl %r1, und@GOT
+ lgrl %r1, und@GOT
+ lgrl %r1, dsofoo@GOT
+ lgrl %r1, dsofoo@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, foo@GOT
+ lgrl %r1, foo@GOT
diff --git a/lld/test/ELF/systemz-gotent-relax.s b/lld/test/ELF/systemz-gotent-relax.s
new file mode 100644
index 000000000000..f665e1af9e53
--- /dev/null
+++ b/lld/test/ELF/systemz-gotent-relax.s
@@ -0,0 +1,91 @@
+# REQUIRES: systemz
+## Test R_390_GOTENT optimization.
+
+# RUN: llvm-mc -filetype=obj -relax-relocations -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t1 --no-apply-dynamic-relocs
+# RUN: llvm-readelf -S -r -x .got.plt %t1 | FileCheck --check-prefixes=CHECK,NOAPPLY %s
+# RUN: ld.lld %t.o -o %t1 --apply-dynamic-relocs
+# RUN: llvm-readelf -S -r -x .got.plt %t1 | FileCheck --check-prefixes=CHECK,APPLY %s
+# RUN: ld.lld %t.o -o %t1
+# RUN: llvm-objdump --no-print-imm-hex -d %t1 | FileCheck --check-prefix=DISASM %s
+
+## --no-relax disables GOT optimization.
+# RUN: ld.lld --no-relax %t.o -o %t2
+# RUN: llvm-objdump --no-print-imm-hex -d %t2 | FileCheck --check-prefix=NORELAX %s
+
+## In our implementation, .got is retained even if all GOT-generating relocations are optimized.
+# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK: .iplt PROGBITS 0000000001001240 000240 000020 00 AX 0 0 16
+# CHECK-NEXT: .got PROGBITS 0000000001002260 000260 000018 00 WA 0 0 8
+# CHECK-NEXT: .relro_padding NOBITS 0000000001002278 000278 000d88 00 WA 0 0 1
+# CHECK-NEXT: .got.plt PROGBITS 0000000001003278 000278 000008 00 WA 0 0 8
+
+## There is one R_S390_IRELATIVE relocation.
+# CHECK-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries:
+# CHECK: 0000000001003278 000000000000003d R_390_IRELATIVE 10011e8
+
+# CHECK-LABEL: Hex dump of section '.got.plt':
+# NOAPPLY-NEXT: 0x01003278 00000000 00000000
+# APPLY-NEXT: 0x01003278 00000000 010011e8
+
+# DISASM: Disassembly of section .text:
+# DISASM: 00000000010011e0 <foo>:
+# DISASM-NEXT: bc 0, 0
+# DISASM: 00000000010011e4 <hid>:
+# DISASM-NEXT: bc 0, 0
+# DISASM: 00000000010011e8 <ifunc>:
+# DISASM-NEXT: br %r14
+# DISASM: 00000000010011ea <_start>:
+# DISASM-NEXT: larl %r1, 0x10011e0
+# DISASM-NEXT: larl %r1, 0x10011e0
+# DISASM-NEXT: larl %r1, 0x10011e4
+# DISASM-NEXT: larl %r1, 0x10011e4
+# DISASM-NEXT: lgrl %r1, 0x1003278
+# DISASM-NEXT: lgrl %r1, 0x1003278
+# DISASM-NEXT: larl %r1, 0x10011e0
+# DISASM-NEXT: larl %r1, 0x10011e0
+# DISASM-NEXT: larl %r1, 0x10011e4
+# DISASM-NEXT: larl %r1, 0x10011e4
+# DISASM-NEXT: lgrl %r1, 0x1003278
+# DISASM-NEXT: lgrl %r1, 0x1003278
+
+# NORELAX-LABEL: <_start>:
+# NORELAX-COUNT-12: lgrl
+
+.text
+.globl foo
+
+.text
+.globl foo
+.type foo, @function
+foo:
+ nop
+
+.globl hid
+.hidden hid
+.type hid, @function
+hid:
+ nop
+
+.text
+.type ifunc STT_GNU_IFUNC
+.globl ifunc
+.type ifunc, @function
+ifunc:
+ br %r14
+
+.globl _start
+.type _start, @function
+_start:
+ lgrl %r1, foo@GOT
+ lgrl %r1, foo@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, ifunc@GOT
+ lgrl %r1, ifunc@GOT
+ lgrl %r1, foo@GOT
+ lgrl %r1, foo@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, hid@GOT
+ lgrl %r1, ifunc@GOT
+ lgrl %r1, ifunc@GOT
diff --git a/lld/test/ELF/systemz-ifunc-nonpreemptible.s b/lld/test/ELF/systemz-ifunc-nonpreemptible.s
new file mode 100644
index 000000000000..5056db302ca1
--- /dev/null
+++ b/lld/test/ELF/systemz-ifunc-nonpreemptible.s
@@ -0,0 +1,75 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-none-linux-gnu %s -o %t.o
+# RUN: ld.lld -static %t.o -o %t
+# RUN: ld.lld -static %t.o -o %t.apply --apply-dynamic-relocs
+# RUN: llvm-readelf --section-headers --relocations --symbols %t | FileCheck %s
+# RUN: llvm-readelf -x .got.plt %t | FileCheck %s --check-prefix=NO-APPLY-RELOC
+# RUN: llvm-readelf -x .got.plt %t.apply | FileCheck %s --check-prefix=APPLY-RELOC
+# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t | FileCheck %s --check-prefix=DISASM
+
+# CHECK: Section Headers:
+# CHECK-NEXT: [Nr] Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 0 0
+# CHECK-NEXT: [ 1] .rela.dyn RELA 0000000001000158 000158 000030 18 AI 0 4 8
+# CHECK-NEXT: [ 2] .text PROGBITS 0000000001001188 000188 00001c 00 AX 0 0 4
+# CHECK-NEXT: [ 3] .iplt PROGBITS 00000000010011b0 0001b0 000040 00 AX 0 0 16
+# CHECK-NEXT: [ 4] .got.plt PROGBITS 00000000010021f0 0001f0 000010 00 WA 0 0 8
+
+# CHECK: Relocation section '.rela.dyn' at offset 0x158 contains 2 entries:
+# CHECK-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
+# CHECK-NEXT: 00000000010021f0 000000000000003d R_390_IRELATIVE 1001188
+# CHECK-NEXT: 00000000010021f8 000000000000003d R_390_IRELATIVE 100118a
+
+# CHECK: Symbol table '.symtab' contains 6 entries:
+# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name
+# CHECK-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
+# CHECK-NEXT: 1: 0000000001000158 0 NOTYPE LOCAL HIDDEN 1 __rela_iplt_start
+# CHECK-NEXT: 2: 0000000001000188 0 NOTYPE LOCAL HIDDEN 1 __rela_iplt_end
+# CHECK-NEXT: 3: 0000000001001188 0 IFUNC GLOBAL DEFAULT 2 foo
+# CHECK-NEXT: 4: 000000000100118a 0 IFUNC GLOBAL DEFAULT 2 bar
+# CHECK-NEXT: 5: 000000000100118c 0 NOTYPE GLOBAL DEFAULT 2 _start
+
+# NO-APPLY-RELOC-LABEL: Hex dump of section '.got.plt':
+# NO-APPLY-RELOC-NEXT: 0x010021f0 00000000 00000000 00000000 00000000
+# NO-APPLY-RELOC-EMPTY:
+
+# APPLY-RELOC-LABEL: Hex dump of section '.got.plt':
+# APPLY-RELOC-NEXT: 0x010021f0 00000000 01001188 00000000 0100118a
+# APPLY-RELOC-EMPTY:
+
+# DISASM: Disassembly of section .text:
+# DISASM: 0000000001001188 <foo>:
+# DISASM-NEXT: br %r14
+# DISASM: 000000000100118a <bar>:
+# DISASM-NEXT: br %r14
+# DISASM: 000000000100118c <_start>:
+# DISASM-NEXT: brasl %r14, 0x10011b0
+# DISASM-NEXT: brasl %r14, 0x10011d0
+# DISASM-NEXT: larl %r2, 0x1000158
+# DISASM-NEXT: larl %r2, 0x1000188
+# DISASM: Disassembly of section .iplt:
+# DISASM: <.iplt>:
+# DISASM: 10011b0: larl %r1, 0x10021f0
+# DISASM-NEXT: 10011b6: lg %r1, 0(%r1)
+# DISASM-NEXT: 10011bc: br %r1
+# DISASM: 10011d0: larl %r1, 0x10021f8
+# DISASM-NEXT: 10011d6: lg %r1, 0(%r1)
+# DISASM-NEXT: 10011dc: br %r1
+
+.text
+.type foo STT_GNU_IFUNC
+.globl foo
+foo:
+ br %r14
+
+.type bar STT_GNU_IFUNC
+.globl bar
+bar:
+ br %r14
+
+.globl _start
+_start:
+ brasl %r14, foo@plt
+ brasl %r14, bar@plt
+ larl %r2, __rela_iplt_start
+ larl %r2, __rela_iplt_end
diff --git a/lld/test/ELF/systemz-init-padding.s b/lld/test/ELF/systemz-init-padding.s
new file mode 100644
index 000000000000..c56b98d43f1b
--- /dev/null
+++ b/lld/test/ELF/systemz-init-padding.s
@@ -0,0 +1,27 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %p/Inputs/systemz-init.s -o systemz-init.o
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld -dynamic-linker /lib/ld64.so.1 %t.o systemz-init.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn -j .init %t | FileCheck %s
+
+# glibc < 2.39 used to align .init and .fini code at a 4-byte boundary.
+# When that happens, the linker must not pad the code with invalid
+# instructions, e.g. null bytes.
+ .section .init,"ax",@progbits
+ brasl %r14, startup
+
+# CHECK: <.init>:
+# CHECK-NEXT: brasl %r14,
+# CHECK-NEXT: bcr 0, %r7
+# CHECK-NEXT: lg %r4, 272(%r15)
+
+ .text
+ .globl startup
+ .p2align 4
+startup:
+ br %r14
+
+ .globl main
+ .p2align 4
+main:
+ br %r14
diff --git a/lld/test/ELF/systemz-pie.s b/lld/test/ELF/systemz-pie.s
new file mode 100644
index 000000000000..bb971a82fb8c
--- /dev/null
+++ b/lld/test/ELF/systemz-pie.s
@@ -0,0 +1,38 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t1.o
+
+## Check -pie.
+# RUN: ld.lld -pie %t1.o -o %t
+# RUN: llvm-readelf --file-headers --program-headers --dynamic %t | FileCheck %s
+
+# CHECK: ELF Header:
+# CHECK-NEXT: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00
+# CHECK-NEXT: Class: ELF64
+# CHECK-NEXT: Data: 2's complement, big endian
+# CHECK-NEXT: Version: 1 (current)
+# CHECK-NEXT: OS/ABI: UNIX - System V
+# CHECK-NEXT: ABI Version: 0
+# CHECK-NEXT: Type: DYN (Shared object file)
+# CHECK-NEXT: Machine: IBM S/390
+# CHECK-NEXT: Version: 0x1
+
+# CHECK: Program Headers:
+# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# CHECK-NEXT: PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x000188 0x000188 R 0x8
+# CHECK-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x00020d 0x00020d R 0x1000
+# CHECK-NEXT: LOAD 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000df0 RW 0x1000
+# CHECK-NEXT: DYNAMIC 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000090 RW 0x8
+# CHECK-NEXT: GNU_RELRO 0x000210 0x0000000000002210 0x0000000000002210 0x000090 0x000df0 R 0x1
+# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0x0
+
+# CHECK: Dynamic section at offset 0x210 contains 9 entries:
+# CHECK-NEXT: Tag Type Name/Value
+# CHECK-NEXT: 0x000000006ffffffb (FLAGS_1) PIE
+
+## Check -no-pie.
+# RUN: ld.lld -no-pie %t1.o -o %t2
+# RUN: llvm-readelf --file-headers %t2 | FileCheck %s --check-prefix=NOPIE
+# NOPIE-NOT: Type: DYN
+
+.globl _start
+_start:
diff --git a/lld/test/ELF/systemz-plt.s b/lld/test/ELF/systemz-plt.s
new file mode 100644
index 000000000000..4669f01f5881
--- /dev/null
+++ b/lld/test/ELF/systemz-plt.s
@@ -0,0 +1,83 @@
+# REQUIRES: systemz
+# RUN: echo '.globl bar, weak; .type bar,@function; .type weak,@function; bar: weak:' > %t1.s
+
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t1.s -o %t1.o
+# RUN: ld.lld -shared %t1.o -soname=t1.so -o %t1.so
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld %t.o %t1.so -z separate-code -o %t
+# RUN: llvm-readelf -S -s -r -x .got.plt %t | FileCheck %s
+# RUN: llvm-objdump -d %t | FileCheck --check-prefixes=DIS %s
+
+# CHECK: Section Headers:
+# CHECK: .plt PROGBITS 0000000001001020 001020 000060 00 AX 0 0 16
+# CHECK: .got PROGBITS 00000000010020d0 0020d0 000018 00 WA 0 0 8
+# CHECK: .got.plt PROGBITS 00000000010030e8 0020e8 000010 00 WA 0 0 8
+
+# CHECK: Relocation section '.rela.plt' at offset {{.*}} contains 2 entries:
+# CHECK: 00000000010030e8 000000010000000b R_390_JMP_SLOT 0000000000000000 bar + 0
+# CHECK: 00000000010030f0 000000020000000b R_390_JMP_SLOT 0000000000000000 weak + 0
+
+## A canonical PLT has a non-zero st_value. bar and weak are called but their
+## addresses are not taken, so a canonical PLT is not necessary.
+# CHECK: Symbol table '.dynsym' contains 3 entries:
+# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name
+# CHECK-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
+# CHECK-NEXT: 1: 0000000000000000 0 FUNC GLOBAL DEFAULT UND bar
+# CHECK-NEXT: 2: 0000000000000000 0 FUNC WEAK DEFAULT UND weak
+
+## The .got.plt slots relocated by .rela.plt point to .plt
+## This is required by glibc.
+# CHECK: Hex dump of section '.got.plt':
+# CHECK-NEXT: 0x010030e8 00000000 0100104e 00000000 0100106e
+
+# DIS: Disassembly of section .text:
+
+# DIS: 0000000001001000 <_start>:
+# DIS-NEXT: brasl %r14, 0x1001012
+# DIS-NEXT: brasl %r14, 0x1001040
+# DIS-NEXT: brasl %r14, 0x1001060
+
+# DIS: 0000000001001012 <foo>:
+# DIS-NEXT: br %r14
+
+# DIS: Disassembly of section .plt:
+
+# DIS: 0000000001001020 <.plt>:
+# DIS-NEXT: 1001020: e3 10 f0 38 00 24 stg %r1, 56(%r15)
+# DIS-NEXT: 1001026: c0 10 00 00 08 55 larl %r1, 0x10020d0
+# DIS-NEXT: 100102c: d2 07 f0 30 10 08 mvc 48(8,%r15), 8(%r1)
+# DIS-NEXT: 1001032: e3 10 10 10 00 04 lg %r1, 16(%r1)
+# DIS-NEXT: 1001038: 07 f1 br %r1
+# DIS-NEXT: 100103a: 07 00 bcr 0, %r0
+# DIS-NEXT: 100103c: 07 00 bcr 0, %r0
+# DIS-NEXT: 100103e: 07 00 bcr 0, %r0
+# DIS-NEXT: 1001040: c0 10 00 00 10 54 larl %r1, 0x10030e8
+# DIS-NEXT: 1001046: e3 10 10 00 00 04 lg %r1, 0(%r1)
+# DIS-NEXT: 100104c: 07 f1 br %r1
+# DIS-NEXT: 100104e: 0d 10 basr %r1, 0
+# DIS-NEXT: 1001050: e3 10 10 0c 00 14 lgf %r1, 12(%r1)
+# DIS-NEXT: 1001056: c0 f4 ff ff ff e5 jg 0x1001020
+# DIS-NEXT: 100105c: 00 00 <unknown>
+# DIS-NEXT: 100105e: 00 00 <unknown>
+# DIS-NEXT: 1001060: c0 10 00 00 10 48 larl %r1, 0x10030f0
+# DIS-NEXT: 1001066: e3 10 10 00 00 04 lg %r1, 0(%r1)
+# DIS-NEXT: 100106c: 07 f1 br %r1
+# DIS-NEXT: 100106e: 0d 10 basr %r1, 0
+# DIS-NEXT: 1001070: e3 10 10 0c 00 14 lgf %r1, 12(%r1)
+# DIS-NEXT: 1001076: c0 f4 ff ff ff d5 jg 0x1001020
+# DIS-NEXT: 100107c: 00 00 <unknown>
+# DIS-NEXT: 100107e: 00 18 <unknown>
+
+.global _start, foo, bar
+.weak weak
+
+_start:
+ ## Use @plt to avoid generating direct references that would force
+ ## allocation of a canonical PLT entry.
+ brasl %r14, foo@plt
+ brasl %r14, bar@plt
+ brasl %r14, weak@plt
+
+## foo is local and non-preemptable, no PLT is generated.
+foo:
+ br %r14
diff --git a/lld/test/ELF/systemz-reloc-abs.s b/lld/test/ELF/systemz-reloc-abs.s
new file mode 100644
index 000000000000..b5ad94d90d3a
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-abs.s
@@ -0,0 +1,32 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x %s -o %t.o
+# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs255.s -o %t255.o
+# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs256.s -o %t256.o
+# RUN: llvm-mc -filetype=obj -triple=s390x %S/Inputs/abs257.s -o %t257.o
+
+# RUN: ld.lld %t.o %t256.o -o %t
+# RUN: llvm-readelf -x .data %t | FileCheck %s
+# CHECK: 0x{{[0-9a-f]+}} ff80ffff 8000ffff ffff8000 0000ffff
+# CHECK-NEXT: ffffffff ffff8000 00000000 0000
+
+# RUN: not ld.lld %t.o %t255.o -o /dev/null 2>&1 | FileCheck --check-prefix=OVERFLOW1 %s
+# OVERFLOW1: relocation R_390_8 out of range: -129 is not in [-128, 255]
+# OVERFLOW1: relocation R_390_16 out of range: -32769 is not in [-32768, 65535]
+# OVERFLOW1: relocation R_390_32 out of range: -2147483649 is not in [-2147483648, 4294967295]
+
+# RUN: not ld.lld %t.o %t257.o -o /dev/null 2>&1 | FileCheck --check-prefix=OVERFLOW2 %s
+# OVERFLOW2: relocation R_390_8 out of range: 256 is not in [-128, 255]
+# OVERFLOW2: relocation R_390_16 out of range: 65536 is not in [-32768, 65535]
+# OVERFLOW2: relocation R_390_32 out of range: 4294967296 is not in [-2147483648, 4294967295]
+
+.globl _start
+_start:
+.data
+.byte foo - 1
+.byte foo - 384
+.word foo + 0xfeff
+.word foo - 0x8100
+.long foo + 0xfffffeff
+.long foo - 0x80000100
+.quad foo + 0xfffffffffffffeff
+.quad foo - 0x8000000000000100
diff --git a/lld/test/ELF/systemz-reloc-disp12.s b/lld/test/ELF/systemz-reloc-disp12.s
new file mode 100644
index 000000000000..3d32707d149f
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-disp12.s
@@ -0,0 +1,21 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=291 %s -o %t1.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=4095 %s -o %t2.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=4096 %s -o %t3.o
+
+# RUN: ld.lld --section-start=.text=0x0 %t1.o -o %t1out
+# RUN: ld.lld --section-start=.text=0x0 %t2.o -o %t2out
+# RUN: not ld.lld --section-start=.text=0x0 %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix RANGE
+
+# RANGE: relocation R_390_12 out of range: 4096 is not in [0, 4095]
+
+# RUN: llvm-readelf --hex-dump=.text %t1out | FileCheck %s -DINSN=58678123 --check-prefix DUMP
+# RUN: llvm-readelf --hex-dump=.text %t2out | FileCheck %s -DINSN=58678fff --check-prefix DUMP
+
+# DUMP: 0x00000000 [[INSN]]
+
+.text
+.globl _start
+_start:
+ .reloc .+2, R_390_12, DISP
+ l %r6, 0(%r7,%r8)
diff --git a/lld/test/ELF/systemz-reloc-disp20.s b/lld/test/ELF/systemz-reloc-disp20.s
new file mode 100644
index 000000000000..88cd657c6ae3
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-disp20.s
@@ -0,0 +1,21 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=74565 %s -o %t1.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=524287 %s -o %t2.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym DISP=524288 %s -o %t3.o
+
+# RUN: ld.lld --section-start=.text=0x0 %t1.o -o %t1out
+# RUN: ld.lld --section-start=.text=0x0 %t2.o -o %t2out
+# RUN: not ld.lld --section-start=.text=0x0 %t3.o -o /dev/null 2>&1 | FileCheck %s --check-prefix RANGE
+
+# RANGE: relocation R_390_20 out of range: 524288 is not in [-524288, 524287]
+
+# RUN: llvm-readelf --hex-dump=.text %t1out | FileCheck %s -DINSN="e3678345 1204" --check-prefix DUMP
+# RUN: llvm-readelf --hex-dump=.text %t2out | FileCheck %s -DINSN="e3678fff 7f04" --check-prefix DUMP
+
+# DUMP: 0x00000000 [[INSN]]
+
+.text
+.globl _start
+_start:
+ .reloc .+2, R_390_20, DISP
+ lg %r6, 0(%r7,%r8)
diff --git a/lld/test/ELF/systemz-reloc-got.s b/lld/test/ELF/systemz-reloc-got.s
new file mode 100644
index 000000000000..4b9ac16481f4
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-got.s
@@ -0,0 +1,92 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld -z norelro -shared %t.o -soname=t.so -o %t.so
+## Note: Without norelro the distance between .got and .got.plt causes
+## R_390_GOTPLT12 relocations to always overflow.
+
+# RUN: llvm-readelf -S -x .data %t.so | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck %s --check-prefix=DISASM
+
+# CHECK: Section Headers:
+# CHECK: .got PROGBITS 0000000000002458
+# CHECK: .got.plt PROGBITS 0000000000002480
+
+## Note: _GLOBAL_OFFSET_TABLE_ is at .got
+## GOT (foo) is at .got + 24 == 0x2470
+## GOT (bar) is at .got + 32 == 0x2478
+## GOTPLT (foo) is at .got.plt + 0 == .got + 40 == 0x2480
+## GOTPLT (bar) is at .got.plt + 8 == .got + 48 == 0x2488
+
+# DISASM: larl %r12, 0x2458
+# DISASM-NEXT: larl %r1, 0x2470
+# DISASM-NEXT: larl %r1, 0x2478
+# DISASM-NEXT: larl %r1, 0x2480
+# DISASM-NEXT: larl %r1, 0x2488
+
+# DISASM-NEXT: l %r1, 24(%r12)
+# DISASM-NEXT: l %r1, 32(%r12)
+# DISASM-NEXT: l %r1, 40(%r12)
+# DISASM-NEXT: l %r1, 48(%r12)
+# DISASM-NEXT: lg %r1, 24(%r12)
+# DISASM-NEXT: lg %r1, 32(%r12)
+# DISASM-NEXT: lg %r1, 40(%r12)
+# DISASM-NEXT: lg %r1, 48(%r12)
+
+# CHECK: Hex dump of section '.data':
+# CHECK-NEXT: 00180020 00280030 00000018 00000020
+# CHECK-NEXT: 00000028 00000030 00000000 00000018
+# CHECK-NEXT: 00000000 00000020 00000000 00000028
+# CHECK-NEXT: 00000000 00000030
+
+.text
+larl %r12, _GLOBAL_OFFSET_TABLE_
+.reloc .+2, R_390_GOTENT, foo+2
+larl %r1, 0
+.reloc .+2, R_390_GOTENT, bar+2
+larl %r1, 0
+.reloc .+2, R_390_GOTPLTENT, foo+2
+larl %r1, 0
+.reloc .+2, R_390_GOTPLTENT, bar+2
+larl %r1, 0
+.reloc .+2, R_390_GOT12, foo
+l %r1, 0(%r12)
+.reloc .+2, R_390_GOT12, bar
+l %r1, 0(%r12)
+.reloc .+2, R_390_GOTPLT12, foo
+l %r1, 0(%r12)
+.reloc .+2, R_390_GOTPLT12, bar
+l %r1, 0(%r12)
+.reloc .+2, R_390_GOT20, foo
+lg %r1, 0(%r12)
+.reloc .+2, R_390_GOT20, bar
+lg %r1, 0(%r12)
+.reloc .+2, R_390_GOTPLT20, foo
+lg %r1, 0(%r12)
+.reloc .+2, R_390_GOTPLT20, bar
+lg %r1, 0(%r12)
+
+.data
+.reloc ., R_390_GOT16, foo
+.space 2
+.reloc ., R_390_GOT16, bar
+.space 2
+.reloc ., R_390_GOTPLT16, foo
+.space 2
+.reloc ., R_390_GOTPLT16, bar
+.space 2
+.reloc ., R_390_GOT32, foo
+.space 4
+.reloc ., R_390_GOT32, bar
+.space 4
+.reloc ., R_390_GOTPLT32, foo
+.space 4
+.reloc ., R_390_GOTPLT32, bar
+.space 4
+.reloc ., R_390_GOT64, foo
+.space 8
+.reloc ., R_390_GOT64, bar
+.space 8
+.reloc ., R_390_GOTPLT64, foo
+.space 8
+.reloc ., R_390_GOTPLT64, bar
+.space 8
diff --git a/lld/test/ELF/systemz-reloc-gotrel.s b/lld/test/ELF/systemz-reloc-gotrel.s
new file mode 100644
index 000000000000..46669ecfa7fd
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-gotrel.s
@@ -0,0 +1,36 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: ld.lld -shared %t.o -soname=t.so -o %t.so
+
+# RUN: llvm-readelf -S -s -x .data %t.so | FileCheck %s
+
+# CHECK: Section Headers:
+# CHECK: .plt PROGBITS 0000000000001290
+# CHECK: .got PROGBITS 0000000000002390
+
+# CHECK: Symbol table '.symtab'
+# CHECK: 0000000000001288 {{.*}} bar
+
+## Note: foo is the first (and only) PLT entry, which resides at .plt + 32
+## PLTOFF (foo) is (.plt + 32) - .got == 0x12b0 - 0x2390 == 0xffffef20
+## GOTOFF (bar) is bar - .got == 0x1288 - 0x2390 == 0xffffeef8
+# CHECK: Hex dump of section '.data':
+# CHECK-NEXT: eef8ef20 ffffeef8 ffffef20 ffffffff
+# CHECK-NEXT: ffffeef8 ffffffff ffffef20
+
+bar:
+ br %r14
+
+.data
+.reloc ., R_390_GOTOFF16, bar
+.space 2
+.reloc ., R_390_PLTOFF16, foo
+.space 2
+.reloc ., R_390_GOTOFF, bar
+.space 4
+.reloc ., R_390_PLTOFF32, foo
+.space 4
+.reloc ., R_390_GOTOFF64, bar
+.space 8
+.reloc ., R_390_PLTOFF64, foo
+.space 8
diff --git a/lld/test/ELF/systemz-reloc-pc16.s b/lld/test/ELF/systemz-reloc-pc16.s
new file mode 100644
index 000000000000..e1dad5af239d
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-pc16.s
@@ -0,0 +1,39 @@
+# REQUIRES: systemz
+# RUN: rm -rf %t && split-file %s %t
+
+## Check recompile with -fPIC error message
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t/shared.s -o %t/shared.o
+# RUN: not ld.lld -shared %t/shared.o -o /dev/null 2>&1 | FileCheck %s
+
+# CHECK: error: relocation R_390_PC16 cannot be used against symbol '_shared'; recompile with -fPIC
+# CHECK: >>> defined in {{.*}}
+# CHECK: >>> referenced by {{.*}}:(.data+0x1)
+
+## Check patching of negative addends
+
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=1 %t/addend.s -o %t/1.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=32768 %t/addend.s -o %t/2.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=32769 %t/addend.s -o %t/3.o
+
+# RUN: ld.lld --section-start=.text=0x0 %t/1.o -o %t/1out
+# RUN: ld.lld --section-start=.text=0x0 %t/2.o -o %t/2out
+# RUN: not ld.lld --section-start=.text=0x0 %t/3.o -o /dev/null 2>&1 | FileCheck %s -DFILE=%t/3.o --check-prefix RANGE
+
+# RANGE: error: [[FILE]]:(.text+0x0): relocation R_390_PC16 out of range
+
+# RUN: llvm-readelf --hex-dump=.text %t/1out | FileCheck %s -DADDEND=ffff --check-prefix DUMP
+# RUN: llvm-readelf --hex-dump=.text %t/2out | FileCheck %s -DADDEND=8000 --check-prefix DUMP
+
+# DUMP: 0x00000000 [[ADDEND]]
+
+#--- shared.s
+.data
+ .byte 0xe8
+ .word _shared - .
+
+#--- addend.s
+.text
+.globl _start
+_start:
+ .reloc ., R_390_PC16, .text-ADDEND
+ .space 2
diff --git a/lld/test/ELF/systemz-reloc-pc32.s b/lld/test/ELF/systemz-reloc-pc32.s
new file mode 100644
index 000000000000..0cb9322eb1c1
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-pc32.s
@@ -0,0 +1,39 @@
+# REQUIRES: systemz
+# RUN: rm -rf %t && split-file %s %t
+
+## Check recompile with -fPIC error message
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %t/shared.s -o %t/shared.o
+# RUN: not ld.lld -shared %t/shared.o -o /dev/null 2>&1 | FileCheck %s
+
+# CHECK: error: relocation R_390_PC32 cannot be used against symbol '_shared'; recompile with -fPIC
+# CHECK: >>> defined in {{.*}}
+# CHECK: >>> referenced by {{.*}}:(.data+0x1)
+
+## Check patching of negative addends
+
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=1 %t/addend.s -o %t/1.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=2147483648 %t/addend.s -o %t/2.o
+# RUN: llvm-mc -filetype=obj -triple=s390x -defsym ADDEND=2147483649 %t/addend.s -o %t/3.o
+
+# RUN: ld.lld --section-start=.text=0x0 %t/1.o -o %t/1out
+# RUN: ld.lld --section-start=.text=0x0 %t/2.o -o %t/2out
+# RUN: not ld.lld --section-start=.text=0x0 %t/3.o -o /dev/null 2>&1 | FileCheck %s -DFILE=%t/3.o --check-prefix RANGE
+
+# RANGE: error: [[FILE]]:(.text+0x0): relocation R_390_PC32 out of range
+
+# RUN: llvm-readelf --hex-dump=.text %t/1out | FileCheck %s -DADDEND=ffffffff --check-prefix DUMP
+# RUN: llvm-readelf --hex-dump=.text %t/2out | FileCheck %s -DADDEND=80000000 --check-prefix DUMP
+
+# DUMP: 0x00000000 [[ADDEND]]
+
+#--- shared.s
+.data
+ .byte 0xe8
+ .long _shared - .
+
+#--- addend.s
+.text
+.globl _start
+_start:
+ .reloc ., R_390_PC32, .text-ADDEND
+ .space 4
diff --git a/lld/test/ELF/systemz-reloc-pcdbl.s b/lld/test/ELF/systemz-reloc-pcdbl.s
new file mode 100644
index 000000000000..faee756f5e95
--- /dev/null
+++ b/lld/test/ELF/systemz-reloc-pcdbl.s
@@ -0,0 +1,68 @@
+# REQUIRES: systemz
+
+# RUN: llvm-mc --filetype=obj --triple=s390x-unknown-linux -mcpu=z13 %s -o %t.o
+
+# RUN: ld.lld %t.o --defsym foo16=pc16dbl+4 --defsym bar16=pc16dbl --defsym foo32=pc32dbl+6 --defsym bar32=pc32dbl --defsym foo12=pc12dbl+6 --defsym bar12=pc12dbl --defsym foo24=pc24dbl+6 --defsym bar24=pc24dbl -o %t
+# RUN: llvm-objdump --no-show-raw-insn --mcpu=z13 -d %t | FileCheck %s --check-prefix=CHECK
+# CHECK: 0000000001001120 <pc16dbl>:
+# CHECK: je 0x1001124
+# CHECK: jne 0x1001120
+# CHECK: 0000000001001128 <pc32dbl>:
+# CHECK: jge 0x100112e
+# CHECK: jgne 0x1001128
+# CHECK: 0000000001001134 <pc12dbl>:
+# CHECK: bprp 5, 0x100113a, 0x1001134
+# CHECK: bprp 6, 0x1001134, 0x100113a
+# CHECK: 0000000001001140 <pc24dbl>:
+# CHECK: bprp 5, 0x1001140, 0x1001146
+# CHECK: bprp 6, 0x1001146, 0x1001140
+
+# RUN: ld.lld %t.o --defsym foo16=pc16dbl+0xfffe --defsym bar16=pc16dbl+4-0x10000 --defsym foo32=pc32dbl+0xfffffffe --defsym bar32=pc32dbl+6-0x100000000 --defsym foo12=pc12dbl+0xffe --defsym bar12=pc12dbl+6-0x1000 --defsym foo24=pc24dbl+0xfffffe --defsym bar24=pc24dbl+6-0x1000000 -o %t.limits
+# RUN: llvm-objdump --no-show-raw-insn --mcpu=z13 -d %t.limits | FileCheck %s --check-prefix=LIMITS
+# LIMITS: je 0x101111e
+# LIMITS-NEXT: jne 0xff1124
+# LIMITS: jge 0x101001126
+# LIMITS-NEXT: jgne 0xffffffff0100112e
+# LIMITS: bprp 5, 0x1002132, 0x1001134
+# LIMITS-NEXT: bprp 6, 0x100013a, 0x100113a
+# LIMITS: bprp 5, 0x1001140, 0x200113e
+# LIMITS-NEXT: bprp 6, 0x1001146, 0x1146
+
+# RUN: not ld.lld %t.o --defsym foo16=pc16dbl+0x10000 --defsym bar16=pc16dbl+4-0x10002 --defsym foo32=pc32dbl+0x100000000 --defsym bar32=pc32dbl+6-0x100000002 --defsym foo12=pc12dbl+0x1000 --defsym bar12=pc12dbl+6-0x1002 --defsym foo24=pc24dbl+0x1000000 --defsym bar24=pc24dbl+6-0x1000002 -o /dev/null 2>&1 | FileCheck -DFILE=%t.o --check-prefix=ERROR-RANGE %s
+# ERROR-RANGE: error: [[FILE]]:(.text+0x2): relocation R_390_PC16DBL out of range: 65536 is not in [-65536, 65535]; references 'foo16'
+# ERROR-RANGE: error: [[FILE]]:(.text+0x6): relocation R_390_PC16DBL out of range: -65538 is not in [-65536, 65535]; references 'bar16'
+# ERROR-RANGE: error: [[FILE]]:(.text+0xa): relocation R_390_PC32DBL out of range: 4294967296 is not in [-4294967296, 4294967295]; references 'foo32'
+# ERROR-RANGE: error: [[FILE]]:(.text+0x10): relocation R_390_PC32DBL out of range: -4294967298 is not in [-4294967296, 4294967295]; references 'bar32'
+# ERROR-RANGE: error: [[FILE]]:(.text+0x15): relocation R_390_PC12DBL out of range: 4096 is not in [-4096, 4095]; references 'foo12'
+# ERROR-RANGE: error: [[FILE]]:(.text+0x1b): relocation R_390_PC12DBL out of range: -4098 is not in [-4096, 4095]; references 'bar12'
+# ERROR-RANGE: error: [[FILE]]:(.text+0x23): relocation R_390_PC24DBL out of range: 16777216 is not in [-16777216, 16777215]; references 'foo24'
+# ERROR-RANGE: error: [[FILE]]:(.text+0x29): relocation R_390_PC24DBL out of range: -16777218 is not in [-16777216, 16777215]; references 'bar24'
+
+# RUN: not ld.lld %t.o --defsym foo16=pc16dbl+1 --defsym bar16=pc16dbl-1 --defsym foo32=pc32dbl+1 --defsym bar32=pc32dbl-1 --defsym foo12=pc12dbl+1 --defsym bar12=pc12dbl-1 --defsym foo24=pc24dbl+1 --defsym bar24=pc24dbl-1 -o /dev/null 2>&1 | FileCheck -DFILE=%t.o --check-prefix=ERROR-ALIGN %s
+# ERROR-ALIGN: error: [[FILE]]:(.text+0x2): improper alignment for relocation R_390_PC16DBL: 0x1 is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x6): improper alignment for relocation R_390_PC16DBL: 0xFFFFFFFFFFFFFFFB is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0xa): improper alignment for relocation R_390_PC32DBL: 0x1 is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x10): improper alignment for relocation R_390_PC32DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x15): improper alignment for relocation R_390_PC12DBL: 0x1 is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x1b): improper alignment for relocation R_390_PC12DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x23): improper alignment for relocation R_390_PC24DBL: 0x1 is not aligned to 2 bytes
+# ERROR-ALIGN-NEXT: error: [[FILE]]:(.text+0x29): improper alignment for relocation R_390_PC24DBL: 0xFFFFFFFFFFFFFFF9 is not aligned to 2 bytes
+
+.global _start
+.global pc16dbl
+.global pc32dbl
+.global pc12dbl
+.global pc24dbl
+_start:
+pc16dbl:
+ je foo16
+ jne bar16
+pc32dbl:
+ jge foo32
+ jgne bar32
+pc12dbl:
+ bprp 5,foo12,0
+ bprp 6,bar12,0
+pc24dbl:
+ bprp 5,0,foo24
+ bprp 6,0,bar24
diff --git a/lld/test/ELF/systemz-tls-gd.s b/lld/test/ELF/systemz-tls-gd.s
new file mode 100644
index 000000000000..3976f55a6ae3
--- /dev/null
+++ b/lld/test/ELF/systemz-tls-gd.s
@@ -0,0 +1,142 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+# RUN: echo '.tbss; .globl b, c; b: .zero 4; c:' | llvm-mc -filetype=obj -triple=s390x-unknown-linux - -o %t1.o
+# RUN: ld.lld -shared -soname=t1.so %t1.o -o %t1.so
+
+# RUN: ld.lld -shared %t.o %t1.o -o %t.so
+# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=GD-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=GD %s
+# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.so | FileCheck --check-prefix=GD-DATA %s
+
+# RUN: ld.lld %t.o %t1.o -o %t.le
+# RUN: llvm-readelf -r %t.le | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.le | FileCheck --check-prefix=LE %s
+# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.le | FileCheck --check-prefix=LE-DATA %s
+
+# RUN: ld.lld %t.o %t1.so -o %t.ie
+# RUN: llvm-readelf -r %t.ie | FileCheck --check-prefix=IE-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.ie | FileCheck --check-prefix=IE %s
+# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.ie | FileCheck --check-prefix=IE-DATA %s
+
+# GD-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 6 entries:
+# GD-REL: 0000000000002570 0000000200000036 R_390_TLS_DTPMOD 0000000000000008 a + 0
+# GD-REL-NEXT: 0000000000002578 0000000200000037 R_390_TLS_DTPOFF 0000000000000008 a + 0
+# GD-REL-NEXT: 0000000000002580 0000000300000036 R_390_TLS_DTPMOD 000000000000000c b + 0
+# GD-REL-NEXT: 0000000000002588 0000000300000037 R_390_TLS_DTPOFF 000000000000000c b + 0
+# GD-REL-NEXT: 0000000000002590 0000000400000036 R_390_TLS_DTPMOD 0000000000000010 c + 0
+# GD-REL-NEXT: 0000000000002598 0000000400000037 R_390_TLS_DTPOFF 0000000000000010 c + 0
+
+## _GLOBAL_OFFSET_TABLE_ is at 0x2558
+# GD: larl %r12, 0x2558
+
+## GOT offset of the TLS module ID / offset pair for a is at 0x2460
+# GD-NEXT: lgrl %r2, 0x2460
+# GD-NEXT: brasl %r14, 0x1440
+# GD-NEXT: lgf %r2, 0(%r2,%r7)
+
+## GOT offset of the TLS module ID / offset pair for b is at 0x2468
+# GD-NEXT: lgrl %r2, 0x2468
+# GD-NEXT: brasl %r14, 0x1440
+# GD-NEXT: lgf %r2, 0(%r2,%r7)
+
+## GOT offset of the TLS module ID / offset pair for c is at 0x2470
+# GD-NEXT: lgrl %r2, 0x2470
+# GD-NEXT: brasl %r14, 0x1440
+# GD-NEXT: lgf %r2, 0(%r2,%r7)
+
+## Constant pool holding GOT offsets of TLS module ID / offset pairs:
+# a: 0x2570 / 0x18
+# b: 0x2580 / 0x28
+# c: 0x2590 / 0x38
+# GD-DATA: 2460 00000000 00000018 00000000 00000028
+# GD-DATA-NEXT: 2470 00000000 00000038
+
+# NOREL: no relocations
+
+## _GLOBAL_OFFSET_TABLE_ is at 0x1002230
+# LE: larl %r12, 0x1002230
+
+## TP offset of a is at 0x1002218
+# LE-NEXT: lgrl %r2, 0x1002218
+# LE-NEXT: brcl 0,
+# LE-NEXT: lgf %r2, 0(%r2,%r7)
+
+## TP offset of b is at 0x1002220
+# LE-NEXT: lgrl %r2, 0x1002220
+# LE-NEXT: brcl 0,
+# LE-NEXT: lgf %r2, 0(%r2,%r7)
+
+## TP offset of c is at 0x1002228
+# LE-NEXT: lgrl %r2, 0x1002228
+# LE-NEXT: brcl 0,
+# LE-NEXT: lgf %r2, 0(%r2,%r7)
+
+## TP offsets
+# a: -8
+# b: -4
+# c: 0
+# LE-DATA: 1002218 ffffffff fffffff8 ffffffff fffffffc
+# LE-DATA-NEXT: 1002228 00000000 00000000
+
+
+# IE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 2 entries:
+# IE-REL: 0000000001002430 0000000200000038 R_390_TLS_TPOFF 0000000000000000 b + 0
+# IE-REL-NEXT: 0000000001002438 0000000300000038 R_390_TLS_TPOFF 0000000000000000 c + 0
+
+## _GLOBAL_OFFSET_TABLE_ is at 0x1002418
+# IE: larl %r12, 0x1002418
+
+## TP offset of a is at 0x1002340
+# IE-NEXT: lgrl %r2, 0x1002340
+# IE-NEXT: brcl 0,
+# IE-NEXT: lgf %r2, 0(%r2,%r7)
+
+## GOT offset of the TP offset for b is at 0x1002348
+# IE-NEXT: lgrl %r2, 0x1002348
+# IE-NEXT: lg %r2, 0(%r2,%r12)
+# IE-NEXT: lgf %r2, 0(%r2,%r7)
+
+## GOT offset of the TP offset for c is at 0x1002350
+# IE-NEXT: lgrl %r2, 0x1002350
+# IE-NEXT: lg %r2, 0(%r2,%r12)
+# IE-NEXT: lgf %r2, 0(%r2,%r7)
+
+## TP offsets (a) / GOT offset of TP offsets (b, c)
+# a: -4
+# b: 0x1002430 / 0x18
+# c: 0x1002438 / 0x20
+# IE-DATA: 1002340 ffffffff fffffffc 00000000 00000018
+# IE-DATA-NEXT: 1002350 00000000 00000020
+
+
+ear %r7,%a0
+sllg %r7,%r1,32
+ear %r7,%a1
+larl %r12,_GLOBAL_OFFSET_TABLE_
+
+lgrl %r2,.LC0
+brasl %r14,__tls_get_offset@PLT:tls_gdcall:a
+lgf %r2,0(%r2,%r7)
+
+lgrl %r2,.LC1
+brasl %r14,__tls_get_offset@PLT:tls_gdcall:b
+lgf %r2,0(%r2,%r7)
+
+lgrl %r2,.LC2
+brasl %r14,__tls_get_offset@PLT:tls_gdcall:c
+lgf %r2,0(%r2,%r7)
+
+ .section .data.rel.ro,"aw"
+ .align 8
+.LC0:
+ .quad a@TLSGD
+.LC1:
+ .quad b@TLSGD
+.LC2:
+ .quad c@TLSGD
+
+ .section .tbss
+ .globl a
+ .zero 8
+a:
+ .zero 4
diff --git a/lld/test/ELF/systemz-tls-ie.s b/lld/test/ELF/systemz-tls-ie.s
new file mode 100644
index 000000000000..27b642ed2dfc
--- /dev/null
+++ b/lld/test/ELF/systemz-tls-ie.s
@@ -0,0 +1,87 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=IE-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=IE %s
+# RUN: llvm-objdump --section .data --full-contents %t.so | FileCheck --check-prefix=IE-DATA %s
+
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s
+# RUN: llvm-objdump --section .data --full-contents %t | FileCheck --check-prefix=LE-DATA %s
+# RUN: llvm-objdump --section .got --full-contents %t | FileCheck --check-prefix=LE-GOT %s
+
+# IE-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 4 entries:
+# IE-REL: 0000000000003478 000000000000000c R_390_RELATIVE 2460
+# IE-REL: 0000000000002460 0000000100000038 R_390_TLS_TPOFF 0000000000000008 a + 0
+# IE-REL: 0000000000002468 0000000200000038 R_390_TLS_TPOFF 000000000000000c b + 0
+# IE-REL: 0000000000002470 0000000300000038 R_390_TLS_TPOFF 0000000000000010 c + 0
+
+## TP offset for a is at 0x2460
+# IE: lgrl %r1, 0x2460
+# IE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offset for b is at 0x2468
+# IE-NEXT: lgrl %r1, 0x2468
+# IE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offset for c is at 0x2470
+# IE-NEXT: lgrl %r1, 0x2470
+# IE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## Data element: TP offset for a is at 0x2460 (relocated via R_390_RELATIVE above)
+# IE-DATA: 3478 00000000 00000000
+
+# NOREL: no relocations
+
+## TP offset for a is at 0x1002250
+# LE: lgrl %r1, 0x1002250
+# LE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offset for b is at 0x1002258
+# LE-NEXT: lgrl %r1, 0x1002258
+# LE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offset for c is at 0x1002260
+# LE-NEXT: lgrl %r1, 0x1002260
+# LE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## Data element: TP offset for a is at 0x1002250
+# LE-DATA: 00000000 01002250
+
+## TP offsets in GOT:
+# a: -8
+# b: -4
+# c: 0
+# LE-GOT: 1002238 00000000 00000000 00000000 00000000
+# LE-GOT: 1002248 00000000 00000000 ffffffff fffffff8
+# LE-GOT: 1002258 ffffffff fffffffc 00000000 00000000
+
+ear %r7,%a0
+sllg %r7,%r1,32
+ear %r7,%a1
+
+lgrl %r1, a@indntpoff
+lgf %r1,0(%r1,%r7)
+
+lgrl %r1, b@indntpoff
+lgf %r1,0(%r1,%r7)
+
+lgrl %r1, c@indntpoff
+lgf %r1,0(%r1,%r7)
+
+ .data
+ .reloc .,R_390_TLS_IE64,a
+ .space 8
+
+ .section .tbss
+ .globl a
+ .globl b
+ .globl c
+ .zero 8
+a:
+ .zero 4
+b:
+ .zero 4
+c:
diff --git a/lld/test/ELF/systemz-tls-ld.s b/lld/test/ELF/systemz-tls-ld.s
new file mode 100644
index 000000000000..2cb36d7294f2
--- /dev/null
+++ b/lld/test/ELF/systemz-tls-ld.s
@@ -0,0 +1,114 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readelf -r %t.so | FileCheck --check-prefix=LD-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck --check-prefix=LD %s
+# RUN: llvm-objdump --section .data.rel.ro --full-contents %t.so | FileCheck --check-prefix=LD-DATA %s
+
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s
+# RUN: llvm-objdump --section .data.rel.ro --full-contents %t | FileCheck --check-prefix=LE-DATA %s
+
+# LD-REL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries:
+# LD-REL: 00000000000024f8 0000000000000036 R_390_TLS_DTPMOD 0
+
+## _GLOBAL_OFFSET_TABLE is at 0x24e0
+# LD: larl %r12, 0x24e0
+
+## GOT offset of the LDM TLS module ID is at 0x23e0
+# LD-NEXT: lgrl %r2, 0x23e0
+# LD-NEXT: brasl %r14, 0x13c0
+# LD-NEXT: la %r2, 0(%r2,%r7)
+
+## DTP offset for a is at 0x23e8
+# LD-NEXT: lgrl %r1, 0x23e8
+# LD-NEXT: lgf %r1, 0(%r1,%r2)
+
+## DTP offset for b is at 0x23f0
+# LD-NEXT: lgrl %r1, 0x23f0
+# LD-NEXT: lgf %r1, 0(%r1,%r2)
+
+## DTP offset for c is at 0x23f8
+# LD-NEXT: lgrl %r1, 0x23f8
+# LD-NEXT: lgf %r1, 0(%r1,%r2)
+
+## Constant pool holding GOT offsets of TLS module ID and DTP offsets:
+# TLS module ID: 0x24f8 / 0x18
+# a: 8
+# b: 12
+# c: 16
+# LD-DATA: 23e0 00000000 00000018 00000000 00000008
+# LD-DATA: 23f0 00000000 0000000c 00000000 00000010
+
+# NOREL: no relocations
+
+## _GLOBAL_OFFSET_TABLE is at 0x1002230
+# LE: larl %r12, 0x1002230
+
+## GOT offset of the LDM TLS module ID is at 0x1002210
+# LE-NEXT: lgrl %r2, 0x1002210
+# LE-NEXT: brcl 0,
+# LE-NEXT: la %r2, 0(%r2,%r7)
+
+## TP offset for a is at 0x1002218
+# LE-NEXT: lgrl %r1, 0x1002218
+# LE-NEXT: lgf %r1, 0(%r1,%r2)
+
+## TP offset for b is at 0x1002220
+# LE-NEXT: lgrl %r1, 0x1002220
+# LE-NEXT: lgf %r1, 0(%r1,%r2)
+
+## TP offset for c is at 0x1002228
+# LE-NEXT: lgrl %r1, 0x1002228
+# LE-NEXT: lgf %r1, 0(%r1,%r2)
+
+## zeroed LDM / TP offsets:
+# LDM TLS: 0
+# a: -8
+# b: -4
+# c: 0
+# LE-DATA: 1002210 00000000 00000000 ffffffff fffffff8
+# LE-DATA: 1002220 ffffffff fffffffc 00000000 00000000
+
+
+ear %r7,%a0
+sllg %r7,%r1,32
+ear %r7,%a1
+larl %r12,_GLOBAL_OFFSET_TABLE_
+
+lgrl %r2,.LC0
+brasl %r14,__tls_get_offset@PLT:tls_ldcall:a
+la %r2,0(%r2,%r7)
+
+lgrl %r1, .LC1
+lgf %r1,0(%r1,%r2)
+
+lgrl %r1, .LC2
+lgf %r1,0(%r1,%r2)
+
+lgrl %r1, .LC3
+lgf %r1,0(%r1,%r2)
+
+ .section .data.rel.ro,"aw"
+ .align 8
+.LC0:
+ .quad a@TLSLDM
+.LC1:
+ .quad a@DTPOFF
+.LC2:
+ .quad b@DTPOFF
+.LC3:
+ .quad c@DTPOFF
+
+ .section .tbss
+ .globl a
+ .globl b
+ .globl c
+ .zero 8
+a:
+ .zero 4
+b:
+ .zero 4
+c:
diff --git a/lld/test/ELF/systemz-tls-le.s b/lld/test/ELF/systemz-tls-le.s
new file mode 100644
index 000000000000..9e41fc768da3
--- /dev/null
+++ b/lld/test/ELF/systemz-tls-le.s
@@ -0,0 +1,61 @@
+# REQUIRES: systemz
+# RUN: llvm-mc -filetype=obj -triple=s390x-unknown-linux %s -o %t.o
+
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-readelf -r %t | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=LE %s
+# RUN: llvm-objdump --section .data.rel.ro --full-contents %t | FileCheck --check-prefix=LE-DATA %s
+
+# NOREL: no relocations
+
+## TP offset for a is at 0x1002200
+# LE: lgrl %r1, 0x1002200
+# LE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offset for b is at 0x1002208
+# LE-NEXT: lgrl %r1, 0x1002208
+# LE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offset for c is at 0x1002210
+# LE-NEXT: lgrl %r1, 0x1002210
+# LE-NEXT: lgf %r1, 0(%r1,%r7)
+
+## TP offsets:
+# a: -8
+# b: -4
+# c: 0
+# LE-DATA: 1002200 ffffffff fffffff8 ffffffff fffffffc
+# LE-DATA: 1002210 00000000 00000000
+
+ear %r7,%a0
+sllg %r7,%r1,32
+ear %r7,%a1
+
+lgrl %r1, .LC0
+lgf %r1,0(%r1,%r7)
+
+lgrl %r1, .LC1
+lgf %r1,0(%r1,%r7)
+
+lgrl %r1, .LC2
+lgf %r1,0(%r1,%r7)
+
+ .section .data.rel.ro,"aw"
+ .align 8
+.LC0:
+ .quad a@ntpoff
+.LC1:
+ .quad b@ntpoff
+.LC2:
+ .quad c@ntpoff
+
+ .section .tbss
+ .globl a
+ .globl b
+ .globl c
+ .zero 8
+a:
+ .zero 4
+b:
+ .zero 4
+c:
diff --git a/lld/test/ELF/unsupported-emachine.test b/lld/test/ELF/unsupported-emachine.test
new file mode 100644
index 000000000000..43d907ea72d5
--- /dev/null
+++ b/lld/test/ELF/unsupported-emachine.test
@@ -0,0 +1,11 @@
+# RUN: yaml2obj %s -o %t.o
+# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s
+
+# CHECK: error: unsupported e_machine value: 9
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_S370
diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s
index 37201279fa0a..0ba605393893 100644
--- a/lld/test/ELF/x86-64-section-layout.s
+++ b/lld/test/ELF/x86-64-section-layout.s
@@ -6,31 +6,31 @@
# RUN: llvm-mc -filetype=obj -triple=x86_64 --defsym=BSS=1 a.s -o a.o
# RUN: ld.lld --section-start=.note=0x200300 a.o -o a
-# RUN: llvm-readelf -S -l a | FileCheck %s
+# RUN: llvm-readelf -S -l -sX a | FileCheck %s
# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a1.o
# RUN: ld.lld --section-start=.note=0x200300 a1.o -o a1
-# RUN: llvm-readelf -S a1 | FileCheck %s --check-prefix=CHECK1
+# RUN: llvm-readelf -S -sX a1 | FileCheck %s --check-prefix=CHECK1
# RUN: ld.lld -T b.lds -z norelro a.o -o b
# RUN: llvm-readelf -S -l b | FileCheck %s --check-prefix=CHECK2
-# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
-# CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0
-# CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1
-# CHECK-NEXT: .lrodata PROGBITS 0000000000200301 000301 000002 00 Al 0 0 1
-# CHECK-NEXT: .rodata PROGBITS 0000000000200303 000303 000001 00 A 0 0 1
-# CHECK-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4
-# CHECK-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1
-# CHECK-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1
-# CHECK-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1
-# CHECK-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1
-# CHECK-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1
+# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0
+# CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1
+# CHECK-NEXT: .lrodata PROGBITS 0000000000200301 000301 000002 00 Al 0 0 1
+# CHECK-NEXT: .rodata PROGBITS 0000000000200303 000303 000001 00 A 0 0 1
+# CHECK-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4
+# CHECK-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1
+# CHECK-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1
+# CHECK-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1
+# CHECK-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1
+# CHECK-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1
## We spend size(.bss) % MAXPAGESIZE bytes for .bss.
-# CHECK-NEXT: .ldata PROGBITS 0000000000205b07 000b07 000002 00 WAl 0 0 1
-# CHECK-NEXT: .ldata2 PROGBITS 0000000000205b09 000b09 000001 00 WAl 0 0 1
-# CHECK-NEXT: .lbss NOBITS 0000000000205b0a 000b0a 000002 00 WAl 0 0 1
-# CHECK-NEXT: .comment PROGBITS 0000000000000000 000b0a {{.*}} 01 MS 0 0 1
+# CHECK-NEXT: .ldata PROGBITS 0000000000205b07 000b07 000002 00 WAl 0 0 1
+# CHECK-NEXT: .ldata2 PROGBITS 0000000000205b09 000b09 000001 00 WAl 0 0 1
+# CHECK-NEXT: .lbss NOBITS 0000000000205b0a 000b0a 001201 00 WAl 0 0 1
+# CHECK-NEXT: .comment PROGBITS 0000000000000000 000b0a {{.*}} 01 MS 0 0 1
# CHECK: Program Headers:
# CHECK-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
@@ -39,13 +39,23 @@
# CHECK-NEXT: LOAD 0x000304 0x0000000000201304 0x0000000000201304 0x000001 0x000001 R E 0x1000
# CHECK-NEXT: LOAD 0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000cfb RW 0x1000
# CHECK-NEXT: LOAD 0x000306 0x0000000000203306 0x0000000000203306 0x000001 0x001801 RW 0x1000
-# CHECK-NEXT: LOAD 0x000b07 0x0000000000205b07 0x0000000000205b07 0x000003 0x000005 RW 0x1000
+# CHECK-NEXT: LOAD 0x000b07 0x0000000000205b07 0x0000000000205b07 0x000003 0x001204 RW 0x1000
+
+# CHECK: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start
+# CHECK-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext
+# CHECK-NEXT: 0000000000203307 0 NOTYPE GLOBAL DEFAULT [[#]] (.data) _edata
+# CHECK-NEXT: 0000000000206d0b 0 NOTYPE GLOBAL DEFAULT [[#]] (.lbss) _end
# CHECK1: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1
# CHECK1-NEXT: .ldata PROGBITS 0000000000203307 000307 000002 00 WAl 0 0 1
# CHECK1-NEXT: .ldata2 PROGBITS 0000000000203309 000309 000001 00 WAl 0 0 1
# CHECK1-NEXT: .comment PROGBITS 0000000000000000 00030a {{.*}} 01 MS 0 0 1
+# CHECK1: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start
+# CHECK1-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext
+# CHECK1-NEXT: 0000000000203307 0 NOTYPE GLOBAL DEFAULT [[#]] (.data) _edata
+# CHECK1-NEXT: 000000000020330a 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _end
+
# CHECK2: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1
# CHECK2-NEXT: .lrodata PROGBITS 0000000000200301 000301 000001 00 Al 0 0 1
## With a SECTIONS command, we suppress the default rule placing .lrodata.* into .lrodata.
@@ -59,7 +69,7 @@
# CHECK2-NEXT: .bss NOBITS 0000000000200307 000307 001800 00 WA 0 0 1
# CHECK2-NEXT: .ldata PROGBITS 0000000000201b07 001b07 000002 00 WAl 0 0 1
# CHECK2-NEXT: .ldata2 PROGBITS 0000000000201b09 001b09 000001 00 WAl 0 0 1
-# CHECK2-NEXT: .lbss NOBITS 0000000000201b0a 001b0a 000002 00 WAl 0 0 1
+# CHECK2-NEXT: .lbss NOBITS 0000000000201b0a 001b0a 001201 00 WAl 0 0 1
# CHECK2-NEXT: .comment PROGBITS 0000000000000000 001b0a {{.*}} 01 MS 0 0 1
# CHECK2: Program Headers:
@@ -67,11 +77,11 @@
# CHECK2-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 {{.*}} {{.*}} R 0x8
# CHECK2-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000304 0x000304 R 0x1000
# CHECK2-NEXT: LOAD 0x000304 0x0000000000200304 0x0000000000200304 0x000001 0x000001 R E 0x1000
-# CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x001807 RW 0x1000
+# CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x002a06 RW 0x1000
# CHECK2-NEXT: TLS 0x000305 0x0000000000200305 0x0000000000200305 0x000001 0x000003 R 0x1
#--- a.s
-.globl _start
+.globl _start, _etext, _edata, _end
_start:
ret
@@ -92,7 +102,7 @@ _start:
## Input .ldata.rel.ro sections are placed in the output .ldata section.
.section .ldata.rel.ro,"awl"; .space 1
.ifdef BSS
-.section .lbss,"awl",@nobits; .space 1
+.section .lbss,"awl",@nobits; .space 0x1200
## Input .lbss.rel.ro sections are placed in the output .lbss section.
.section .lbss.rel.ro,"awl",@nobits; .space 1
.endif
diff --git a/lld/test/MachO/invalid/invalid-lto-object-path.ll b/lld/test/MachO/invalid/invalid-lto-object-path.ll
index 75c6a97e446f..c862538d592c 100644
--- a/lld/test/MachO/invalid/invalid-lto-object-path.ll
+++ b/lld/test/MachO/invalid/invalid-lto-object-path.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86, non-root-user
;; Creating read-only directories with `chmod 400` isn't supported on Windows
; UNSUPPORTED: system-windows
diff --git a/lld/test/MachO/thinlto-emit-imports.ll b/lld/test/MachO/thinlto-emit-imports.ll
index 47a612bd0a7b..88f766f59c88 100644
--- a/lld/test/MachO/thinlto-emit-imports.ll
+++ b/lld/test/MachO/thinlto-emit-imports.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86, non-root-user
; RUN: rm -rf %t; split-file %s %t
; Generate summary sections and test lld handling.
diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test
index 46b3b6d7a862..a4e9e5e1b19b 100644
--- a/lld/test/MinGW/driver.test
+++ b/lld/test/MinGW/driver.test
@@ -415,6 +415,13 @@ LTO_OPTS: -mllvm:-mcpu=x86-64 -opt:lldlto=2 -dwodir:foo -lto-cs-profile-generate
RUN: ld.lld -### foo.o -m i386pep --lto-O2 --lto-CGO1 --lto-cs-profile-generate --lto-cs-profile-file=foo 2>&1 | FileCheck -check-prefix=LTO_OPTS2 %s
LTO_OPTS2:-opt:lldlto=2 -opt:lldltocgo=1 -lto-cs-profile-generate -lto-cs-profile-file:foo
+RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-asm 2>&1 | FileCheck -check-prefix=LTO_EMIT_ASM %s
+RUN: ld.lld -### foo.o -m i386pe --lto-emit-asm 2>&1 | FileCheck -check-prefix=LTO_EMIT_ASM %s
+LTO_EMIT_ASM: -lldemit:asm
+
+RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-llvm 2>&1 | FileCheck -check-prefix=LTO_EMIT_LLVM %s
+LTO_EMIT_LLVM: -lldemit:llvm
+
Test GCC specific LTO options that GCC passes unconditionally, that we ignore.
RUN: ld.lld -### foo.o -m i386pep -plugin /usr/lib/gcc/x86_64-w64-mingw32/10-posix/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-w64-mingw32/10-posix/lto-wrapper -plugin-opt=-fresolution=/tmp/ccM9d4fP.res -plugin-opt=-pass-through=-lmingw32 2> /dev/null
diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py
index b3e07f1f823c..d309c2ad4ee2 100644
--- a/lld/test/lit.cfg.py
+++ b/lld/test/lit.cfg.py
@@ -83,6 +83,7 @@ llvm_config.feature_config(
"PowerPC": "ppc",
"RISCV": "riscv",
"Sparc": "sparc",
+ "SystemZ": "systemz",
"WebAssembly": "wasm",
"X86": "x86",
},
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 0ccfef32a601..55fe73c52f61 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -278,12 +278,12 @@ are commonly used on Windows.
crash, rather than having to reproduce a failure or use a crash dump.
* ``PYTHON_HOME`` (Required): Path to the folder where the Python distribution
is installed. For example, ``C:\Python35``.
-* ``LLDB_RELOCATABLE_PYTHON`` (Default=0): When this is 0, LLDB will bind
+* ``LLDB_EMBED_PYTHON_HOME`` (Default=1 on Windows): When this is 1, LLDB will bind
statically to the location specified in the ``PYTHON_HOME`` CMake variable,
ignoring any value of ``PYTHONHOME`` set in the environment. This is most
useful for developers who simply want to run LLDB after they build it. If you
wish to move a build of LLDB to a different machine where Python will be in a
- different location, setting ``LLDB_RELOCATABLE_PYTHON`` to 1 will cause
+ different location, setting ``LLDB_EMBED_PYTHON_HOME`` to 0 will cause
Python to use its default mechanism for finding the python installation at
runtime (looking for installed Pythons, or using the ``PYTHONHOME``
environment variable if it is specified).
diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h
index 218113a7a391..62b2f91f5076 100644
--- a/lldb/include/lldb/API/SBDebugger.h
+++ b/lldb/include/lldb/API/SBDebugger.h
@@ -46,6 +46,7 @@ public:
eBroadcastBitProgress = (1 << 0),
eBroadcastBitWarning = (1 << 1),
eBroadcastBitError = (1 << 2),
+ eBroadcastBitProgressCategory = (1 << 3),
};
SBDebugger();
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index c6d603ca5dcd..6ba90eb6ed8f 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -84,6 +84,7 @@ public:
eBroadcastBitWarning = (1 << 1),
eBroadcastBitError = (1 << 2),
eBroadcastSymbolChange = (1 << 3),
+ eBroadcastBitProgressCategory = (1 << 4),
};
using DebuggerList = std::vector<lldb::DebuggerSP>;
diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h
index d78f7c5ef3f7..43d931a84474 100644
--- a/lldb/include/lldb/Core/ModuleList.h
+++ b/lldb/include/lldb/Core/ModuleList.h
@@ -47,6 +47,26 @@ class UUID;
class VariableList;
struct ModuleFunctionSearchOptions;
+static constexpr OptionEnumValueElement g_auto_download_enum_values[] = {
+ {
+ lldb::eSymbolDownloadOff,
+ "off",
+ "Disable automatically downloading symbols.",
+ },
+ {
+ lldb::eSymbolDownloadBackground,
+ "background",
+ "Download symbols in the background for images as they appear in the "
+ "backtrace.",
+ },
+ {
+ lldb::eSymbolDownloadForeground,
+ "foreground",
+ "Download symbols in the foreground for images as they appear in the "
+ "backtrace.",
+ },
+};
+
class ModuleListProperties : public Properties {
mutable llvm::sys::RWMutex m_symlink_paths_mutex;
PathMappingList m_symlink_paths;
@@ -60,7 +80,6 @@ public:
bool SetClangModulesCachePath(const FileSpec &path);
bool GetEnableExternalLookup() const;
bool SetEnableExternalLookup(bool new_value);
- bool GetEnableBackgroundLookup() const;
bool GetEnableLLDBIndexCache() const;
bool SetEnableLLDBIndexCache(bool new_value);
uint64_t GetLLDBIndexCacheMaxByteSize();
@@ -71,6 +90,8 @@ public:
bool GetLoadSymbolOnDemand();
+ lldb::SymbolDownload GetSymbolAutoDownload() const;
+
PathMappingList GetSymlinkMappings() const;
};
diff --git a/lldb/include/lldb/DataFormatters/FormatManager.h b/lldb/include/lldb/DataFormatters/FormatManager.h
index 986614f0c5e4..db2fe99c44ca 100644
--- a/lldb/include/lldb/DataFormatters/FormatManager.h
+++ b/lldb/include/lldb/DataFormatters/FormatManager.h
@@ -138,7 +138,7 @@ public:
}
static bool GetFormatFromCString(const char *format_cstr,
- bool partial_match_ok, lldb::Format &format);
+ lldb::Format &format);
static char GetFormatAsFormatChar(lldb::Format format);
diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h
index 41be9b7efda8..23cc054b399a 100644
--- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h
+++ b/lldb/include/lldb/DataFormatters/TypeSynthetic.h
@@ -49,14 +49,15 @@ public:
virtual size_t GetIndexOfChildWithName(ConstString name) = 0;
- // this function is assumed to always succeed and it if fails, the front-end
- // should know to deal with it in the correct way (most probably, by refusing
- // to return any children) the return value of Update() should actually be
- // interpreted as "ValueObjectSyntheticFilter cache is good/bad" if =true,
- // ValueObjectSyntheticFilter is allowed to use the children it fetched
- // previously and cached if =false, ValueObjectSyntheticFilter must throw
- // away its cache, and query again for children
- virtual bool Update() = 0;
+ /// This function is assumed to always succeed and if it fails, the front-end
+ /// should know to deal with it in the correct way (most probably, by refusing
+ /// to return any children). The return value of \ref Update should actually
+ /// be interpreted as "ValueObjectSyntheticFilter cache is good/bad". If this
+ /// function returns \ref lldb::ChildCacheState::eReuse, \ref
+ /// ValueObjectSyntheticFilter is allowed to use the children it fetched
+ /// previously and cached. Otherwise, \ref ValueObjectSyntheticFilter must
+ /// throw away its cache, and query again for children.
+ virtual lldb::ChildCacheState Update() = 0;
// if this function returns false, then CalculateNumChildren() MUST return 0
// since UI frontends might validly decide not to inquire for children given
@@ -116,7 +117,9 @@ public:
return UINT32_MAX;
}
- bool Update() override { return false; }
+ lldb::ChildCacheState Update() override {
+ return lldb::ChildCacheState::eRefetch;
+ }
bool MightHaveChildren() override { return false; }
@@ -328,7 +331,9 @@ public:
filter->GetExpressionPathAtIndex(idx), true);
}
- bool Update() override { return false; }
+ lldb::ChildCacheState Update() override {
+ return lldb::ChildCacheState::eRefetch;
+ }
bool MightHaveChildren() override { return filter->GetCount() > 0; }
@@ -427,7 +432,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
diff --git a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
index 2b3936eaa707..fe46321c3186 100644
--- a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
+++ b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
@@ -21,10 +21,15 @@
namespace lldb_private {
class ValueObjectPrinter {
+ /// The ValueObjectPrinter is a one-shot printer for ValueObjects. It
+ /// does not retain the ValueObject it is printing, that is the job of
+ /// its caller. It also doesn't attempt to track changes in the
+ /// ValueObject, e.g. changing synthetic child providers or changing
+ /// dynamic vrs. static vrs. synthetic settings.
public:
- ValueObjectPrinter(ValueObject *valobj, Stream *s);
+ ValueObjectPrinter(ValueObject &valobj, Stream *s);
- ValueObjectPrinter(ValueObject *valobj, Stream *s,
+ ValueObjectPrinter(ValueObject &valobj, Stream *s,
const DumpValueObjectOptions &options);
~ValueObjectPrinter() = default;
@@ -39,7 +44,7 @@ protected:
// only this class (and subclasses, if any) should ever be concerned with the
// depth mechanism
- ValueObjectPrinter(ValueObject *valobj, Stream *s,
+ ValueObjectPrinter(ValueObject &valobj, Stream *s,
const DumpValueObjectOptions &options,
const DumpValueObjectOptions::PointerDepth &ptr_depth,
uint32_t curr_depth,
@@ -47,13 +52,29 @@ protected:
// we should actually be using delegating constructors here but some versions
// of GCC still have trouble with those
- void Init(ValueObject *valobj, Stream *s,
+ void Init(ValueObject &valobj, Stream *s,
const DumpValueObjectOptions &options,
const DumpValueObjectOptions::PointerDepth &ptr_depth,
uint32_t curr_depth,
InstancePointersSetSP printed_instance_pointers);
- bool GetMostSpecializedValue();
+ /// Cache the ValueObject we are actually going to print. If this
+ /// ValueObject has a Dynamic type, we return that, if either the original
+ /// ValueObject or its Dynamic type has a Synthetic provider, return that.
+ /// This will never return an empty ValueObject, since we use the ValueObject
+ /// to carry errors.
+ /// Note, this gets called when making the printer object, and uses the
+ /// use dynamic and use synthetic settings of the ValueObject being printed,
+ /// so changes made to these settings won't affect already made
+ /// ValueObjectPrinters. SetupMostSpecializedValue();
+
+ /// Access the cached "most specialized value" - that is the one to use for
+ /// printing the value object's value. However, be sure to use
+ /// GetValueForChildGeneration when you are generating the children of this
+ /// value.
+ ValueObject &GetMostSpecializedValue();
+
+ void SetupMostSpecializedValue();
const char *GetDescriptionForDisplay();
@@ -95,13 +116,13 @@ protected:
bool ShouldExpandEmptyAggregates();
- ValueObject *GetValueObjectForChildrenGeneration();
+ ValueObject &GetValueObjectForChildrenGeneration();
void PrintChildrenPreamble(bool value_printed, bool summary_printed);
void PrintChildrenPostamble(bool print_dotdotdot);
- lldb::ValueObjectSP GenerateChild(ValueObject *synth_valobj, size_t idx);
+ lldb::ValueObjectSP GenerateChild(ValueObject &synth_valobj, size_t idx);
void PrintChild(lldb::ValueObjectSP child_sp,
const DumpValueObjectOptions::PointerDepth &curr_ptr_depth);
@@ -121,8 +142,10 @@ protected:
private:
bool ShouldShowName() const;
- ValueObject *m_orig_valobj;
- ValueObject *m_valobj;
+ ValueObject &m_orig_valobj;
+ ValueObject *m_cached_valobj; /// Cache the current "most specialized" value.
+ /// Don't use this directly, use
+ /// GetMostSpecializedValue.
Stream *m_stream;
DumpValueObjectOptions m_options;
Flags m_type_flags;
diff --git a/lldb/include/lldb/DataFormatters/VectorIterator.h b/lldb/include/lldb/DataFormatters/VectorIterator.h
index 3414298f255b..5f774bb72c3a 100644
--- a/lldb/include/lldb/DataFormatters/VectorIterator.h
+++ b/lldb/include/lldb/DataFormatters/VectorIterator.h
@@ -28,7 +28,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 392d333c23a4..464053304783 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -1305,6 +1305,21 @@ enum CompletionType {
eTerminatorCompletion = (1ul << 27)
};
+/// Specifies if children need to be re-computed
+/// after a call to \ref SyntheticChildrenFrontEnd::Update.
+enum class ChildCacheState {
+ eRefetch = 0, ///< Children need to be recomputed dynamically.
+
+ eReuse = 1, ///< Children did not change and don't need to be recomputed;
+ ///< re-use what we computed the last time we called Update.
+};
+
+enum SymbolDownload {
+ eSymbolDownloadOff = 0,
+ eSymbolDownloadBackground = 1,
+ eSymbolDownloadForeground = 2,
+};
+
} // namespace lldb
#endif // LLDB_LLDB_ENUMERATIONS_H
diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp
index 17a7e858b24e..a4d3fb66e8b5 100644
--- a/lldb/source/Commands/CommandObjectFrame.cpp
+++ b/lldb/source/Commands/CommandObjectFrame.cpp
@@ -177,7 +177,10 @@ protected:
DumpValueObjectOptions options;
options.SetDeclPrintingHelper(helper);
- ValueObjectPrinter printer(valobj_sp.get(), &result.GetOutputStream(),
+ // We've already handled the case where the value object sp is null, so
+ // this is just to make sure future changes don't skip that:
+ assert(valobj_sp.get() && "Must have a valid ValueObject to print");
+ ValueObjectPrinter printer(*valobj_sp, &result.GetOutputStream(),
options);
printer.PrintValueObject();
}
diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td
index 8d81967bdb50..9c4aa2de3d7b 100644
--- a/lldb/source/Core/CoreProperties.td
+++ b/lldb/source/Core/CoreProperties.td
@@ -8,7 +8,12 @@ let Definition = "modulelist" in {
def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">,
Global,
DefaultFalse,
- Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">;
+ Desc<"Alias for backward compatibility: when enabled this is the equivalent to 'symbols.download background'.">;
+ def AutoDownload: Property<"auto-download", "Enum">,
+ Global,
+ DefaultEnumValue<"eSymbolDownloadOff">,
+ EnumValues<"OptionEnumValues(g_auto_download_enum_values)">,
+ Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">;
def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">,
Global,
DefaultStringValue<"">,
diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp
index 3c665c2eb213..fa5eadc6ff4e 100644
--- a/lldb/source/Core/FormatEntity.cpp
+++ b/lldb/source/Core/FormatEntity.cpp
@@ -2151,11 +2151,7 @@ static Status ParseInternal(llvm::StringRef &format, Entry &parent_entry,
if (entry.printf_format.find('%') == std::string::npos) {
bool clear_printf = false;
- if (FormatManager::GetFormatFromCString(
- entry.printf_format.c_str(), false, entry.fmt)) {
- // We have an LLDB format, so clear the printf format
- clear_printf = true;
- } else if (entry.printf_format.size() == 1) {
+ if (entry.printf_format.size() == 1) {
switch (entry.printf_format[0]) {
case '@': // if this is an @ sign, print ObjC description
entry.number = ValueObject::
@@ -2198,20 +2194,20 @@ static Status ParseInternal(llvm::StringRef &format, Entry &parent_entry,
eValueObjectRepresentationStyleExpressionPath;
clear_printf = true;
break;
- default:
+ }
+ }
+
+ if (entry.number == 0) {
+ if (FormatManager::GetFormatFromCString(
+ entry.printf_format.c_str(), entry.fmt)) {
+ clear_printf = true;
+ } else if (entry.printf_format == "tid") {
+ verify_is_thread_id = true;
+ } else {
error.SetErrorStringWithFormat("invalid format: '%s'",
entry.printf_format.c_str());
return error;
}
- } else if (FormatManager::GetFormatFromCString(
- entry.printf_format.c_str(), true, entry.fmt)) {
- clear_printf = true;
- } else if (entry.printf_format == "tid") {
- verify_is_thread_id = true;
- } else {
- error.SetErrorStringWithFormat("invalid format: '%s'",
- entry.printf_format.c_str());
- return error;
}
// Our format string turned out to not be a printf style format
diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp
index b7f393636ba2..b03490bacf95 100644
--- a/lldb/source/Core/ModuleList.cpp
+++ b/lldb/source/Core/ModuleList.cpp
@@ -104,10 +104,15 @@ bool ModuleListProperties::SetEnableExternalLookup(bool new_value) {
return SetPropertyAtIndex(ePropertyEnableExternalLookup, new_value);
}
-bool ModuleListProperties::GetEnableBackgroundLookup() const {
- const uint32_t idx = ePropertyEnableBackgroundLookup;
- return GetPropertyAtIndexAs<bool>(
- idx, g_modulelist_properties[idx].default_uint_value != 0);
+SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const {
+ // Backward compatibility alias.
+ if (GetPropertyAtIndexAs<bool>(ePropertyEnableBackgroundLookup, false))
+ return eSymbolDownloadBackground;
+
+ const uint32_t idx = ePropertyAutoDownload;
+ return GetPropertyAtIndexAs<lldb::SymbolDownload>(
+ idx, static_cast<lldb::SymbolDownload>(
+ g_modulelist_properties[idx].default_uint_value));
}
FileSpec ModuleListProperties::GetClangModulesCachePath() const {
diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp
index 9208047be366..e80042826f7d 100644
--- a/lldb/source/Core/ValueObject.cpp
+++ b/lldb/source/Core/ValueObject.cpp
@@ -2525,7 +2525,7 @@ ValueObjectSP ValueObject::GetValueForExpressionPath_Impl(
void ValueObject::Dump(Stream &s) { Dump(s, DumpValueObjectOptions(*this)); }
void ValueObject::Dump(Stream &s, const DumpValueObjectOptions &options) {
- ValueObjectPrinter printer(this, &s, options);
+ ValueObjectPrinter printer(*this, &s, options);
printer.PrintValueObject();
}
diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp
index 43bc532c4a04..e8b4b02d11a0 100644
--- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp
+++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp
@@ -43,7 +43,9 @@ public:
bool MightHaveChildren() override { return m_backend.MightHaveChildren(); }
- bool Update() override { return false; }
+ lldb::ChildCacheState Update() override {
+ return lldb::ChildCacheState::eRefetch;
+ }
};
ValueObjectSynthetic::ValueObjectSynthetic(ValueObject &parent,
@@ -177,7 +179,7 @@ bool ValueObjectSynthetic::UpdateValue() {
}
// let our backend do its update
- if (!m_synth_filter_up->Update()) {
+ if (m_synth_filter_up->Update() == lldb::ChildCacheState::eRefetch) {
LLDB_LOGF(log,
"[ValueObjectSynthetic::UpdateValue] name=%s, synthetic "
"filter said caches are stale - clearing",
diff --git a/lldb/source/DataFormatters/FormatManager.cpp b/lldb/source/DataFormatters/FormatManager.cpp
index f1f135de32ca..092fa3c8ce49 100644
--- a/lldb/source/DataFormatters/FormatManager.cpp
+++ b/lldb/source/DataFormatters/FormatManager.cpp
@@ -91,7 +91,7 @@ static bool GetFormatFromFormatChar(char format_char, Format &format) {
}
static bool GetFormatFromFormatName(llvm::StringRef format_name,
- bool partial_match_ok, Format &format) {
+ Format &format) {
uint32_t i;
for (i = 0; i < g_num_format_infos; ++i) {
if (format_name.equals_insensitive(g_format_infos[i].format_name)) {
@@ -100,13 +100,11 @@ static bool GetFormatFromFormatName(llvm::StringRef format_name,
}
}
- if (partial_match_ok) {
- for (i = 0; i < g_num_format_infos; ++i) {
- if (llvm::StringRef(g_format_infos[i].format_name)
- .starts_with_insensitive(format_name)) {
- format = g_format_infos[i].format;
- return true;
- }
+ for (i = 0; i < g_num_format_infos; ++i) {
+ if (llvm::StringRef(g_format_infos[i].format_name)
+ .starts_with_insensitive(format_name)) {
+ format = g_format_infos[i].format;
+ return true;
}
}
format = eFormatInvalid;
@@ -124,7 +122,6 @@ void FormatManager::Changed() {
}
bool FormatManager::GetFormatFromCString(const char *format_cstr,
- bool partial_match_ok,
lldb::Format &format) {
bool success = false;
if (format_cstr && format_cstr[0]) {
@@ -134,7 +131,7 @@ bool FormatManager::GetFormatFromCString(const char *format_cstr,
return true;
}
- success = GetFormatFromFormatName(format_cstr, partial_match_ok, format);
+ success = GetFormatFromFormatName(format_cstr, format);
}
if (!success)
format = eFormatInvalid;
diff --git a/lldb/source/DataFormatters/TypeSummary.cpp b/lldb/source/DataFormatters/TypeSummary.cpp
index c09ed31d0338..3707d2d879d3 100644
--- a/lldb/source/DataFormatters/TypeSummary.cpp
+++ b/lldb/source/DataFormatters/TypeSummary.cpp
@@ -80,7 +80,10 @@ bool StringSummaryFormat::FormatObject(ValueObject *valobj, std::string &retval,
sc = frame->GetSymbolContext(lldb::eSymbolContextEverything);
if (IsOneLiner()) {
- ValueObjectPrinter printer(valobj, &s, DumpValueObjectOptions());
+ // We've already checked the case of a NULL valobj above. Let's put in an
+ // assert here to make sure someone doesn't take that out:
+ assert(valobj && "Must have a valid ValueObject to summarize");
+ ValueObjectPrinter printer(*valobj, &s, DumpValueObjectOptions());
printer.PrintChildrenOneLiner(HideNames(valobj));
retval = std::string(s.GetString());
return true;
diff --git a/lldb/source/DataFormatters/TypeSynthetic.cpp b/lldb/source/DataFormatters/TypeSynthetic.cpp
index de042e474903..8a6f132a3957 100644
--- a/lldb/source/DataFormatters/TypeSynthetic.cpp
+++ b/lldb/source/DataFormatters/TypeSynthetic.cpp
@@ -190,11 +190,13 @@ size_t ScriptedSyntheticChildren::FrontEnd::CalculateNumChildren(uint32_t max) {
return m_interpreter->CalculateNumChildren(m_wrapper_sp, max);
}
-bool ScriptedSyntheticChildren::FrontEnd::Update() {
+lldb::ChildCacheState ScriptedSyntheticChildren::FrontEnd::Update() {
if (!m_wrapper_sp || m_interpreter == nullptr)
- return false;
+ return lldb::ChildCacheState::eRefetch;
- return m_interpreter->UpdateSynthProviderInstance(m_wrapper_sp);
+ return m_interpreter->UpdateSynthProviderInstance(m_wrapper_sp)
+ ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool ScriptedSyntheticChildren::FrontEnd::MightHaveChildren() {
diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
index 074d0b648e6f..46e50a8d421a 100644
--- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp
+++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
@@ -18,39 +18,35 @@
using namespace lldb;
using namespace lldb_private;
-ValueObjectPrinter::ValueObjectPrinter(ValueObject *valobj, Stream *s) {
- if (valobj) {
- DumpValueObjectOptions options(*valobj);
- Init(valobj, s, options, m_options.m_max_ptr_depth, 0, nullptr);
- } else {
- DumpValueObjectOptions options;
- Init(valobj, s, options, m_options.m_max_ptr_depth, 0, nullptr);
- }
+ValueObjectPrinter::ValueObjectPrinter(ValueObject &valobj, Stream *s)
+ : m_orig_valobj(valobj) {
+ DumpValueObjectOptions options(valobj);
+ Init(valobj, s, options, m_options.m_max_ptr_depth, 0, nullptr);
}
-ValueObjectPrinter::ValueObjectPrinter(ValueObject *valobj, Stream *s,
- const DumpValueObjectOptions &options) {
+ValueObjectPrinter::ValueObjectPrinter(ValueObject &valobj, Stream *s,
+ const DumpValueObjectOptions &options)
+ : m_orig_valobj(valobj) {
Init(valobj, s, options, m_options.m_max_ptr_depth, 0, nullptr);
}
ValueObjectPrinter::ValueObjectPrinter(
- ValueObject *valobj, Stream *s, const DumpValueObjectOptions &options,
+ ValueObject &valobj, Stream *s, const DumpValueObjectOptions &options,
const DumpValueObjectOptions::PointerDepth &ptr_depth, uint32_t curr_depth,
- InstancePointersSetSP printed_instance_pointers) {
+ InstancePointersSetSP printed_instance_pointers)
+ : m_orig_valobj(valobj) {
Init(valobj, s, options, ptr_depth, curr_depth, printed_instance_pointers);
}
void ValueObjectPrinter::Init(
- ValueObject *valobj, Stream *s, const DumpValueObjectOptions &options,
+ ValueObject &valobj, Stream *s, const DumpValueObjectOptions &options,
const DumpValueObjectOptions::PointerDepth &ptr_depth, uint32_t curr_depth,
InstancePointersSetSP printed_instance_pointers) {
- m_orig_valobj = valobj;
- m_valobj = nullptr;
+ m_cached_valobj = nullptr;
m_stream = s;
m_options = options;
m_ptr_depth = ptr_depth;
m_curr_depth = curr_depth;
- assert(m_orig_valobj && "cannot print a NULL ValueObject");
assert(m_stream && "cannot print to a NULL Stream");
m_should_print = eLazyBoolCalculate;
m_is_nil = eLazyBoolCalculate;
@@ -68,23 +64,18 @@ void ValueObjectPrinter::Init(
printed_instance_pointers
? printed_instance_pointers
: InstancePointersSetSP(new InstancePointersSet());
+ SetupMostSpecializedValue();
}
bool ValueObjectPrinter::PrintValueObject() {
- if (!m_orig_valobj)
- return false;
-
// If the incoming ValueObject is in an error state, the best we're going to
// get out of it is its type. But if we don't even have that, just print
// the error and exit early.
- if (m_orig_valobj->GetError().Fail()
- && !m_orig_valobj->GetCompilerType().IsValid()) {
- m_stream->Printf("Error: '%s'", m_orig_valobj->GetError().AsCString());
+ if (m_orig_valobj.GetError().Fail() &&
+ !m_orig_valobj.GetCompilerType().IsValid()) {
+ m_stream->Printf("Error: '%s'", m_orig_valobj.GetError().AsCString());
return true;
}
-
- if (!GetMostSpecializedValue() || m_valobj == nullptr)
- return false;
if (ShouldPrintValueObject()) {
PrintLocationIfNeeded();
@@ -107,66 +98,68 @@ bool ValueObjectPrinter::PrintValueObject() {
return true;
}
-bool ValueObjectPrinter::GetMostSpecializedValue() {
- if (m_valobj)
- return true;
- bool update_success = m_orig_valobj->UpdateValueIfNeeded(true);
- if (!update_success) {
- m_valobj = m_orig_valobj;
- } else {
- if (m_orig_valobj->IsDynamic()) {
+ValueObject &ValueObjectPrinter::GetMostSpecializedValue() {
+ assert(m_cached_valobj && "ValueObjectPrinter must have a valid ValueObject");
+ return *m_cached_valobj;
+}
+
+void ValueObjectPrinter::SetupMostSpecializedValue() {
+ bool update_success = m_orig_valobj.UpdateValueIfNeeded(true);
+ // If we can't find anything better, we'll fall back on the original
+ // ValueObject.
+ m_cached_valobj = &m_orig_valobj;
+ if (update_success) {
+ if (m_orig_valobj.IsDynamic()) {
if (m_options.m_use_dynamic == eNoDynamicValues) {
- ValueObject *static_value = m_orig_valobj->GetStaticValue().get();
+ ValueObject *static_value = m_orig_valobj.GetStaticValue().get();
if (static_value)
- m_valobj = static_value;
- else
- m_valobj = m_orig_valobj;
- } else
- m_valobj = m_orig_valobj;
+ m_cached_valobj = static_value;
+ }
} else {
if (m_options.m_use_dynamic != eNoDynamicValues) {
ValueObject *dynamic_value =
- m_orig_valobj->GetDynamicValue(m_options.m_use_dynamic).get();
+ m_orig_valobj.GetDynamicValue(m_options.m_use_dynamic).get();
if (dynamic_value)
- m_valobj = dynamic_value;
- else
- m_valobj = m_orig_valobj;
- } else
- m_valobj = m_orig_valobj;
+ m_cached_valobj = dynamic_value;
+ }
}
- if (m_valobj->IsSynthetic()) {
+ if (m_cached_valobj->IsSynthetic()) {
if (!m_options.m_use_synthetic) {
- ValueObject *non_synthetic = m_valobj->GetNonSyntheticValue().get();
+ ValueObject *non_synthetic =
+ m_cached_valobj->GetNonSyntheticValue().get();
if (non_synthetic)
- m_valobj = non_synthetic;
+ m_cached_valobj = non_synthetic;
}
} else {
if (m_options.m_use_synthetic) {
- ValueObject *synthetic = m_valobj->GetSyntheticValue().get();
+ ValueObject *synthetic = m_cached_valobj->GetSyntheticValue().get();
if (synthetic)
- m_valobj = synthetic;
+ m_cached_valobj = synthetic;
}
}
}
- m_compiler_type = m_valobj->GetCompilerType();
+ m_compiler_type = m_cached_valobj->GetCompilerType();
m_type_flags = m_compiler_type.GetTypeInfo();
- return true;
+ assert(m_cached_valobj &&
+ "SetupMostSpecialized value must compute a valid ValueObject");
}
const char *ValueObjectPrinter::GetDescriptionForDisplay() {
- const char *str = m_valobj->GetObjectDescription();
+ ValueObject &valobj = GetMostSpecializedValue();
+ const char *str = valobj.GetObjectDescription();
if (!str)
- str = m_valobj->GetSummaryAsCString();
+ str = valobj.GetSummaryAsCString();
if (!str)
- str = m_valobj->GetValueAsCString();
+ str = valobj.GetValueAsCString();
return str;
}
const char *ValueObjectPrinter::GetRootNameForDisplay() {
- const char *root_valobj_name = m_options.m_root_valobj_name.empty()
- ? m_valobj->GetName().AsCString()
- : m_options.m_root_valobj_name.c_str();
+ const char *root_valobj_name =
+ m_options.m_root_valobj_name.empty()
+ ? GetMostSpecializedValue().GetName().AsCString()
+ : m_options.m_root_valobj_name.c_str();
return root_valobj_name ? root_valobj_name : "";
}
@@ -181,14 +174,16 @@ bool ValueObjectPrinter::ShouldPrintValueObject() {
bool ValueObjectPrinter::IsNil() {
if (m_is_nil == eLazyBoolCalculate)
- m_is_nil = m_valobj->IsNilReference() ? eLazyBoolYes : eLazyBoolNo;
+ m_is_nil =
+ GetMostSpecializedValue().IsNilReference() ? eLazyBoolYes : eLazyBoolNo;
return m_is_nil == eLazyBoolYes;
}
bool ValueObjectPrinter::IsUninitialized() {
if (m_is_uninit == eLazyBoolCalculate)
- m_is_uninit =
- m_valobj->IsUninitializedReference() ? eLazyBoolYes : eLazyBoolNo;
+ m_is_uninit = GetMostSpecializedValue().IsUninitializedReference()
+ ? eLazyBoolYes
+ : eLazyBoolNo;
return m_is_uninit == eLazyBoolYes;
}
@@ -213,19 +208,20 @@ bool ValueObjectPrinter::IsAggregate() {
bool ValueObjectPrinter::IsInstancePointer() {
// you need to do this check on the value's clang type
+ ValueObject &valobj = GetMostSpecializedValue();
if (m_is_instance_ptr == eLazyBoolCalculate)
- m_is_instance_ptr = (m_valobj->GetValue().GetCompilerType().GetTypeInfo() &
+ m_is_instance_ptr = (valobj.GetValue().GetCompilerType().GetTypeInfo() &
eTypeInstanceIsPointer) != 0
? eLazyBoolYes
: eLazyBoolNo;
- if ((eLazyBoolYes == m_is_instance_ptr) && m_valobj->IsBaseClass())
+ if ((eLazyBoolYes == m_is_instance_ptr) && valobj.IsBaseClass())
m_is_instance_ptr = eLazyBoolNo;
return m_is_instance_ptr == eLazyBoolYes;
}
bool ValueObjectPrinter::PrintLocationIfNeeded() {
if (m_options.m_show_location) {
- m_stream->Printf("%s: ", m_valobj->GetLocationAsCString());
+ m_stream->Printf("%s: ", GetMostSpecializedValue().GetLocationAsCString());
return true;
}
return false;
@@ -244,6 +240,8 @@ void ValueObjectPrinter::PrintDecl() {
(m_curr_depth == 0 && !m_options.m_flat_output);
StreamString typeName;
+ // Figure out which ValueObject we're acting on
+ ValueObject &valobj = GetMostSpecializedValue();
// always show the type at the root level if it is invalid
if (show_type) {
@@ -252,8 +250,8 @@ void ValueObjectPrinter::PrintDecl() {
ConstString type_name;
if (m_compiler_type.IsValid()) {
type_name = m_options.m_use_type_display_name
- ? m_valobj->GetDisplayTypeName()
- : m_valobj->GetQualifiedTypeName();
+ ? valobj.GetDisplayTypeName()
+ : valobj.GetQualifiedTypeName();
} else {
// only show an invalid type name if the user explicitly triggered
// show_type
@@ -277,7 +275,7 @@ void ValueObjectPrinter::PrintDecl() {
if (ShouldShowName()) {
if (m_options.m_flat_output)
- m_valobj->GetExpressionPath(varName);
+ valobj.GetExpressionPath(varName);
else
varName << GetRootNameForDisplay();
}
@@ -289,7 +287,7 @@ void ValueObjectPrinter::PrintDecl() {
// one for the ValueObject
lldb::LanguageType lang_type =
(m_options.m_varformat_language == lldb::eLanguageTypeUnknown)
- ? m_valobj->GetPreferredDisplayLanguage()
+ ? valobj.GetPreferredDisplayLanguage()
: m_options.m_varformat_language;
if (Language *lang_plugin = Language::FindPlugin(lang_type)) {
m_options.m_decl_printing_helper = lang_plugin->GetDeclPrintingHelper();
@@ -327,14 +325,15 @@ void ValueObjectPrinter::PrintDecl() {
bool ValueObjectPrinter::CheckScopeIfNeeded() {
if (m_options.m_scope_already_checked)
return true;
- return m_valobj->IsInScope();
+ return GetMostSpecializedValue().IsInScope();
}
TypeSummaryImpl *ValueObjectPrinter::GetSummaryFormatter(bool null_if_omitted) {
if (!m_summary_formatter.second) {
- TypeSummaryImpl *entry = m_options.m_summary_sp
- ? m_options.m_summary_sp.get()
- : m_valobj->GetSummaryFormat().get();
+ TypeSummaryImpl *entry =
+ m_options.m_summary_sp
+ ? m_options.m_summary_sp.get()
+ : GetMostSpecializedValue().GetSummaryFormat().get();
if (m_options.m_omit_summary_depth > 0)
entry = nullptr;
@@ -357,18 +356,19 @@ void ValueObjectPrinter::GetValueSummaryError(std::string &value,
std::string &summary,
std::string &error) {
lldb::Format format = m_options.m_format;
+ ValueObject &valobj = GetMostSpecializedValue();
// if I am printing synthetized elements, apply the format to those elements
// only
if (m_options.m_pointer_as_array)
- m_valobj->GetValueAsCString(lldb::eFormatDefault, value);
- else if (format != eFormatDefault && format != m_valobj->GetFormat())
- m_valobj->GetValueAsCString(format, value);
+ valobj.GetValueAsCString(lldb::eFormatDefault, value);
+ else if (format != eFormatDefault && format != valobj.GetFormat())
+ valobj.GetValueAsCString(format, value);
else {
- const char *val_cstr = m_valobj->GetValueAsCString();
+ const char *val_cstr = valobj.GetValueAsCString();
if (val_cstr)
value.assign(val_cstr);
}
- const char *err_cstr = m_valobj->GetError().AsCString();
+ const char *err_cstr = valobj.GetError().AsCString();
if (err_cstr)
error.assign(err_cstr);
@@ -378,7 +378,7 @@ void ValueObjectPrinter::GetValueSummaryError(std::string &value,
if (IsNil()) {
lldb::LanguageType lang_type =
(m_options.m_varformat_language == lldb::eLanguageTypeUnknown)
- ? m_valobj->GetPreferredDisplayLanguage()
+ ? valobj.GetPreferredDisplayLanguage()
: m_options.m_varformat_language;
if (Language *lang_plugin = Language::FindPlugin(lang_type)) {
summary.assign(lang_plugin->GetNilReferenceSummaryString().str());
@@ -392,11 +392,11 @@ void ValueObjectPrinter::GetValueSummaryError(std::string &value,
} else if (m_options.m_omit_summary_depth == 0) {
TypeSummaryImpl *entry = GetSummaryFormatter();
if (entry) {
- m_valobj->GetSummaryAsCString(entry, summary,
- m_options.m_varformat_language);
+ valobj.GetSummaryAsCString(entry, summary,
+ m_options.m_varformat_language);
} else {
const char *sum_cstr =
- m_valobj->GetSummaryAsCString(m_options.m_varformat_language);
+ valobj.GetSummaryAsCString(m_options.m_varformat_language);
if (sum_cstr)
summary.assign(sum_cstr);
}
@@ -431,16 +431,17 @@ bool ValueObjectPrinter::PrintValueAndSummaryIfNeeded(bool &value_printed,
// this thing is nil (but show the value if the user passes a format
// explicitly)
TypeSummaryImpl *entry = GetSummaryFormatter();
+ ValueObject &valobj = GetMostSpecializedValue();
const bool has_nil_or_uninitialized_summary =
(IsNil() || IsUninitialized()) && !m_summary.empty();
if (!has_nil_or_uninitialized_summary && !m_value.empty() &&
(entry == nullptr ||
- (entry->DoesPrintValue(m_valobj) ||
+ (entry->DoesPrintValue(&valobj) ||
m_options.m_format != eFormatDefault) ||
m_summary.empty()) &&
!m_options.m_hide_value) {
if (m_options.m_hide_pointer_value &&
- IsPointerValue(m_valobj->GetCompilerType())) {
+ IsPointerValue(valobj.GetCompilerType())) {
} else {
if (ShouldShowName())
m_stream->PutChar(' ');
@@ -470,7 +471,7 @@ bool ValueObjectPrinter::PrintObjectDescriptionIfNeeded(bool value_printed,
m_stream->Printf(" ");
const char *object_desc = nullptr;
if (value_printed || summary_printed)
- object_desc = m_valobj->GetObjectDescription();
+ object_desc = GetMostSpecializedValue().GetObjectDescription();
else
object_desc = GetDescriptionForDisplay();
if (object_desc && *object_desc) {
@@ -523,8 +524,9 @@ bool ValueObjectPrinter::ShouldPrintChildren(
return false;
bool print_children = true;
+ ValueObject &valobj = GetMostSpecializedValue();
if (TypeSummaryImpl *type_summary = GetSummaryFormatter())
- print_children = type_summary->DoesPrintChildren(m_valobj);
+ print_children = type_summary->DoesPrintChildren(&valobj);
// We will show children for all concrete types. We won't show pointer
// contents unless a pointer depth has been specified. We won't reference
@@ -537,7 +539,7 @@ bool ValueObjectPrinter::ShouldPrintChildren(
// We have a pointer or reference whose value is an address. Make sure
// that address is not NULL
AddressType ptr_address_type;
- if (m_valobj->GetPointerValue(&ptr_address_type) == 0)
+ if (valobj.GetPointerValue(&ptr_address_type) == 0)
return false;
const bool is_root_level = m_curr_depth == 0;
@@ -565,8 +567,8 @@ bool ValueObjectPrinter::ShouldExpandEmptyAggregates() {
return entry->DoesPrintEmptyAggregates();
}
-ValueObject *ValueObjectPrinter::GetValueObjectForChildrenGeneration() {
- return m_valobj;
+ValueObject &ValueObjectPrinter::GetValueObjectForChildrenGeneration() {
+ return GetMostSpecializedValue();
}
void ValueObjectPrinter::PrintChildrenPreamble(bool value_printed,
@@ -612,7 +614,7 @@ void ValueObjectPrinter::PrintChild(
if (does_consume_ptr_depth)
ptr_depth = curr_ptr_depth.Decremented();
- ValueObjectPrinter child_printer(child_sp.get(), m_stream, child_options,
+ ValueObjectPrinter child_printer(*(child_sp.get()), m_stream, child_options,
ptr_depth, m_curr_depth + 1,
m_printed_instance_pointers);
child_printer.PrintValueObject();
@@ -620,16 +622,17 @@ void ValueObjectPrinter::PrintChild(
}
uint32_t ValueObjectPrinter::GetMaxNumChildrenToPrint(bool &print_dotdotdot) {
- ValueObject *synth_m_valobj = GetValueObjectForChildrenGeneration();
+ ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
if (m_options.m_pointer_as_array)
return m_options.m_pointer_as_array.m_element_count;
- size_t num_children = synth_m_valobj->GetNumChildren();
+ size_t num_children = synth_valobj.GetNumChildren();
print_dotdotdot = false;
if (num_children) {
- const size_t max_num_children =
- m_valobj->GetTargetSP()->GetMaximumNumberOfChildrenToDisplay();
+ const size_t max_num_children = GetMostSpecializedValue()
+ .GetTargetSP()
+ ->GetMaximumNumberOfChildrenToDisplay();
if (num_children > max_num_children && !m_options.m_ignore_cap) {
print_dotdotdot = true;
@@ -642,7 +645,8 @@ uint32_t ValueObjectPrinter::GetMaxNumChildrenToPrint(bool &print_dotdotdot) {
void ValueObjectPrinter::PrintChildrenPostamble(bool print_dotdotdot) {
if (!m_options.m_flat_output) {
if (print_dotdotdot) {
- m_valobj->GetTargetSP()
+ GetMostSpecializedValue()
+ .GetTargetSP()
->GetDebugger()
.GetCommandInterpreter()
.ChildrenTruncated();
@@ -655,7 +659,7 @@ void ValueObjectPrinter::PrintChildrenPostamble(bool print_dotdotdot) {
bool ValueObjectPrinter::ShouldPrintEmptyBrackets(bool value_printed,
bool summary_printed) {
- ValueObject *synth_m_valobj = GetValueObjectForChildrenGeneration();
+ ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
if (!IsAggregate())
return false;
@@ -665,7 +669,7 @@ bool ValueObjectPrinter::ShouldPrintEmptyBrackets(bool value_printed,
return false;
}
- if (synth_m_valobj->MightHaveChildren())
+ if (synth_valobj.MightHaveChildren())
return true;
if (m_val_summary_ok)
@@ -679,25 +683,25 @@ static constexpr size_t PhysicalIndexForLogicalIndex(size_t base, size_t stride,
return base + logical * stride;
}
-ValueObjectSP ValueObjectPrinter::GenerateChild(ValueObject *synth_valobj,
+ValueObjectSP ValueObjectPrinter::GenerateChild(ValueObject &synth_valobj,
size_t idx) {
if (m_options.m_pointer_as_array) {
// if generating pointer-as-array children, use GetSyntheticArrayMember
- return synth_valobj->GetSyntheticArrayMember(
+ return synth_valobj.GetSyntheticArrayMember(
PhysicalIndexForLogicalIndex(
m_options.m_pointer_as_array.m_base_element,
m_options.m_pointer_as_array.m_stride, idx),
true);
} else {
// otherwise, do the usual thing
- return synth_valobj->GetChildAtIndex(idx);
+ return synth_valobj.GetChildAtIndex(idx);
}
}
void ValueObjectPrinter::PrintChildren(
bool value_printed, bool summary_printed,
const DumpValueObjectOptions::PointerDepth &curr_ptr_depth) {
- ValueObject *synth_m_valobj = GetValueObjectForChildrenGeneration();
+ ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
bool print_dotdotdot = false;
size_t num_children = GetMaxNumChildrenToPrint(print_dotdotdot);
@@ -705,7 +709,7 @@ void ValueObjectPrinter::PrintChildren(
bool any_children_printed = false;
for (size_t idx = 0; idx < num_children; ++idx) {
- if (ValueObjectSP child_sp = GenerateChild(synth_m_valobj, idx)) {
+ if (ValueObjectSP child_sp = GenerateChild(synth_valobj, idx)) {
if (m_options.m_child_printing_decider &&
!m_options.m_child_printing_decider(child_sp->GetName()))
continue;
@@ -733,7 +737,7 @@ void ValueObjectPrinter::PrintChildren(
if (ShouldPrintValueObject()) {
// if it has a synthetic value, then don't print {}, the synthetic
// children are probably only being used to vend a value
- if (m_valobj->DoesProvideSyntheticValue() ||
+ if (GetMostSpecializedValue().DoesProvideSyntheticValue() ||
!ShouldExpandEmptyAggregates())
m_stream->PutCString("\n");
else
@@ -746,10 +750,7 @@ void ValueObjectPrinter::PrintChildren(
}
bool ValueObjectPrinter::PrintChildrenOneLiner(bool hide_names) {
- if (!GetMostSpecializedValue() || m_valobj == nullptr)
- return false;
-
- ValueObject *synth_m_valobj = GetValueObjectForChildrenGeneration();
+ ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
bool print_dotdotdot = false;
size_t num_children = GetMaxNumChildrenToPrint(print_dotdotdot);
@@ -759,7 +760,7 @@ bool ValueObjectPrinter::PrintChildrenOneLiner(bool hide_names) {
bool did_print_children = false;
for (uint32_t idx = 0; idx < num_children; ++idx) {
- lldb::ValueObjectSP child_sp(synth_m_valobj->GetChildAtIndex(idx));
+ lldb::ValueObjectSP child_sp(synth_valobj.GetChildAtIndex(idx));
if (child_sp)
child_sp = child_sp->GetQualifiedRepresentationIfAvailable(
m_options.m_use_dynamic, m_options.m_use_synthetic);
@@ -795,6 +796,7 @@ bool ValueObjectPrinter::PrintChildrenOneLiner(bool hide_names) {
void ValueObjectPrinter::PrintChildrenIfNeeded(bool value_printed,
bool summary_printed) {
PrintObjectDescriptionIfNeeded(value_printed, summary_printed);
+ ValueObject &valobj = GetMostSpecializedValue();
DumpValueObjectOptions::PointerDepth curr_ptr_depth = m_ptr_depth;
const bool print_children = ShouldPrintChildren(curr_ptr_depth);
@@ -803,9 +805,9 @@ void ValueObjectPrinter::PrintChildrenIfNeeded(bool value_printed,
!m_options.m_allow_oneliner_mode || m_options.m_flat_output ||
(m_options.m_pointer_as_array) || m_options.m_show_location)
? false
- : DataVisualization::ShouldPrintAsOneLiner(*m_valobj);
+ : DataVisualization::ShouldPrintAsOneLiner(valobj);
if (print_children && IsInstancePointer()) {
- uint64_t instance_ptr_value = m_valobj->GetValueAsUnsigned(0);
+ uint64_t instance_ptr_value = valobj.GetValueAsUnsigned(0);
if (m_printed_instance_pointers->count(instance_ptr_value)) {
// We already printed this instance-is-pointer thing, so don't expand it.
m_stream->PutCString(" {...}\n");
@@ -831,7 +833,7 @@ void ValueObjectPrinter::PrintChildrenIfNeeded(bool value_printed,
// the user. The warning tells the user that the limit has been reached, but
// more importantly tells them how to expand the limit if desired.
if (m_options.m_max_depth_is_default)
- m_valobj->GetTargetSP()
+ valobj.GetTargetSP()
->GetDebugger()
.GetCommandInterpreter()
.SetReachedMaximumDepth();
diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp
index 57dae0b2c71f..c94ca68319ee 100644
--- a/lldb/source/DataFormatters/VectorType.cpp
+++ b/lldb/source/DataFormatters/VectorType.cpp
@@ -245,7 +245,7 @@ public:
return child_sp;
}
- bool Update() override {
+ lldb::ChildCacheState Update() override {
m_parent_format = m_backend.GetFormat();
CompilerType parent_type(m_backend.GetCompilerType());
CompilerType element_type;
@@ -258,7 +258,7 @@ public:
::CalculateNumChildren(element_type, num_elements, m_child_type)
.value_or(0);
m_item_format = GetItemFormatForFormat(m_parent_format, m_child_type);
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool MightHaveChildren() override { return true; }
diff --git a/lldb/source/Interpreter/OptionArgParser.cpp b/lldb/source/Interpreter/OptionArgParser.cpp
index d13805a75ffb..75ccad87467e 100644
--- a/lldb/source/Interpreter/OptionArgParser.cpp
+++ b/lldb/source/Interpreter/OptionArgParser.cpp
@@ -93,8 +93,7 @@ Status OptionArgParser::ToFormat(const char *s, lldb::Format &format,
*byte_size_ptr = 0;
}
- const bool partial_match_ok = true;
- if (!FormatManager::GetFormatFromCString(s, partial_match_ok, format)) {
+ if (!FormatManager::GetFormatFromCString(s, format)) {
StreamString error_strm;
error_strm.Printf(
"Invalid format character or name '%s'. Valid values are:\n", s);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
index 314a4aca8d26..2e43aa3fa1d8 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
@@ -136,7 +136,9 @@ public:
// return true if this object is now safe to use forever without ever
// updating again; the typical (and tested) answer here is 'false'
- bool Update() override { return false; }
+ lldb::ChildCacheState Update() override {
+ return lldb::ChildCacheState::eRefetch;
+ }
// maybe return false if the block pointer is, say, null
bool MightHaveChildren() override { return true; }
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
index 21108b27896a..97fa894ea737 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
@@ -17,6 +17,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN
LibCxxTuple.cpp
LibCxxUnorderedMap.cpp
LibCxxVariant.cpp
+ LibCxxValarray.cpp
LibCxxVector.cpp
LibStdcpp.cpp
LibStdcppTuple.cpp
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index 1dcda53350af..675ca3851861 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -752,6 +752,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
"^std::__[[:alnum:]]+::vector<.+>$", stl_deref_flags, true);
AddCXXSynthetic(
cpp_category_sp,
+ lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator,
+ "libc++ std::valarray synthetic children",
+ "^std::__[[:alnum:]]+::valarray<.+>$", stl_deref_flags, true);
+ AddCXXSynthetic(
+ cpp_category_sp,
lldb_private::formatters::LibcxxStdForwardListSyntheticFrontEndCreator,
"libc++ std::forward_list synthetic children",
"^std::__[[:alnum:]]+::forward_list<.+>$", stl_synth_flags, true);
@@ -871,6 +876,10 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
lldb_private::formatters::LibcxxContainerSummaryProvider,
"libc++ std::vector summary provider",
"^std::__[[:alnum:]]+::vector<.+>$", stl_summary_flags, true);
+ AddCXXSummary(cpp_category_sp,
+ lldb_private::formatters::LibcxxContainerSummaryProvider,
+ "libc++ std::valarray summary provider",
+ "^std::__[[:alnum:]]+::valarray<.+>$", stl_summary_flags, true);
AddCXXSummary(
cpp_category_sp, lldb_private::formatters::LibcxxContainerSummaryProvider,
"libc++ std::list summary provider",
diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp
index 6aeae97667c1..742017438bcf 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp
@@ -125,24 +125,24 @@ lldb::ValueObjectSP lldb_private::formatters::
return lldb::ValueObjectSP();
}
-bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::
- Update() {
+lldb::ChildCacheState
+lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() {
m_resume_ptr_sp.reset();
m_destroy_ptr_sp.reset();
m_promise_ptr_sp.reset();
ValueObjectSP valobj_sp = m_backend.GetNonSyntheticValue();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
lldb::addr_t frame_ptr_addr = GetCoroFramePtrFromHandle(valobj_sp);
if (frame_ptr_addr == 0 || frame_ptr_addr == LLDB_INVALID_ADDRESS)
- return false;
+ return lldb::ChildCacheState::eRefetch;
auto ts = valobj_sp->GetCompilerType().GetTypeSystem();
auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>();
if (!ast_ctx)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// Create the `resume` and `destroy` children.
lldb::TargetSP target_sp = m_backend.GetTargetSP();
@@ -165,7 +165,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::
CompilerType promise_type(
valobj_sp->GetCompilerType().GetTypeTemplateArgument(0));
if (!promise_type)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// Try to infer the promise_type if it was type-erased
if (promise_type.IsVoidType()) {
@@ -180,7 +180,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::
// If we don't know the promise type, we don't display the `promise` member.
// `CreateValueObjectFromAddress` below would fail for `void` types.
if (promise_type.IsVoidType()) {
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
// Add the `promise` member. We intentionally add `promise` as a pointer type
@@ -194,7 +194,7 @@ bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::
if (error.Success())
m_promise_ptr_sp = promisePtr->Clone(ConstString("promise"));
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h
index b26cc9ed6132..d38c7ecefa6e 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h
@@ -38,7 +38,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp
index 2876efc5c41a..ac3166385235 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp
@@ -33,7 +33,7 @@ public:
}
bool MightHaveChildren() override { return true; }
- bool Update() override;
+ lldb::ChildCacheState Update() override;
size_t CalculateNumChildren() override { return m_elements.size(); }
ValueObjectSP GetChildAtIndex(size_t idx) override;
@@ -78,13 +78,13 @@ llvm::StringRef GenericBitsetFrontEnd::GetDataContainerMemberName() {
llvm_unreachable("Unknown StdLib enum");
}
-bool GenericBitsetFrontEnd::Update() {
+lldb::ChildCacheState GenericBitsetFrontEnd::Update() {
m_elements.clear();
m_first = nullptr;
TargetSP target_sp = m_backend.GetTargetSP();
if (!target_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
size_t size = 0;
@@ -94,7 +94,7 @@ bool GenericBitsetFrontEnd::Update() {
m_elements.assign(size, ValueObjectSP());
m_first =
m_backend.GetChildMemberWithName(GetDataContainerMemberName()).get();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
ValueObjectSP GenericBitsetFrontEnd::GetChildAtIndex(size_t idx) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp
index 7415e915844f..57331eaa9868 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp
@@ -44,7 +44,7 @@ public:
size_t CalculateNumChildren() override { return m_has_value ? 1U : 0U; }
ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
private:
bool m_has_value = false;
@@ -61,7 +61,7 @@ GenericOptionalFrontend::GenericOptionalFrontend(ValueObject &valobj,
}
}
-bool GenericOptionalFrontend::Update() {
+lldb::ChildCacheState GenericOptionalFrontend::Update() {
ValueObjectSP engaged_sp;
if (m_stdlib == StdLib::LibCxx)
@@ -71,14 +71,14 @@ bool GenericOptionalFrontend::Update() {
->GetChildMemberWithName("_M_engaged");
if (!engaged_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// _M_engaged/__engaged is a bool flag and is true if the optional contains a
// value. Converting it to unsigned gives us a size of 1 if it contains a
// value and 0 if not.
m_has_value = engaged_sp->GetValueAsUnsigned(0) != 0;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
ValueObjectSP GenericOptionalFrontend::GetChildAtIndex(size_t _idx) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index d0bdbe1fd4d9..7893aa7cc1f9 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -231,21 +231,22 @@ lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
Update();
}
-bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
m_pair_sp.reset();
m_pair_ptr = nullptr;
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
TargetSP target_sp(valobj_sp->GetTargetSP());
if (!target_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// this must be a ValueObject* because it is a child of the ValueObject we
// are producing children for it if were a ValueObjectSP, we would end up
@@ -278,7 +279,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
auto __i_(valobj_sp->GetChildMemberWithName("__i_"));
if (!__i_) {
m_pair_ptr = nullptr;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
CompilerType pair_type(
__i_->GetCompilerType().GetTypeTemplateArgument(0));
@@ -290,7 +291,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr);
if (!pair_type) {
m_pair_ptr = nullptr;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
auto addr(m_pair_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS));
@@ -299,7 +300,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
auto ts = pair_type.GetTypeSystem();
auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>();
if (!ast_ctx)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// Mimick layout of std::__tree_iterator::__ptr_ and read it in
// from process memory.
@@ -328,14 +329,14 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
{"payload", pair_type}});
std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr);
if (!size)
- return false;
+ return lldb::ChildCacheState::eRefetch;
WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0));
ProcessSP process_sp(target_sp->GetProcessSP());
Status error;
process_sp->ReadMemory(addr, buffer_sp->GetBytes(),
buffer_sp->GetByteSize(), error);
if (error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(),
process_sp->GetAddressByteSize());
auto pair_sp = CreateValueObjectFromData(
@@ -347,7 +348,7 @@ bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
}
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
@@ -399,22 +400,22 @@ lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
Update();
}
-bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
- Update() {
+lldb::ChildCacheState lldb_private::formatters::
+ LibCxxUnorderedMapIteratorSyntheticFrontEnd::Update() {
m_pair_sp.reset();
m_iter_ptr = nullptr;
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
TargetSP target_sp(valobj_sp->GetTargetSP());
if (!target_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
auto exprPathOptions = ValueObject::GetValueForExpressionPathOptions()
.DontCheckDotVsArrowSyntax()
@@ -437,7 +438,7 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
auto iter_child(valobj_sp->GetChildMemberWithName("__i_"));
if (!iter_child) {
m_iter_ptr = nullptr;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
CompilerType node_type(iter_child->GetCompilerType()
@@ -455,19 +456,19 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr);
if (!pair_type) {
m_iter_ptr = nullptr;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
uint64_t addr = m_iter_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS);
m_iter_ptr = nullptr;
if (addr == 0 || addr == LLDB_INVALID_ADDRESS)
- return false;
+ return lldb::ChildCacheState::eRefetch;
auto ts = pair_type.GetTypeSystem();
auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>();
if (!ast_ctx)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// Mimick layout of std::__hash_iterator::__node_ and read it in
// from process memory.
@@ -489,14 +490,14 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
{"__value_", pair_type}});
std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr);
if (!size)
- return false;
+ return lldb::ChildCacheState::eRefetch;
WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0));
ProcessSP process_sp(target_sp->GetProcessSP());
Status error;
process_sp->ReadMemory(addr, buffer_sp->GetBytes(),
buffer_sp->GetByteSize(), error);
if (error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(),
process_sp->GetAddressByteSize());
auto pair_sp = CreateValueObjectFromData(
@@ -505,7 +506,7 @@ bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
m_pair_sp = pair_sp->GetChildAtIndex(2);
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
size_t lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
@@ -600,22 +601,23 @@ lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::GetChildAtIndex(
return lldb::ValueObjectSP();
}
-bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() {
m_cntrl = nullptr;
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
TargetSP target_sp(valobj_sp->GetTargetSP());
if (!target_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
lldb::ValueObjectSP cntrl_sp(valobj_sp->GetChildMemberWithName("__cntrl_"));
m_cntrl = cntrl_sp.get(); // need to store the raw pointer to avoid a circular
// dependency
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::
@@ -689,14 +691,15 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::GetChildAtIndex(
return lldb::ValueObjectSP();
}
-bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() {
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_"));
if (!ptr_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
// Retrieve the actual pointer and the deleter, and clone them to give them
// user-friendly names.
@@ -708,7 +711,7 @@ bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() {
if (deleter_sp)
m_deleter_sp = deleter_sp->Clone(ConstString("deleter"));
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::
@@ -1105,7 +1108,7 @@ bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider(
const std::time_t seconds = ptr_sp->GetValueAsSigned(0);
if (seconds < chrono_timestamp_min || seconds > chrono_timestamp_max)
- stream.Printf("timestamp=%" PRIu64 " s", static_cast<uint64_t>(seconds));
+ stream.Printf("timestamp=%" PRId64 " s", static_cast<int64_t>(seconds));
else {
std::array<char, 128> str;
std::size_t size =
@@ -1113,8 +1116,8 @@ bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider(
if (size == 0)
return false;
- stream.Printf("date/time=%s timestamp=%" PRIu64 " s", str.data(),
- static_cast<uint64_t>(seconds));
+ stream.Printf("date/time=%s timestamp=%" PRId64 " s", str.data(),
+ static_cast<int64_t>(seconds));
}
return true;
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
index 72da6b2426ef..d823fbd76222 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
@@ -91,7 +91,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -139,7 +139,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -170,7 +170,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -190,7 +190,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -220,6 +220,10 @@ LibcxxStdVectorSyntheticFrontEndCreator(CXXSyntheticChildren *,
lldb::ValueObjectSP);
SyntheticChildrenFrontEnd *
+LibcxxStdValarraySyntheticFrontEndCreator(CXXSyntheticChildren *,
+ lldb::ValueObjectSP);
+
+SyntheticChildrenFrontEnd *
LibcxxStdListSyntheticFrontEndCreator(CXXSyntheticChildren *,
lldb::ValueObjectSP);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp
index eacc60886c6e..c81b1e8012f6 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp
@@ -94,7 +94,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -110,12 +110,13 @@ lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::
LibcxxStdAtomicSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
: SyntheticChildrenFrontEnd(*valobj_sp) {}
-bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() {
ValueObjectSP atomic_value = GetLibCxxAtomicValue(m_backend);
if (atomic_value)
m_real_child = GetLibCxxAtomicValue(m_backend).get();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp
index bfd7b881a728..3c33f94f9237 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp
@@ -30,7 +30,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -82,13 +82,13 @@ lldb::ValueObjectSP lldb_private::formatters::
m_element_type);
}
-bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::
- Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::Update() {
m_start = nullptr;
m_num_elements = 0;
m_element_type = m_backend.GetCompilerType().GetTypeTemplateArgument(0);
if (!m_element_type.IsValid())
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr)) {
m_element_size = *size;
@@ -96,7 +96,7 @@ bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::
m_start = m_backend.GetChildMemberWithName("__begin_").get();
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
index 2e2e2a8b0515..e28ef818b10f 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
@@ -109,7 +109,7 @@ public:
return ExtractIndexFromString(name.GetCString());
}
bool MightHaveChildren() override { return true; }
- bool Update() override;
+ lldb::ChildCacheState Update() override;
protected:
AbstractListFrontEnd(ValueObject &valobj)
@@ -138,7 +138,7 @@ public:
size_t CalculateNumChildren() override;
ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
};
class ListFrontEnd : public AbstractListFrontEnd {
@@ -151,7 +151,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
private:
lldb::addr_t m_node_address = 0;
@@ -160,7 +160,7 @@ private:
} // end anonymous namespace
-bool AbstractListFrontEnd::Update() {
+lldb::ChildCacheState AbstractListFrontEnd::Update() {
m_loop_detected = 0;
m_count = UINT32_MAX;
m_head = nullptr;
@@ -180,10 +180,10 @@ bool AbstractListFrontEnd::Update() {
list_type = list_type.GetNonReferenceType();
if (list_type.GetNumTemplateArguments() == 0)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_element_type = list_type.GetTypeTemplateArgument(0);
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool AbstractListFrontEnd::HasLoop(size_t count) {
@@ -284,22 +284,22 @@ ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(size_t idx) {
m_element_type);
}
-bool ForwardListFrontEnd::Update() {
+lldb::ChildCacheState ForwardListFrontEnd::Update() {
AbstractListFrontEnd::Update();
Status err;
ValueObjectSP backend_addr(m_backend.AddressOf(err));
if (err.Fail() || !backend_addr)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__before_begin_"));
if (!impl_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
impl_sp = GetFirstValueOfLibCXXCompressedPair(*impl_sp);
if (!impl_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_head = impl_sp->GetChildMemberWithName("__next_").get();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
ListFrontEnd::ListFrontEnd(lldb::ValueObjectSP valobj_sp)
@@ -394,7 +394,7 @@ lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(size_t idx) {
m_element_type);
}
-bool ListFrontEnd::Update() {
+lldb::ChildCacheState ListFrontEnd::Update() {
AbstractListFrontEnd::Update();
m_tail = nullptr;
m_node_address = 0;
@@ -402,16 +402,16 @@ bool ListFrontEnd::Update() {
Status err;
ValueObjectSP backend_addr(m_backend.AddressOf(err));
if (err.Fail() || !backend_addr)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_node_address = backend_addr->GetValueAsUnsigned(0);
if (!m_node_address || m_node_address == LLDB_INVALID_ADDRESS)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__end_"));
if (!impl_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_head = impl_sp->GetChildMemberWithName("__next_").get();
m_tail = impl_sp->GetChildMemberWithName("__prev_").get();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
SyntheticChildrenFrontEnd *formatters::LibcxxStdListSyntheticFrontEndCreator(
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
index d3ee63a35e10..d208acfc9da4 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
@@ -181,7 +181,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -405,15 +405,16 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::GetChildAtIndex(
return potential_child_sp;
}
-bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() {
m_count = UINT32_MAX;
m_tree = m_root_node = nullptr;
m_iterators.clear();
m_tree = m_backend.GetChildMemberWithName("__tree_").get();
if (!m_tree)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_root_node = m_tree->GetChildMemberWithName("__begin_node_").get();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp
index c31940af0881..83f93b16fc9a 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp
@@ -26,7 +26,7 @@ public:
}
bool MightHaveChildren() override { return true; }
- bool Update() override;
+ lldb::ChildCacheState Update() override;
size_t CalculateNumChildren() override {
return m_container_sp ? m_container_sp->GetNumChildren() : 0;
@@ -47,13 +47,13 @@ private:
};
} // namespace
-bool QueueFrontEnd::Update() {
+lldb::ChildCacheState QueueFrontEnd::Update() {
m_container_sp = nullptr;
ValueObjectSP c_sp = m_backend.GetChildMemberWithName("c");
if (!c_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_container_sp = c_sp->GetSyntheticValue().get();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
SyntheticChildrenFrontEnd *
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp
index 6aeb557a95ff..c032d67c66cb 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp
@@ -38,7 +38,7 @@ public:
return m_range_sp;
}
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override { return true; }
@@ -59,17 +59,18 @@ lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd::
Update();
}
-bool lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd::
- Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdRangesRefViewSyntheticFrontEnd::Update() {
ValueObjectSP range_ptr =
GetChildMemberWithName(m_backend, {ConstString("__range_")});
if (!range_ptr)
- return false;
+ return lldb::ChildCacheState::eRefetch;
lldb_private::Status error;
m_range_sp = range_ptr->Dereference(error);
- return error.Success();
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
lldb_private::SyntheticChildrenFrontEnd *
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp
index ec062ed21ee4..4ddfaef9c0ad 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp
@@ -53,7 +53,7 @@ public:
// This function checks for a '__size' member to determine the number
// of elements in the span. If no such member exists, we get the size
// from the only other place it can be: the template argument.
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -93,12 +93,13 @@ lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::GetChildAtIndex(
m_element_type);
}
-bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() {
// Get element type.
ValueObjectSP data_type_finder_sp = GetChildMemberWithName(
m_backend, {ConstString("__data_"), ConstString("__data")});
if (!data_type_finder_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_element_type = data_type_finder_sp->GetCompilerType().GetPointeeType();
@@ -122,7 +123,7 @@ bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() {
}
}
- return true;
+ return lldb::ChildCacheState::eReuse;
}
bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp
index 9024ed4dba45..546871012d2b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp
@@ -25,7 +25,7 @@ public:
}
bool MightHaveChildren() override { return true; }
- bool Update() override;
+ lldb::ChildCacheState Update() override;
size_t CalculateNumChildren() override { return m_elements.size(); }
ValueObjectSP GetChildAtIndex(size_t idx) override;
@@ -40,7 +40,7 @@ private:
};
}
-bool TupleFrontEnd::Update() {
+lldb::ChildCacheState TupleFrontEnd::Update() {
m_elements.clear();
m_base = nullptr;
@@ -51,11 +51,11 @@ bool TupleFrontEnd::Update() {
base_sp = m_backend.GetChildMemberWithName("base_");
}
if (!base_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_base = base_sp.get();
m_elements.assign(base_sp->GetCompilerType().GetNumDirectBaseClasses(),
nullptr);
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
ValueObjectSP TupleFrontEnd::GetChildAtIndex(size_t idx) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
index 1a85d37ebf0c..4cac52f235a1 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
@@ -37,7 +37,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -193,41 +193,41 @@ lldb::ValueObjectSP lldb_private::formatters::
m_element_type);
}
-bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
- Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::Update() {
m_num_elements = 0;
m_next_element = nullptr;
m_elements_cache.clear();
ValueObjectSP table_sp = m_backend.GetChildMemberWithName("__table_");
if (!table_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP p2_sp = table_sp->GetChildMemberWithName("__p2_");
if (!p2_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*p2_sp);
if (!num_elements_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP p1_sp = table_sp->GetChildMemberWithName("__p1_");
if (!p1_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP value_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp);
if (!value_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_tree = value_sp->GetChildMemberWithName("__next_").get();
if (m_tree == nullptr)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_num_elements = num_elements_sp->GetValueAsUnsigned(0);
if (m_num_elements > 0)
m_next_element = m_tree;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp
new file mode 100644
index 000000000000..7c8fd25fd9f2
--- /dev/null
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp
@@ -0,0 +1,145 @@
+//===-- LibCxxValarray.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LibCxx.h"
+
+#include "lldb/Core/ValueObject.h"
+#include "lldb/DataFormatters/FormattersHelpers.h"
+#include <optional>
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::formatters;
+
+namespace lldb_private {
+namespace formatters {
+class LibcxxStdValarraySyntheticFrontEnd : public SyntheticChildrenFrontEnd {
+public:
+ LibcxxStdValarraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
+
+ ~LibcxxStdValarraySyntheticFrontEnd() override;
+
+ size_t CalculateNumChildren() override;
+
+ lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
+
+ lldb::ChildCacheState Update() override;
+
+ bool MightHaveChildren() override;
+
+ size_t GetIndexOfChildWithName(ConstString name) override;
+
+private:
+ /// A non-owning pointer to valarray's __begin_ member.
+ ValueObject *m_start = nullptr;
+ /// A non-owning pointer to valarray's __end_ member.
+ ValueObject *m_finish = nullptr;
+ /// The type of valarray's template argument T.
+ CompilerType m_element_type;
+ /// The sizeof valarray's template argument T.
+ uint32_t m_element_size = 0;
+};
+
+} // namespace formatters
+} // namespace lldb_private
+
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+ LibcxxStdValarraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
+ : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() {
+ if (valobj_sp)
+ Update();
+}
+
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+ ~LibcxxStdValarraySyntheticFrontEnd() {
+ // these need to stay around because they are child objects who will follow
+ // their parent's life cycle
+ // delete m_start;
+ // delete m_finish;
+}
+
+size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+ CalculateNumChildren() {
+ if (!m_start || !m_finish)
+ return 0;
+ uint64_t start_val = m_start->GetValueAsUnsigned(0);
+ uint64_t finish_val = m_finish->GetValueAsUnsigned(0);
+
+ if (start_val == 0 || finish_val == 0)
+ return 0;
+
+ if (start_val >= finish_val)
+ return 0;
+
+ size_t num_children = (finish_val - start_val);
+ if (num_children % m_element_size)
+ return 0;
+ return num_children / m_element_size;
+}
+
+lldb::ValueObjectSP
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::GetChildAtIndex(
+ size_t idx) {
+ if (!m_start || !m_finish)
+ return lldb::ValueObjectSP();
+
+ uint64_t offset = idx * m_element_size;
+ offset = offset + m_start->GetValueAsUnsigned(0);
+ StreamString name;
+ name.Printf("[%" PRIu64 "]", (uint64_t)idx);
+ return CreateValueObjectFromAddress(name.GetString(), offset,
+ m_backend.GetExecutionContextRef(),
+ m_element_type);
+}
+
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::Update() {
+ m_start = m_finish = nullptr;
+
+ CompilerType type = m_backend.GetCompilerType();
+ if (type.GetNumTemplateArguments() == 0)
+ return ChildCacheState::eRefetch;
+
+ m_element_type = type.GetTypeTemplateArgument(0);
+ if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr))
+ m_element_size = *size;
+
+ if (m_element_size == 0)
+ return ChildCacheState::eRefetch;
+
+ ValueObjectSP start = m_backend.GetChildMemberWithName("__begin_");
+ ValueObjectSP finish = m_backend.GetChildMemberWithName("__end_");
+
+ if (!start || !finish)
+ return ChildCacheState::eRefetch;
+
+ m_start = start.get();
+ m_finish = finish.get();
+
+ return ChildCacheState::eRefetch;
+}
+
+bool lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+ MightHaveChildren() {
+ return true;
+}
+
+size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::
+ GetIndexOfChildWithName(ConstString name) {
+ if (!m_start || !m_finish)
+ return std::numeric_limits<size_t>::max();
+ return ExtractIndexFromString(name.GetCString());
+}
+
+lldb_private::SyntheticChildrenFrontEnd *
+lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator(
+ CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
+ if (!valobj_sp)
+ return nullptr;
+ return new LibcxxStdValarraySyntheticFrontEnd(valobj_sp);
+}
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp
index e863ccca2be8..ecbb7cf0ca2b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp
@@ -204,7 +204,7 @@ public:
}
bool MightHaveChildren() override { return true; }
- bool Update() override;
+ lldb::ChildCacheState Update() override;
size_t CalculateNumChildren() override { return m_size; }
ValueObjectSP GetChildAtIndex(size_t idx) override;
@@ -213,24 +213,24 @@ private:
};
} // namespace
-bool VariantFrontEnd::Update() {
+lldb::ChildCacheState VariantFrontEnd::Update() {
m_size = 0;
ValueObjectSP impl_sp = formatters::GetChildMemberWithName(
m_backend, {ConstString("__impl_"), ConstString("__impl")});
if (!impl_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
LibcxxVariantIndexValidity validity = LibcxxVariantGetIndexValidity(impl_sp);
if (validity == LibcxxVariantIndexValidity::Invalid)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (validity == LibcxxVariantIndexValidity::NPos)
- return true;
+ return lldb::ChildCacheState::eReuse;
m_size = 1;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
ValueObjectSP VariantFrontEnd::GetChildAtIndex(size_t idx) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
index 9d88fcf99530..0c3c3f02b60c 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
@@ -29,7 +29,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -50,7 +50,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override { return true; }
@@ -116,17 +116,18 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::GetChildAtIndex(
m_element_type);
}
-bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() {
m_start = m_finish = nullptr;
ValueObjectSP data_type_finder_sp(
m_backend.GetChildMemberWithName("__end_cap_"));
if (!data_type_finder_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
data_type_finder_sp =
GetFirstValueOfLibCXXCompressedPair(*data_type_finder_sp);
if (!data_type_finder_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_element_type = data_type_finder_sp->GetCompilerType().GetPointeeType();
if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr)) {
@@ -138,7 +139,7 @@ bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() {
m_finish = m_backend.GetChildMemberWithName("__end_").get();
}
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::
@@ -226,29 +227,30 @@ lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::GetChildAtIndex(
}
}*/
-bool lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::Update() {
m_children.clear();
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
ValueObjectSP size_sp(valobj_sp->GetChildMemberWithName("__size_"));
if (!size_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_count = size_sp->GetValueAsUnsigned(0);
if (!m_count)
- return true;
+ return lldb::ChildCacheState::eReuse;
ValueObjectSP begin_sp(valobj_sp->GetChildMemberWithName("__begin_"));
if (!begin_sp) {
m_count = 0;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
m_base_data_address = begin_sp->GetValueAsUnsigned(0);
if (!m_base_data_address) {
m_count = 0;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
size_t lldb_private::formatters::LibcxxVectorBoolSyntheticFrontEnd::
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp
index 23af50fdb712..411551839e1e 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp
@@ -47,7 +47,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -68,7 +68,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -94,29 +94,29 @@ LibstdcppMapIteratorSyntheticFrontEnd::LibstdcppMapIteratorSyntheticFrontEnd(
Update();
}
-bool LibstdcppMapIteratorSyntheticFrontEnd::Update() {
+lldb::ChildCacheState LibstdcppMapIteratorSyntheticFrontEnd::Update() {
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
TargetSP target_sp(valobj_sp->GetTargetSP());
if (!target_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
bool is_64bit = (target_sp->GetArchitecture().GetAddressByteSize() == 8);
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
ValueObjectSP _M_node_sp(valobj_sp->GetChildMemberWithName("_M_node"));
if (!_M_node_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_pair_address = _M_node_sp->GetValueAsUnsigned(0);
if (m_pair_address == 0)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_pair_address += (is_64bit ? 32 : 16);
@@ -124,12 +124,12 @@ bool LibstdcppMapIteratorSyntheticFrontEnd::Update() {
if (my_type.GetNumTemplateArguments() >= 1) {
CompilerType pair_type = my_type.GetTypeTemplateArgument(0);
if (!pair_type)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_pair_type = pair_type;
} else
- return false;
+ return lldb::ChildCacheState::eRefetch;
- return true;
+ return lldb::ChildCacheState::eReuse;
}
size_t LibstdcppMapIteratorSyntheticFrontEnd::CalculateNumChildren() {
@@ -193,22 +193,22 @@ lldb_private::formatters::VectorIteratorSyntheticFrontEnd::
Update();
}
-bool VectorIteratorSyntheticFrontEnd::Update() {
+lldb::ChildCacheState VectorIteratorSyntheticFrontEnd::Update() {
m_item_sp.reset();
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP item_ptr =
formatters::GetChildMemberWithName(*valobj_sp, m_item_names);
if (!item_ptr)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (item_ptr->GetValueAsUnsigned(0) == 0)
- return false;
+ return lldb::ChildCacheState::eRefetch;
Status err;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
m_item_sp = CreateValueObjectFromAddress(
@@ -216,7 +216,7 @@ bool VectorIteratorSyntheticFrontEnd::Update() {
item_ptr->GetCompilerType().GetPointeeType());
if (err.Fail())
m_item_sp.reset();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
size_t VectorIteratorSyntheticFrontEnd::CalculateNumChildren() { return 1; }
@@ -390,23 +390,23 @@ LibStdcppSharedPtrSyntheticFrontEnd::GetChildAtIndex(size_t idx) {
return lldb::ValueObjectSP();
}
-bool LibStdcppSharedPtrSyntheticFrontEnd::Update() {
+lldb::ChildCacheState LibStdcppSharedPtrSyntheticFrontEnd::Update() {
auto backend = m_backend.GetSP();
if (!backend)
- return false;
+ return lldb::ChildCacheState::eRefetch;
auto valobj_sp = backend->GetNonSyntheticValue();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
auto ptr_obj_sp = valobj_sp->GetChildMemberWithName("_M_ptr");
if (!ptr_obj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_obj = ptr_obj_sp->Clone(ConstString("pointer")).get();
m_obj_obj = nullptr;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool LibStdcppSharedPtrSyntheticFrontEnd::MightHaveChildren() { return true; }
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp
index f1bfeae5099b..189f9561e52a 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp
@@ -30,7 +30,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -53,12 +53,12 @@ LibStdcppTupleSyntheticFrontEnd::LibStdcppTupleSyntheticFrontEnd(
Update();
}
-bool LibStdcppTupleSyntheticFrontEnd::Update() {
+lldb::ChildCacheState LibStdcppTupleSyntheticFrontEnd::Update() {
m_members.clear();
ValueObjectSP valobj_backend_sp = m_backend.GetSP();
if (!valobj_backend_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ValueObjectSP next_child_sp = valobj_backend_sp->GetNonSyntheticValue();
while (next_child_sp != nullptr) {
@@ -83,7 +83,7 @@ bool LibStdcppTupleSyntheticFrontEnd::Update() {
}
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool LibStdcppTupleSyntheticFrontEnd::MightHaveChildren() { return true; }
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp
index a84d641b57bc..3b0f6329d0e3 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp
@@ -30,7 +30,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -84,11 +84,11 @@ ValueObjectSP LibStdcppUniquePtrSyntheticFrontEnd::GetTuple() {
return obj_child_sp;
}
-bool LibStdcppUniquePtrSyntheticFrontEnd::Update() {
+lldb::ChildCacheState LibStdcppUniquePtrSyntheticFrontEnd::Update() {
ValueObjectSP tuple_sp = GetTuple();
if (!tuple_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
std::unique_ptr<SyntheticChildrenFrontEnd> tuple_frontend(
LibStdcppTupleSyntheticFrontEndCreator(nullptr, tuple_sp));
@@ -110,7 +110,7 @@ bool LibStdcppUniquePtrSyntheticFrontEnd::Update() {
}
m_obj_obj = nullptr;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool LibStdcppUniquePtrSyntheticFrontEnd::MightHaveChildren() { return true; }
diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
index f1a7e04bc9d1..64047dc53236 100644
--- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
+++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
@@ -1044,7 +1044,9 @@ public:
return lldb::ValueObjectSP();
}
- bool Update() override { return false; }
+ lldb::ChildCacheState Update() override {
+ return lldb::ChildCacheState::eRefetch;
+ }
bool MightHaveChildren() override { return false; }
diff --git a/lldb/source/Plugins/Language/ObjC/NSArray.cpp b/lldb/source/Plugins/Language/ObjC/NSArray.cpp
index 7d0004c572ed..09bf7a23d609 100644
--- a/lldb/source/Plugins/Language/ObjC/NSArray.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSArray.cpp
@@ -54,7 +54,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override = 0;
+ lldb::ChildCacheState Update() override = 0;
bool MightHaveChildren() override;
@@ -81,7 +81,7 @@ public:
~GenericNSArrayMSyntheticFrontEnd() override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
protected:
lldb::addr_t GetDataAddress() override;
@@ -218,7 +218,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -306,7 +306,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -323,7 +323,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -500,9 +500,8 @@ lldb_private::formatters::NSArrayMSyntheticFrontEndBase::GetChildAtIndex(
}
template <typename D32, typename D64>
-bool
-lldb_private::formatters::
- GenericNSArrayMSyntheticFrontEnd<D32, D64>::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::GenericNSArrayMSyntheticFrontEnd<D32, D64>::Update() {
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
delete m_data_32;
@@ -510,13 +509,13 @@ lldb_private::formatters::
delete m_data_64;
m_data_64 = nullptr;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
Status error;
error.Clear();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
if (m_ptr_size == 4) {
@@ -529,7 +528,8 @@ lldb_private::formatters::
error);
}
- return error.Success();
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool
@@ -641,9 +641,9 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd<D32, D64, Inline>::
}
template <typename D32, typename D64, bool Inline>
-bool
-lldb_private::formatters::GenericNSArrayISyntheticFrontEnd<D32, D64, Inline>::
- Update() {
+lldb::ChildCacheState
+lldb_private::formatters::GenericNSArrayISyntheticFrontEnd<D32, D64,
+ Inline>::Update() {
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
delete m_data_32;
@@ -651,13 +651,13 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd<D32, D64, Inline>::
delete m_data_64;
m_data_64 = nullptr;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
Status error;
error.Clear();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
if (m_ptr_size == 4) {
@@ -670,7 +670,8 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd<D32, D64, Inline>::
error);
}
- return error.Success();
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
template <typename D32, typename D64, bool Inline>
@@ -723,8 +724,9 @@ lldb_private::formatters::NSArray0SyntheticFrontEnd::CalculateNumChildren() {
return 0;
}
-bool lldb_private::formatters::NSArray0SyntheticFrontEnd::Update() {
- return false;
+lldb::ChildCacheState
+lldb_private::formatters::NSArray0SyntheticFrontEnd::Update() {
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSArray0SyntheticFrontEnd::MightHaveChildren() {
@@ -757,8 +759,9 @@ lldb_private::formatters::NSArray1SyntheticFrontEnd::CalculateNumChildren() {
return 1;
}
-bool lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() {
- return false;
+lldb::ChildCacheState
+lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() {
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSArray1SyntheticFrontEnd::MightHaveChildren() {
diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp
index d377ee74ccc0..9c252a98de83 100644
--- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp
@@ -107,7 +107,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -148,7 +148,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -180,7 +180,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -213,7 +213,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -234,7 +234,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -266,9 +266,9 @@ namespace Foundation1100 {
size_t CalculateNumChildren() override;
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
-
- bool Update() override;
-
+
+ lldb::ChildCacheState Update() override;
+
bool MightHaveChildren() override;
size_t GetIndexOfChildWithName(ConstString name) override;
@@ -613,7 +613,8 @@ size_t lldb_private::formatters::NSDictionaryISyntheticFrontEnd::
return (m_data_32 ? m_data_32->_used : m_data_64->_used);
}
-bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() {
m_children.clear();
delete m_data_32;
m_data_32 = nullptr;
@@ -622,13 +623,13 @@ bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() {
m_ptr_size = 0;
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
Status error;
error.Clear();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
m_order = process_sp->GetByteOrder();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
@@ -642,9 +643,9 @@ bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() {
error);
}
if (error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_data_ptr = data_location + m_ptr_size;
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd::
@@ -750,20 +751,23 @@ size_t lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::
return m_hashtable.GetCount();
}
-bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() {
m_children.clear();
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
m_order = process_sp->GetByteOrder();
- return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref);
+ return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref)
+ ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::
@@ -881,30 +885,33 @@ size_t lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::
return m_size;
}
-bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() {
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
Status error;
error.Clear();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
m_order = process_sp->GetByteOrder();
uint64_t valobj_addr = valobj_sp->GetValueAsUnsigned(0);
m_size = process_sp->ReadUnsignedIntegerFromMemory(
valobj_addr + 2 * m_ptr_size, m_ptr_size, 0, error);
if (error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_keys_ptr =
process_sp->ReadPointerFromMemory(valobj_addr + 3 * m_ptr_size, error);
if (error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_objects_ptr =
process_sp->ReadPointerFromMemory(valobj_addr + 4 * m_ptr_size, error);
- return !error.Fail();
+
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::
@@ -992,9 +999,10 @@ size_t lldb_private::formatters::NSDictionary1SyntheticFrontEnd::
return 1;
}
-bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() {
m_pair.reset();
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd::
@@ -1087,9 +1095,9 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd<D32,D64>::Calcul
}
template <typename D32, typename D64>
-bool
-lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd<D32,D64>::
- Update() {
+lldb::ChildCacheState
+lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd<D32,
+ D64>::Update() {
m_children.clear();
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
@@ -1098,13 +1106,13 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd<D32,D64>::
delete m_data_64;
m_data_64 = nullptr;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
Status error;
error.Clear();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
m_order = process_sp->GetByteOrder();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
@@ -1118,7 +1126,8 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd<D32,D64>::
error);
}
- return error.Success();
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
template <typename D32, typename D64>
@@ -1249,9 +1258,8 @@ lldb_private::formatters::Foundation1100::
return (m_data_32 ? m_data_32->_used : m_data_64->_used);
}
-bool
-lldb_private::formatters::Foundation1100::
- NSDictionaryMSyntheticFrontEnd::Update() {
+lldb::ChildCacheState lldb_private::formatters::Foundation1100::
+ NSDictionaryMSyntheticFrontEnd::Update() {
m_children.clear();
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
@@ -1260,13 +1268,13 @@ lldb_private::formatters::Foundation1100::
delete m_data_64;
m_data_64 = nullptr;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
Status error;
error.Clear();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
m_order = process_sp->GetByteOrder();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
@@ -1280,7 +1288,8 @@ lldb_private::formatters::Foundation1100::
error);
}
- return error.Success();
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool
diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp
index 99eeb2d5092f..ce52ae542a50 100644
--- a/lldb/source/Plugins/Language/ObjC/NSError.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp
@@ -133,17 +133,17 @@ public:
return m_child_sp;
}
- bool Update() override {
+ lldb::ChildCacheState Update() override {
m_child_ptr = nullptr;
m_child_sp.reset();
ProcessSP process_sp(m_backend.GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
lldb::addr_t userinfo_location = DerefToNSErrorPointer(m_backend);
if (userinfo_location == LLDB_INVALID_ADDRESS)
- return false;
+ return lldb::ChildCacheState::eRefetch;
size_t ptr_size = process_sp->GetAddressByteSize();
@@ -152,17 +152,17 @@ public:
lldb::addr_t userinfo =
process_sp->ReadPointerFromMemory(userinfo_location, error);
if (userinfo == LLDB_INVALID_ADDRESS || error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
InferiorSizedWord isw(userinfo, *process_sp);
TypeSystemClangSP scratch_ts_sp =
ScratchTypeSystemClang::GetForTarget(process_sp->GetTarget());
if (!scratch_ts_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_child_sp = CreateValueObjectFromData(
"_userInfo", isw.GetAsData(process_sp->GetByteOrder()),
m_backend.GetExecutionContextRef(),
scratch_ts_sp->GetBasicType(lldb::eBasicTypeObjCID));
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool MightHaveChildren() override { return true; }
diff --git a/lldb/source/Plugins/Language/ObjC/NSException.cpp b/lldb/source/Plugins/Language/ObjC/NSException.cpp
index 29805bb2d5fe..e8011e5d2ca0 100644
--- a/lldb/source/Plugins/Language/ObjC/NSException.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSException.cpp
@@ -137,14 +137,17 @@ public:
return lldb::ValueObjectSP();
}
- bool Update() override {
+ lldb::ChildCacheState Update() override {
m_name_sp.reset();
m_reason_sp.reset();
m_userinfo_sp.reset();
m_reserved_sp.reset();
- return ExtractFields(m_backend, &m_name_sp, &m_reason_sp, &m_userinfo_sp,
- &m_reserved_sp);
+ const auto ret = ExtractFields(m_backend, &m_name_sp, &m_reason_sp,
+ &m_userinfo_sp, &m_reserved_sp);
+
+ return ret ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool MightHaveChildren() override { return true; }
diff --git a/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp b/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp
index 2a4ce80224e9..69e6ab1055d8 100644
--- a/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSIndexPath.cpp
@@ -46,17 +46,17 @@ public:
return m_impl.GetIndexAtIndex(idx, m_uint_star_type);
}
- bool Update() override {
+ lldb::ChildCacheState Update() override {
m_impl.Clear();
auto type_system = m_backend.GetCompilerType().GetTypeSystem();
if (!type_system)
- return false;
+ return lldb::ChildCacheState::eRefetch;
auto ast = ScratchTypeSystemClang::GetForTarget(
*m_backend.GetExecutionContextRef().GetTargetSP());
if (!ast)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_uint_star_type = ast->GetPointerSizedIntType(false);
@@ -65,18 +65,18 @@ public:
ProcessSP process_sp = m_backend.GetProcessSP();
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ObjCLanguageRuntime *runtime = ObjCLanguageRuntime::Get(*process_sp);
if (!runtime)
- return false;
+ return lldb::ChildCacheState::eRefetch;
ObjCLanguageRuntime::ClassDescriptorSP descriptor(
runtime->GetClassDescriptor(m_backend));
if (!descriptor.get() || !descriptor->IsValid())
- return false;
+ return lldb::ChildCacheState::eRefetch;
uint64_t info_bits(0), value_bits(0), payload(0);
@@ -119,7 +119,7 @@ public:
}
}
}
- return false;
+ return lldb::ChildCacheState::eRefetch;
}
bool MightHaveChildren() override { return m_impl.m_mode != Mode::Invalid; }
diff --git a/lldb/source/Plugins/Language/ObjC/NSSet.cpp b/lldb/source/Plugins/Language/ObjC/NSSet.cpp
index ed1751cc128c..ede64852d9a8 100644
--- a/lldb/source/Plugins/Language/ObjC/NSSet.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSSet.cpp
@@ -50,7 +50,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -88,7 +88,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -121,7 +121,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -237,7 +237,7 @@ public:
lldb::ValueObjectSP GetChildAtIndex(size_t idx) override;
- bool Update() override;
+ lldb::ChildCacheState Update() override;
bool MightHaveChildren() override;
@@ -426,7 +426,8 @@ lldb_private::formatters::NSSetISyntheticFrontEnd::CalculateNumChildren() {
return (m_data_32 ? m_data_32->_used : m_data_64->_used);
}
-bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::NSSetISyntheticFrontEnd::Update() {
m_children.clear();
delete m_data_32;
m_data_32 = nullptr;
@@ -435,13 +436,13 @@ bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() {
m_ptr_size = 0;
ValueObjectSP valobj_sp = m_backend.GetSP();
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
Status error;
@@ -455,9 +456,9 @@ bool lldb_private::formatters::NSSetISyntheticFrontEnd::Update() {
error);
}
if (error.Fail())
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_data_ptr = data_location + m_ptr_size;
- return true;
+ return lldb::ChildCacheState::eReuse;
}
bool lldb_private::formatters::NSSetISyntheticFrontEnd::MightHaveChildren() {
@@ -561,20 +562,23 @@ lldb_private::formatters::NSCFSetSyntheticFrontEnd::CalculateNumChildren() {
return m_hashtable.GetCount();
}
-bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() {
m_children.clear();
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
m_order = process_sp->GetByteOrder();
- return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref);
+ return m_hashtable.Update(valobj_sp->GetValueAsUnsigned(0), m_exe_ctx_ref)
+ ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::MightHaveChildren() {
@@ -701,9 +705,8 @@ lldb_private::formatters::
}
template <typename D32, typename D64>
-bool
-lldb_private::formatters::
- GenericNSSetMSyntheticFrontEnd<D32, D64>::Update() {
+lldb::ChildCacheState
+lldb_private::formatters::GenericNSSetMSyntheticFrontEnd<D32, D64>::Update() {
m_children.clear();
ValueObjectSP valobj_sp = m_backend.GetSP();
m_ptr_size = 0;
@@ -712,13 +715,13 @@ lldb_private::formatters::
delete m_data_64;
m_data_64 = nullptr;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
if (!valobj_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_exe_ctx_ref = valobj_sp->GetExecutionContextRef();
lldb::ProcessSP process_sp(valobj_sp->GetProcessSP());
if (!process_sp)
- return false;
+ return lldb::ChildCacheState::eRefetch;
m_ptr_size = process_sp->GetAddressByteSize();
uint64_t data_location = valobj_sp->GetValueAsUnsigned(0) + m_ptr_size;
Status error;
@@ -731,7 +734,8 @@ lldb_private::formatters::
process_sp->ReadMemory(data_location, m_data_64, sizeof(D64),
error);
}
- return error.Success();
+ return error.Success() ? lldb::ChildCacheState::eReuse
+ : lldb::ChildCacheState::eRefetch;
}
template <typename D32, typename D64>
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteAppleXR.h b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteAppleXR.h
index 4fed6e15eda3..2fbb6caad811 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteAppleXR.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteAppleXR.h
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLDB_SOURCE_PLUGINS_PLATFORM_MACOSX_PLATFORMREMOTEAPPLEXR_H
+#define LLDB_SOURCE_PLUGINS_PLATFORM_MACOSX_PLATFORMREMOTEAPPLEXR_H
+
#include "PlatformRemoteDarwinDevice.h"
namespace lldb_private {
@@ -36,3 +39,5 @@ protected:
llvm::StringRef GetPlatformName() override;
};
} // namespace lldb_private
+
+#endif // LLDB_SOURCE_PLUGINS_PLATFORM_MACOSX_PLATFORMREMOTEAPPLEXR_H
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
index 19e0986ace31..9c620e4807e3 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
@@ -48,20 +48,38 @@ static Status EnsureFDFlags(int fd, int flags) {
return error;
}
+static Status CanTrace() {
+ int proc_debug, ret;
+ size_t len = sizeof(proc_debug);
+ ret = ::sysctlbyname("security.bsd.unprivileged_proc_debug", &proc_debug,
+ &len, nullptr, 0);
+ if (ret != 0)
+ return Status("sysctlbyname() security.bsd.unprivileged_proc_debug failed");
+
+ if (proc_debug < 1)
+ return Status(
+ "process debug disabled by security.bsd.unprivileged_proc_debug oid");
+
+ return {};
+}
+
// Public Static Methods
llvm::Expected<std::unique_ptr<NativeProcessProtocol>>
NativeProcessFreeBSD::Manager::Launch(ProcessLaunchInfo &launch_info,
NativeDelegate &native_delegate) {
Log *log = GetLog(POSIXLog::Process);
-
Status status;
+
::pid_t pid = ProcessLauncherPosixFork()
.LaunchProcess(launch_info, status)
.GetProcessId();
LLDB_LOG(log, "pid = {0:x}", pid);
if (status.Fail()) {
+ auto error = CanTrace();
LLDB_LOG(log, "failed to launch process: {0}", status);
+ if (status.Fail())
+ return error.ToError();
return status.ToError();
}
@@ -392,8 +410,11 @@ Status NativeProcessFreeBSD::PtraceWrapper(int req, lldb::pid_t pid, void *addr,
ret =
ptrace(req, static_cast<::pid_t>(pid), static_cast<caddr_t>(addr), data);
- if (ret == -1)
- error.SetErrorToErrno();
+ if (ret == -1) {
+ error = CanTrace();
+ if (error.Success())
+ error.SetErrorToErrno();
+ }
if (result)
*result = ret;
@@ -707,8 +728,12 @@ Status NativeProcessFreeBSD::SetBreakpoint(lldb::addr_t addr, uint32_t size,
Status NativeProcessFreeBSD::GetLoadedModuleFileSpec(const char *module_path,
FileSpec &file_spec) {
Status error = PopulateMemoryRegionCache();
- if (error.Fail())
+ if (error.Fail()) {
+ auto status = CanTrace();
+ if (status.Fail())
+ return status;
return error;
+ }
FileSpec module_file_spec(module_path);
FileSystem::Instance().Resolve(module_file_spec);
@@ -729,8 +754,12 @@ NativeProcessFreeBSD::GetFileLoadAddress(const llvm::StringRef &file_name,
lldb::addr_t &load_addr) {
load_addr = LLDB_INVALID_ADDRESS;
Status error = PopulateMemoryRegionCache();
- if (error.Fail())
+ if (error.Fail()) {
+ auto status = CanTrace();
+ if (status.Fail())
+ return status;
return error;
+ }
FileSpec file(file_name);
for (const auto &it : m_mem_region_cache) {
diff --git a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
index 997b59085110..abfbdb1a970d 100644
--- a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
+++ b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
@@ -50,7 +50,7 @@ private:
class ProcessFreeBSDKernelKVM : public ProcessFreeBSDKernel {
public:
ProcessFreeBSDKernelKVM(lldb::TargetSP target_sp, lldb::ListenerSP listener,
- kvm_t *fvc);
+ kvm_t *fvc, const FileSpec &core_file);
~ProcessFreeBSDKernelKVM();
diff --git a/lldb/source/Symbol/SymbolLocator.cpp b/lldb/source/Symbol/SymbolLocator.cpp
index 918f13ed9c19..93a5bc428b61 100644
--- a/lldb/source/Symbol/SymbolLocator.cpp
+++ b/lldb/source/Symbol/SymbolLocator.cpp
@@ -10,6 +10,7 @@
#include "lldb/Core/Debugger.h"
#include "lldb/Core/PluginManager.h"
+#include "lldb/Host/Host.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/ThreadPool.h"
@@ -18,12 +19,10 @@ using namespace lldb;
using namespace lldb_private;
void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) {
- if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup())
- return;
-
static llvm::SmallSet<UUID, 8> g_seen_uuids;
static std::mutex g_mutex;
- Debugger::GetThreadPool().async([=]() {
+
+ auto lookup = [=]() {
{
std::lock_guard<std::mutex> guard(g_mutex);
if (g_seen_uuids.count(uuid))
@@ -36,12 +35,23 @@ void SymbolLocator::DownloadSymbolFileAsync(const UUID &uuid) {
module_spec.GetUUID() = uuid;
if (!PluginManager::DownloadObjectAndSymbolFile(module_spec, error,
/*force_lookup=*/true,
- /*copy_executable=*/false))
+ /*copy_executable=*/true))
return;
if (error.Fail())
return;
Debugger::ReportSymbolChange(module_spec);
- });
+ };
+
+ switch (ModuleList::GetGlobalModuleListProperties().GetSymbolAutoDownload()) {
+ case eSymbolDownloadOff:
+ break;
+ case eSymbolDownloadBackground:
+ Debugger::GetThreadPool().async(lookup);
+ break;
+ case eSymbolDownloadForeground:
+ lookup();
+ break;
+ };
}
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
index 9706f9e94e92..c30631589596 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
@@ -11,7 +11,7 @@ from lldbsuite.test import lldbutil
class LibcxxChronoDataFormatterTestCase(TestBase):
@add_test_categories(["libc++"])
- @skipIf(compiler="clang", compiler_version=["<", "11.0"])
+ @skipIf(compiler="clang", compiler_version=["<", "17.0"])
def test_with_run_command(self):
"""Test that that file and class static variables display correctly."""
self.build()
@@ -54,17 +54,16 @@ class LibcxxChronoDataFormatterTestCase(TestBase):
substrs=["ss_0 = date/time=1970-01-01T00:00:00Z timestamp=0 s"],
)
- # FIXME disabled temporarily, macOS is printing this as an unsigned?
- #self.expect(
- # "frame variable ss_neg_date_time",
- # substrs=[
- # "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s"
- # ],
- #)
- #self.expect(
- # "frame variable ss_neg_seconds",
- # substrs=["ss_neg_seconds = timestamp=-1096193779201 s"],
- #)
+ self.expect(
+ "frame variable ss_neg_date_time",
+ substrs=[
+ "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s"
+ ],
+ )
+ self.expect(
+ "frame variable ss_neg_seconds",
+ substrs=["ss_neg_seconds = timestamp=-1096193779201 s"],
+ )
self.expect(
"frame variable ss_pos_date_time",
@@ -77,11 +76,10 @@ class LibcxxChronoDataFormatterTestCase(TestBase):
substrs=["ss_pos_seconds = timestamp=971890963200 s"],
)
- # FIXME disabled temporarily, macOS is printing this as an unsigned?
- #self.expect(
- # "frame variable ss_min",
- # substrs=["ss_min = timestamp=-9223372036854775808 s"],
- #)
+ self.expect(
+ "frame variable ss_min",
+ substrs=["ss_min = timestamp=-9223372036854775808 s"],
+ )
self.expect(
"frame variable ss_max",
substrs=["ss_max = timestamp=9223372036854775807 s"],
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile
new file mode 100644
index 000000000000..c5df567e01a2
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/Makefile
@@ -0,0 +1,5 @@
+CXX_SOURCES := main.cpp
+
+USE_LIBCPP := 1
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
new file mode 100644
index 000000000000..7b54b3485d04
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
@@ -0,0 +1,78 @@
+"""
+Test lldb data formatter subsystem.
+"""
+
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class LibcxxChronoDataFormatterTestCase(TestBase):
+ @add_test_categories(["libc++"])
+ def test_with_run_command(self):
+        """Test that file and class static variables display correctly."""
+ self.build()
+ (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+ self, "break here", lldb.SBFileSpec("main.cpp", False)
+ )
+
+ self.expect(
+ "frame variable va_int",
+ substrs=[
+ "va_int = size=4",
+ "[0] = 0",
+ "[1] = 0",
+ "[2] = 0",
+ "[3] = 0",
+ "}",
+ ],
+ )
+
+ lldbutil.continue_to_breakpoint(process, bkpt)
+ self.expect(
+ "frame variable va_int",
+ substrs=[
+ "va_int = size=4",
+ "[0] = 1",
+ "[1] = 12",
+ "[2] = 123",
+ "[3] = 1234",
+ "}",
+ ],
+ )
+
+ # check access-by-index
+ self.expect("frame variable va_int[0]", substrs=["1"])
+ self.expect("frame variable va_int[1]", substrs=["12"])
+ self.expect("frame variable va_int[2]", substrs=["123"])
+ self.expect("frame variable va_int[3]", substrs=["1234"])
+ self.expect(
+ "frame variable va_int[4]",
+ error=True,
+ substrs=['array index 4 is not valid for "(valarray<int>) va_int"'],
+ )
+
+ self.expect(
+ "frame variable va_double",
+ substrs=[
+ "va_double = size=4",
+ "[0] = 1",
+ "[1] = 0.5",
+ "[2] = 0.25",
+ "[3] = 0.125",
+ "}",
+ ],
+ )
+
+ # check access-by-index
+ self.expect("frame variable va_double[0]", substrs=["1"])
+ self.expect("frame variable va_double[1]", substrs=["0.5"])
+ self.expect("frame variable va_double[2]", substrs=["0.25"])
+ self.expect("frame variable va_double[3]", substrs=["0.125"])
+ self.expect(
+ "frame variable va_double[4]",
+ error=True,
+ substrs=['array index 4 is not valid for "(valarray<double>) va_double"'],
+ )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
new file mode 100644
index 000000000000..f32921e16fa1
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
@@ -0,0 +1,17 @@
+#include <iostream>
+#include <valarray>
+
+int main() {
+
+ std::valarray<int> va_int(4);
+ std::cout << "break here";
+
+ va_int[0] = 1;
+ va_int[1] = 12;
+ va_int[2] = 123;
+ va_int[3] = 1234;
+
+ std::valarray<double> va_double({1.0, 0.5, 0.25, 0.125});
+
+ std::cout << "break here\n";
+}
diff --git a/lldb/test/API/macosx/universal/Makefile b/lldb/test/API/macosx/universal/Makefile
index 8712fdecf566..7d4762f24087 100644
--- a/lldb/test/API/macosx/universal/Makefile
+++ b/lldb/test/API/macosx/universal/Makefile
@@ -14,7 +14,7 @@ testit.x86_64: testit.x86_64.o
$(CC) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o testit.x86_64 $<
testit.x86_64h.o: main.c
- $(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64h-apple-macosx10.9-apple-macosx10.9-apple-macosx10.9-apple-macosx10.9 -c -o testit.x86_64h.o $<
+ $(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64h-apple-macosx10.9 -c -o testit.x86_64h.o $<
testit.x86_64.o: main.c
$(CC) -isysroot $(SDKROOT) -g -O0 -target x86_64-apple-macosx10.9 -c -o testit.x86_64.o $<
diff --git a/lldb/tools/lldb-dap/Breakpoint.cpp b/lldb/tools/lldb-dap/Breakpoint.cpp
new file mode 100644
index 000000000000..0c33d4b114d7
--- /dev/null
+++ b/lldb/tools/lldb-dap/Breakpoint.cpp
@@ -0,0 +1,76 @@
+//===-- Breakpoint.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Breakpoint.h"
+#include "DAP.h"
+#include "JSONUtils.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace lldb_dap;
+
+void Breakpoint::SetCondition() { bp.SetCondition(condition.c_str()); }
+
+void Breakpoint::SetHitCondition() {
+ uint64_t hitCount = 0;
+ if (llvm::to_integer(hitCondition, hitCount))
+ bp.SetIgnoreCount(hitCount - 1);
+}
+
+void Breakpoint::CreateJsonObject(llvm::json::Object &object) {
+ // Each breakpoint location is treated as a separate breakpoint for VS code.
+ // They don't have the notion of a single breakpoint with multiple locations.
+ if (!bp.IsValid())
+ return;
+ object.try_emplace("verified", bp.GetNumResolvedLocations() > 0);
+ object.try_emplace("id", bp.GetID());
+ // VS Code DAP doesn't currently allow one breakpoint to have multiple
+ // locations so we just report the first one. If we report all locations
+ // then the IDE starts showing the wrong line numbers and locations for
+ // other source file and line breakpoints in the same file.
+
+ // Below we search for the first resolved location in a breakpoint and report
+ // this as the breakpoint location since it will have a complete location
+ // that is at least loaded in the current process.
+ lldb::SBBreakpointLocation bp_loc;
+ const auto num_locs = bp.GetNumLocations();
+ for (size_t i = 0; i < num_locs; ++i) {
+ bp_loc = bp.GetLocationAtIndex(i);
+ if (bp_loc.IsResolved())
+ break;
+ }
+ // If no locations are resolved, use the first location.
+ if (!bp_loc.IsResolved())
+ bp_loc = bp.GetLocationAtIndex(0);
+ auto bp_addr = bp_loc.GetAddress();
+
+ if (bp_addr.IsValid()) {
+ std::string formatted_addr =
+ "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(g_dap.target));
+ object.try_emplace("instructionReference", formatted_addr);
+ auto line_entry = bp_addr.GetLineEntry();
+ const auto line = line_entry.GetLine();
+ if (line != UINT32_MAX)
+ object.try_emplace("line", line);
+ const auto column = line_entry.GetColumn();
+ if (column != 0)
+ object.try_emplace("column", column);
+ object.try_emplace("source", CreateSource(line_entry));
+ }
+}
+
+bool Breakpoint::MatchesName(const char *name) { return bp.MatchesName(name); }
+
+void Breakpoint::SetBreakpoint() {
+ // See comments in BreakpointBase::GetBreakpointLabel() for details of why
+ // we add a label to our breakpoints.
+ bp.AddName(GetBreakpointLabel());
+ if (!condition.empty())
+ SetCondition();
+ if (!hitCondition.empty())
+ SetHitCondition();
+}
diff --git a/lldb/tools/lldb-dap/Breakpoint.h b/lldb/tools/lldb-dap/Breakpoint.h
new file mode 100644
index 000000000000..47a9d9c59ae2
--- /dev/null
+++ b/lldb/tools/lldb-dap/Breakpoint.h
@@ -0,0 +1,33 @@
+//===-- Breakpoint.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TOOLS_LLDB_DAP_BREAKPOINT_H
+#define LLDB_TOOLS_LLDB_DAP_BREAKPOINT_H
+
+#include "BreakpointBase.h"
+
+namespace lldb_dap {
+
+struct Breakpoint : public BreakpointBase {
+ // The LLDB breakpoint associated with this source breakpoint
+ lldb::SBBreakpoint bp;
+
+ Breakpoint() = default;
+ Breakpoint(const llvm::json::Object &obj) : BreakpointBase(obj){};
+ Breakpoint(lldb::SBBreakpoint bp) : bp(bp) {}
+
+ void SetCondition() override;
+ void SetHitCondition() override;
+ void CreateJsonObject(llvm::json::Object &object) override;
+
+ bool MatchesName(const char *name);
+ void SetBreakpoint();
+};
+} // namespace lldb_dap
+
+#endif
diff --git a/lldb/tools/lldb-dap/BreakpointBase.cpp b/lldb/tools/lldb-dap/BreakpointBase.cpp
index fb4b27fbe315..519729f5519f 100644
--- a/lldb/tools/lldb-dap/BreakpointBase.cpp
+++ b/lldb/tools/lldb-dap/BreakpointBase.cpp
@@ -8,306 +8,13 @@
#include "BreakpointBase.h"
#include "DAP.h"
-#include "JSONUtils.h"
#include "llvm/ADT/StringExtras.h"
using namespace lldb_dap;
BreakpointBase::BreakpointBase(const llvm::json::Object &obj)
: condition(std::string(GetString(obj, "condition"))),
- hitCondition(std::string(GetString(obj, "hitCondition"))),
- logMessage(std::string(GetString(obj, "logMessage"))) {}
-
-void BreakpointBase::SetCondition() { bp.SetCondition(condition.c_str()); }
-
-void BreakpointBase::SetHitCondition() {
- uint64_t hitCount = 0;
- if (llvm::to_integer(hitCondition, hitCount))
- bp.SetIgnoreCount(hitCount - 1);
-}
-
-lldb::SBError BreakpointBase::AppendLogMessagePart(llvm::StringRef part,
- bool is_expr) {
- if (is_expr) {
- logMessageParts.emplace_back(part, is_expr);
- } else {
- std::string formatted;
- lldb::SBError error = FormatLogText(part, formatted);
- if (error.Fail())
- return error;
- logMessageParts.emplace_back(formatted, is_expr);
- }
- return lldb::SBError();
-}
-
-// TODO: consolidate this code with the implementation in
-// FormatEntity::ParseInternal().
-lldb::SBError BreakpointBase::FormatLogText(llvm::StringRef text,
- std::string &formatted) {
- lldb::SBError error;
- while (!text.empty()) {
- size_t backslash_pos = text.find_first_of('\\');
- if (backslash_pos == std::string::npos) {
- formatted += text.str();
- return error;
- }
-
- formatted += text.substr(0, backslash_pos).str();
- // Skip the characters before and including '\'.
- text = text.drop_front(backslash_pos + 1);
-
- if (text.empty()) {
- error.SetErrorString(
- "'\\' character was not followed by another character");
- return error;
- }
-
- const char desens_char = text[0];
- text = text.drop_front(); // Skip the desensitized char character
- switch (desens_char) {
- case 'a':
- formatted.push_back('\a');
- break;
- case 'b':
- formatted.push_back('\b');
- break;
- case 'f':
- formatted.push_back('\f');
- break;
- case 'n':
- formatted.push_back('\n');
- break;
- case 'r':
- formatted.push_back('\r');
- break;
- case 't':
- formatted.push_back('\t');
- break;
- case 'v':
- formatted.push_back('\v');
- break;
- case '\'':
- formatted.push_back('\'');
- break;
- case '\\':
- formatted.push_back('\\');
- break;
- case '0':
- // 1 to 3 octal chars
- {
- if (text.empty()) {
- error.SetErrorString("missing octal number following '\\0'");
- return error;
- }
-
- // Make a string that can hold onto the initial zero char, up to 3
- // octal digits, and a terminating NULL.
- char oct_str[5] = {0, 0, 0, 0, 0};
-
- size_t i;
- for (i = 0;
- i < text.size() && i < 4 && (text[i] >= '0' && text[i] <= '7');
- ++i) {
- oct_str[i] = text[i];
- }
-
- text = text.drop_front(i);
- unsigned long octal_value = ::strtoul(oct_str, nullptr, 8);
- if (octal_value <= UINT8_MAX) {
- formatted.push_back((char)octal_value);
- } else {
- error.SetErrorString("octal number is larger than a single byte");
- return error;
- }
- }
- break;
-
- case 'x': {
- if (text.empty()) {
- error.SetErrorString("missing hex number following '\\x'");
- return error;
- }
- // hex number in the text
- if (isxdigit(text[0])) {
- // Make a string that can hold onto two hex chars plus a
- // NULL terminator
- char hex_str[3] = {0, 0, 0};
- hex_str[0] = text[0];
-
- text = text.drop_front();
-
- if (!text.empty() && isxdigit(text[0])) {
- hex_str[1] = text[0];
- text = text.drop_front();
- }
-
- unsigned long hex_value = strtoul(hex_str, nullptr, 16);
- if (hex_value <= UINT8_MAX) {
- formatted.push_back((char)hex_value);
- } else {
- error.SetErrorString("hex number is larger than a single byte");
- return error;
- }
- } else {
- formatted.push_back(desens_char);
- }
- break;
- }
-
- default:
- // Just desensitize any other character by just printing what came
- // after the '\'
- formatted.push_back(desens_char);
- break;
- }
- }
- return error;
-}
-
-// logMessage will be divided into array of LogMessagePart as two kinds:
-// 1. raw print text message, and
-// 2. interpolated expression for evaluation which is inside matching curly
-// braces.
-//
-// The function tries to parse logMessage into a list of LogMessageParts
-// for easy later access in BreakpointHitCallback.
-void BreakpointBase::SetLogMessage() {
- logMessageParts.clear();
-
- // Contains unmatched open curly braces indices.
- std::vector<int> unmatched_curly_braces;
-
- // Contains all matched curly braces in logMessage.
- // Loop invariant: matched_curly_braces_ranges are sorted by start index in
- // ascending order without any overlap between them.
- std::vector<std::pair<int, int>> matched_curly_braces_ranges;
-
- lldb::SBError error;
- // Part1 - parse matched_curly_braces_ranges.
- // locating all curly braced expression ranges in logMessage.
- // The algorithm takes care of nested and imbalanced curly braces.
- for (size_t i = 0; i < logMessage.size(); ++i) {
- if (logMessage[i] == '{') {
- unmatched_curly_braces.push_back(i);
- } else if (logMessage[i] == '}') {
- if (unmatched_curly_braces.empty())
- // Nothing to match.
- continue;
-
- int last_unmatched_index = unmatched_curly_braces.back();
- unmatched_curly_braces.pop_back();
-
- // Erase any matched ranges included in the new match.
- while (!matched_curly_braces_ranges.empty()) {
- assert(matched_curly_braces_ranges.back().first !=
- last_unmatched_index &&
- "How can a curley brace be matched twice?");
- if (matched_curly_braces_ranges.back().first < last_unmatched_index)
- break;
-
- // This is a nested range let's earse it.
- assert((size_t)matched_curly_braces_ranges.back().second < i);
- matched_curly_braces_ranges.pop_back();
- }
-
- // Assert invariant.
- assert(matched_curly_braces_ranges.empty() ||
- matched_curly_braces_ranges.back().first < last_unmatched_index);
- matched_curly_braces_ranges.emplace_back(last_unmatched_index, i);
- }
- }
-
- // Part2 - parse raw text and expresions parts.
- // All expression ranges have been parsed in matched_curly_braces_ranges.
- // The code below uses matched_curly_braces_ranges to divide logMessage
- // into raw text parts and expression parts.
- int last_raw_text_start = 0;
- for (const std::pair<int, int> &curly_braces_range :
- matched_curly_braces_ranges) {
- // Raw text before open curly brace.
- assert(curly_braces_range.first >= last_raw_text_start);
- size_t raw_text_len = curly_braces_range.first - last_raw_text_start;
- if (raw_text_len > 0) {
- error = AppendLogMessagePart(
- llvm::StringRef(logMessage.c_str() + last_raw_text_start,
- raw_text_len),
- /*is_expr=*/false);
- if (error.Fail()) {
- NotifyLogMessageError(error.GetCString());
- return;
- }
- }
-
- // Expression between curly braces.
- assert(curly_braces_range.second > curly_braces_range.first);
- size_t expr_len = curly_braces_range.second - curly_braces_range.first - 1;
- error = AppendLogMessagePart(
- llvm::StringRef(logMessage.c_str() + curly_braces_range.first + 1,
- expr_len),
- /*is_expr=*/true);
- if (error.Fail()) {
- NotifyLogMessageError(error.GetCString());
- return;
- }
-
- last_raw_text_start = curly_braces_range.second + 1;
- }
- // Trailing raw text after close curly brace.
- assert(last_raw_text_start >= 0);
- if (logMessage.size() > (size_t)last_raw_text_start) {
- error = AppendLogMessagePart(
- llvm::StringRef(logMessage.c_str() + last_raw_text_start,
- logMessage.size() - last_raw_text_start),
- /*is_expr=*/false);
- if (error.Fail()) {
- NotifyLogMessageError(error.GetCString());
- return;
- }
- }
-
- bp.SetCallback(BreakpointBase::BreakpointHitCallback, this);
-}
-
-void BreakpointBase::NotifyLogMessageError(llvm::StringRef error) {
- std::string message = "Log message has error: ";
- message += error;
- g_dap.SendOutput(OutputType::Console, message);
-}
-
-/*static*/
-bool BreakpointBase::BreakpointHitCallback(
- void *baton, lldb::SBProcess &process, lldb::SBThread &thread,
- lldb::SBBreakpointLocation &location) {
- if (!baton)
- return true;
-
- BreakpointBase *bp = (BreakpointBase *)baton;
- lldb::SBFrame frame = thread.GetSelectedFrame();
-
- std::string output;
- for (const BreakpointBase::LogMessagePart &messagePart :
- bp->logMessageParts) {
- if (messagePart.is_expr) {
- // Try local frame variables first before fall back to expression
- // evaluation
- const std::string &expr_str = messagePart.text;
- const char *expr = expr_str.c_str();
- lldb::SBValue value =
- frame.GetValueForVariablePath(expr, lldb::eDynamicDontRunTarget);
- if (value.GetError().Fail())
- value = frame.EvaluateExpression(expr);
- output += VariableDescription(value).display_value;
- } else {
- output += messagePart.text;
- }
- }
- if (!output.empty() && output.back() != '\n')
- output.push_back('\n'); // Ensure log message has line break.
- g_dap.SendOutput(OutputType::Console, output.c_str());
-
- // Do not stop.
- return false;
-}
+ hitCondition(std::string(GetString(obj, "hitCondition"))) {}
void BreakpointBase::UpdateBreakpoint(const BreakpointBase &request_bp) {
if (condition != request_bp.condition) {
@@ -318,10 +25,6 @@ void BreakpointBase::UpdateBreakpoint(const BreakpointBase &request_bp) {
hitCondition = request_bp.hitCondition;
SetHitCondition();
}
- if (logMessage != request_bp.logMessage) {
- logMessage = request_bp.logMessage;
- SetLogMessage();
- }
}
const char *BreakpointBase::GetBreakpointLabel() {
diff --git a/lldb/tools/lldb-dap/BreakpointBase.h b/lldb/tools/lldb-dap/BreakpointBase.h
index 41787f786102..5a04bb201615 100644
--- a/lldb/tools/lldb-dap/BreakpointBase.h
+++ b/lldb/tools/lldb-dap/BreakpointBase.h
@@ -9,7 +9,6 @@
#ifndef LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H
#define LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H
-#include "JSONUtils.h"
#include "lldb/API/SBBreakpoint.h"
#include "llvm/Support/JSON.h"
#include <string>
@@ -18,44 +17,24 @@
namespace lldb_dap {
struct BreakpointBase {
- // logMessage part can be either a raw text or an expression.
- struct LogMessagePart {
- LogMessagePart(llvm::StringRef text, bool is_expr)
- : text(text), is_expr(is_expr) {}
- std::string text;
- bool is_expr;
- };
+
// An optional expression for conditional breakpoints.
std::string condition;
// An optional expression that controls how many hits of the breakpoint are
// ignored. The backend is expected to interpret the expression as needed
std::string hitCondition;
- // If this attribute exists and is non-empty, the backend must not 'break'
- // (stop) but log the message instead. Expressions within {} are
- // interpolated.
- std::string logMessage;
- std::vector<LogMessagePart> logMessageParts;
- // The LLDB breakpoint associated wit this source breakpoint
- lldb::SBBreakpoint bp;
BreakpointBase() = default;
BreakpointBase(const llvm::json::Object &obj);
+ virtual ~BreakpointBase() = default;
- void SetCondition();
- void SetHitCondition();
- void SetLogMessage();
- void UpdateBreakpoint(const BreakpointBase &request_bp);
+ virtual void SetCondition() = 0;
+ virtual void SetHitCondition() = 0;
+ virtual void CreateJsonObject(llvm::json::Object &object) = 0;
- // Format \param text and return formatted text in \param formatted.
- // \return any formatting failures.
- lldb::SBError FormatLogText(llvm::StringRef text, std::string &formatted);
- lldb::SBError AppendLogMessagePart(llvm::StringRef part, bool is_expr);
- void NotifyLogMessageError(llvm::StringRef error);
+ void UpdateBreakpoint(const BreakpointBase &request_bp);
static const char *GetBreakpointLabel();
- static bool BreakpointHitCallback(void *baton, lldb::SBProcess &process,
- lldb::SBThread &thread,
- lldb::SBBreakpointLocation &location);
};
} // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index 554567eb3b0e..f8c0e4ecf36c 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -24,6 +24,7 @@ tablegen(LLVM Options.inc -gen-opt-parser-defs)
add_public_tablegen_target(LLDBDAPOptionsTableGen)
add_lldb_tool(lldb-dap
lldb-dap.cpp
+ Breakpoint.cpp
BreakpointBase.cpp
ExceptionBreakpoint.cpp
FifoFiles.cpp
diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp
index d4bdb976500e..21743bf90870 100644
--- a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp
@@ -12,21 +12,13 @@
namespace lldb_dap {
FunctionBreakpoint::FunctionBreakpoint(const llvm::json::Object &obj)
- : BreakpointBase(obj), functionName(std::string(GetString(obj, "name"))) {}
+ : Breakpoint(obj), functionName(std::string(GetString(obj, "name"))) {}
void FunctionBreakpoint::SetBreakpoint() {
if (functionName.empty())
return;
bp = g_dap.target.BreakpointCreateByName(functionName.c_str());
- // See comments in BreakpointBase::GetBreakpointLabel() for details of why
- // we add a label to our breakpoints.
- bp.AddName(GetBreakpointLabel());
- if (!condition.empty())
- SetCondition();
- if (!hitCondition.empty())
- SetHitCondition();
- if (!logMessage.empty())
- SetLogMessage();
+ Breakpoint::SetBreakpoint();
}
} // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.h b/lldb/tools/lldb-dap/FunctionBreakpoint.h
index fc23e94e1287..b15ff1931a6b 100644
--- a/lldb/tools/lldb-dap/FunctionBreakpoint.h
+++ b/lldb/tools/lldb-dap/FunctionBreakpoint.h
@@ -9,11 +9,11 @@
#ifndef LLDB_TOOLS_LLDB_DAP_FUNCTIONBREAKPOINT_H
#define LLDB_TOOLS_LLDB_DAP_FUNCTIONBREAKPOINT_H
-#include "BreakpointBase.h"
+#include "Breakpoint.h"
namespace lldb_dap {
-struct FunctionBreakpoint : public BreakpointBase {
+struct FunctionBreakpoint : public Breakpoint {
std::string functionName;
FunctionBreakpoint() = default;
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index a8b438d9d6df..878449a91aa6 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -364,54 +364,14 @@ llvm::json::Value CreateScope(const llvm::StringRef name,
// },
// "required": [ "verified" ]
// }
-llvm::json::Value CreateBreakpoint(lldb::SBBreakpoint &bp,
+llvm::json::Value CreateBreakpoint(BreakpointBase *bp,
std::optional<llvm::StringRef> request_path,
std::optional<uint32_t> request_line,
std::optional<uint32_t> request_column) {
- // Each breakpoint location is treated as a separate breakpoint for VS code.
- // They don't have the notion of a single breakpoint with multiple locations.
llvm::json::Object object;
- if (!bp.IsValid())
- return llvm::json::Value(std::move(object));
-
- object.try_emplace("verified", bp.GetNumResolvedLocations() > 0);
- object.try_emplace("id", bp.GetID());
- // VS Code DAP doesn't currently allow one breakpoint to have multiple
- // locations so we just report the first one. If we report all locations
- // then the IDE starts showing the wrong line numbers and locations for
- // other source file and line breakpoints in the same file.
-
- // Below we search for the first resolved location in a breakpoint and report
- // this as the breakpoint location since it will have a complete location
- // that is at least loaded in the current process.
- lldb::SBBreakpointLocation bp_loc;
- const auto num_locs = bp.GetNumLocations();
- for (size_t i = 0; i < num_locs; ++i) {
- bp_loc = bp.GetLocationAtIndex(i);
- if (bp_loc.IsResolved())
- break;
- }
- // If not locations are resolved, use the first location.
- if (!bp_loc.IsResolved())
- bp_loc = bp.GetLocationAtIndex(0);
- auto bp_addr = bp_loc.GetAddress();
-
if (request_path)
object.try_emplace("source", CreateSource(*request_path));
-
- if (bp_addr.IsValid()) {
- std::string formatted_addr =
- "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(g_dap.target));
- object.try_emplace("instructionReference", formatted_addr);
- auto line_entry = bp_addr.GetLineEntry();
- const auto line = line_entry.GetLine();
- if (line != UINT32_MAX)
- object.try_emplace("line", line);
- const auto column = line_entry.GetColumn();
- if (column != 0)
- object.try_emplace("column", column);
- object.try_emplace("source", CreateSource(line_entry));
- }
+ bp->CreateJsonObject(object);
// We try to add request_line as a fallback
if (request_line)
object.try_emplace("line", *request_line);
@@ -506,7 +466,7 @@ llvm::json::Value CreateModule(lldb::SBModule &module) {
return llvm::json::Value(std::move(object));
}
-void AppendBreakpoint(lldb::SBBreakpoint &bp, llvm::json::Array &breakpoints,
+void AppendBreakpoint(BreakpointBase *bp, llvm::json::Array &breakpoints,
std::optional<llvm::StringRef> request_path,
std::optional<uint32_t> request_line) {
breakpoints.emplace_back(CreateBreakpoint(bp, request_path, request_line));
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 62338548890c..1515f5ba2e5f 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -9,6 +9,7 @@
#ifndef LLDB_TOOLS_LLDB_DAP_JSONUTILS_H
#define LLDB_TOOLS_LLDB_DAP_JSONUTILS_H
+#include "BreakpointBase.h"
#include "DAPForward.h"
#include "lldb/API/SBModule.h"
#include "llvm/ADT/StringRef.h"
@@ -191,7 +192,7 @@ void FillResponse(const llvm::json::Object &request,
/// provided by the setBreakpoints request are returned to the IDE as a
/// fallback.
void AppendBreakpoint(
- lldb::SBBreakpoint &bp, llvm::json::Array &breakpoints,
+ BreakpointBase *bp, llvm::json::Array &breakpoints,
std::optional<llvm::StringRef> request_path = std::nullopt,
std::optional<uint32_t> request_line = std::nullopt);
@@ -223,7 +224,7 @@ void AppendBreakpoint(
/// A "Breakpoint" JSON object with that follows the formal JSON
/// definition outlined by Microsoft.
llvm::json::Value
-CreateBreakpoint(lldb::SBBreakpoint &bp,
+CreateBreakpoint(BreakpointBase *bp,
std::optional<llvm::StringRef> request_path = std::nullopt,
std::optional<uint32_t> request_line = std::nullopt,
std::optional<uint32_t> request_column = std::nullopt);
diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.cpp b/lldb/tools/lldb-dap/SourceBreakpoint.cpp
index 3bd83c0a6874..f5dd1346cb9e 100644
--- a/lldb/tools/lldb-dap/SourceBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/SourceBreakpoint.cpp
@@ -12,22 +12,308 @@
namespace lldb_dap {
SourceBreakpoint::SourceBreakpoint(const llvm::json::Object &obj)
- : BreakpointBase(obj), line(GetUnsigned(obj, "line", 0)),
- column(GetUnsigned(obj, "column", 0)) {}
+ : Breakpoint(obj), logMessage(std::string(GetString(obj, "logMessage"))),
+ line(GetUnsigned(obj, "line", 0)), column(GetUnsigned(obj, "column", 0)) {
+}
void SourceBreakpoint::SetBreakpoint(const llvm::StringRef source_path) {
lldb::SBFileSpecList module_list;
bp = g_dap.target.BreakpointCreateByLocation(source_path.str().c_str(), line,
column, 0, module_list);
- // See comments in BreakpointBase::GetBreakpointLabel() for details of why
- // we add a label to our breakpoints.
- bp.AddName(GetBreakpointLabel());
- if (!condition.empty())
- SetCondition();
- if (!hitCondition.empty())
- SetHitCondition();
if (!logMessage.empty())
SetLogMessage();
+ Breakpoint::SetBreakpoint();
+}
+
+void SourceBreakpoint::UpdateBreakpoint(const SourceBreakpoint &request_bp) {
+ if (logMessage != request_bp.logMessage) {
+ logMessage = request_bp.logMessage;
+ SetLogMessage();
+ }
+ BreakpointBase::UpdateBreakpoint(request_bp);
+}
+
+lldb::SBError SourceBreakpoint::AppendLogMessagePart(llvm::StringRef part,
+ bool is_expr) {
+ if (is_expr) {
+ logMessageParts.emplace_back(part, is_expr);
+ } else {
+ std::string formatted;
+ lldb::SBError error = FormatLogText(part, formatted);
+ if (error.Fail())
+ return error;
+ logMessageParts.emplace_back(formatted, is_expr);
+ }
+ return lldb::SBError();
+}
+
+// TODO: consolidate this code with the implementation in
+// FormatEntity::ParseInternal().
+lldb::SBError SourceBreakpoint::FormatLogText(llvm::StringRef text,
+ std::string &formatted) {
+ lldb::SBError error;
+ while (!text.empty()) {
+ size_t backslash_pos = text.find_first_of('\\');
+ if (backslash_pos == std::string::npos) {
+ formatted += text.str();
+ return error;
+ }
+
+ formatted += text.substr(0, backslash_pos).str();
+ // Skip the characters before and including '\'.
+ text = text.drop_front(backslash_pos + 1);
+
+ if (text.empty()) {
+ error.SetErrorString(
+ "'\\' character was not followed by another character");
+ return error;
+ }
+
+ const char desens_char = text[0];
+ text = text.drop_front(); // Skip the desensitized char character
+ switch (desens_char) {
+ case 'a':
+ formatted.push_back('\a');
+ break;
+ case 'b':
+ formatted.push_back('\b');
+ break;
+ case 'f':
+ formatted.push_back('\f');
+ break;
+ case 'n':
+ formatted.push_back('\n');
+ break;
+ case 'r':
+ formatted.push_back('\r');
+ break;
+ case 't':
+ formatted.push_back('\t');
+ break;
+ case 'v':
+ formatted.push_back('\v');
+ break;
+ case '\'':
+ formatted.push_back('\'');
+ break;
+ case '\\':
+ formatted.push_back('\\');
+ break;
+ case '0':
+ // 1 to 3 octal chars
+ {
+ if (text.empty()) {
+ error.SetErrorString("missing octal number following '\\0'");
+ return error;
+ }
+
+ // Make a string that can hold onto the initial zero char, up to 3
+ // octal digits, and a terminating NULL.
+ char oct_str[5] = {0, 0, 0, 0, 0};
+
+ size_t i;
+ for (i = 0;
+ i < text.size() && i < 4 && (text[i] >= '0' && text[i] <= '7');
+ ++i) {
+ oct_str[i] = text[i];
+ }
+
+ text = text.drop_front(i);
+ unsigned long octal_value = ::strtoul(oct_str, nullptr, 8);
+ if (octal_value <= UINT8_MAX) {
+ formatted.push_back((char)octal_value);
+ } else {
+ error.SetErrorString("octal number is larger than a single byte");
+ return error;
+ }
+ }
+ break;
+
+ case 'x': {
+ if (text.empty()) {
+ error.SetErrorString("missing hex number following '\\x'");
+ return error;
+ }
+ // hex number in the text
+ if (isxdigit(text[0])) {
+ // Make a string that can hold onto two hex chars plus a
+ // NULL terminator
+ char hex_str[3] = {0, 0, 0};
+ hex_str[0] = text[0];
+
+ text = text.drop_front();
+
+ if (!text.empty() && isxdigit(text[0])) {
+ hex_str[1] = text[0];
+ text = text.drop_front();
+ }
+
+ unsigned long hex_value = strtoul(hex_str, nullptr, 16);
+ if (hex_value <= UINT8_MAX) {
+ formatted.push_back((char)hex_value);
+ } else {
+ error.SetErrorString("hex number is larger than a single byte");
+ return error;
+ }
+ } else {
+ formatted.push_back(desens_char);
+ }
+ break;
+ }
+
+ default:
+ // Just desensitize any other character by just printing what came
+ // after the '\'
+ formatted.push_back(desens_char);
+ break;
+ }
+ }
+ return error;
+}
+
+// logMessage will be divided into an array of LogMessageParts of two kinds:
+// 1. raw print text message, and
+// 2. interpolated expression for evaluation which is inside matching curly
+// braces.
+//
+// The function tries to parse logMessage into a list of LogMessageParts
+// for easy later access in BreakpointHitCallback.
+void SourceBreakpoint::SetLogMessage() {
+ logMessageParts.clear();
+
+ // Contains unmatched open curly braces indices.
+ std::vector<int> unmatched_curly_braces;
+
+ // Contains all matched curly braces in logMessage.
+ // Loop invariant: matched_curly_braces_ranges are sorted by start index in
+ // ascending order without any overlap between them.
+ std::vector<std::pair<int, int>> matched_curly_braces_ranges;
+
+ lldb::SBError error;
+ // Part1 - parse matched_curly_braces_ranges.
+ // locating all curly braced expression ranges in logMessage.
+ // The algorithm takes care of nested and imbalanced curly braces.
+ for (size_t i = 0; i < logMessage.size(); ++i) {
+ if (logMessage[i] == '{') {
+ unmatched_curly_braces.push_back(i);
+ } else if (logMessage[i] == '}') {
+ if (unmatched_curly_braces.empty())
+ // Nothing to match.
+ continue;
+
+ int last_unmatched_index = unmatched_curly_braces.back();
+ unmatched_curly_braces.pop_back();
+
+ // Erase any matched ranges included in the new match.
+ while (!matched_curly_braces_ranges.empty()) {
+ assert(matched_curly_braces_ranges.back().first !=
+ last_unmatched_index &&
+ "How can a curley brace be matched twice?");
+ if (matched_curly_braces_ranges.back().first < last_unmatched_index)
+ break;
+
+          // This is a nested range; let's erase it.
+ assert((size_t)matched_curly_braces_ranges.back().second < i);
+ matched_curly_braces_ranges.pop_back();
+ }
+
+ // Assert invariant.
+ assert(matched_curly_braces_ranges.empty() ||
+ matched_curly_braces_ranges.back().first < last_unmatched_index);
+ matched_curly_braces_ranges.emplace_back(last_unmatched_index, i);
+ }
+ }
+
+  // Part2 - parse raw text and expression parts.
+ // All expression ranges have been parsed in matched_curly_braces_ranges.
+ // The code below uses matched_curly_braces_ranges to divide logMessage
+ // into raw text parts and expression parts.
+ int last_raw_text_start = 0;
+ for (const std::pair<int, int> &curly_braces_range :
+ matched_curly_braces_ranges) {
+ // Raw text before open curly brace.
+ assert(curly_braces_range.first >= last_raw_text_start);
+ size_t raw_text_len = curly_braces_range.first - last_raw_text_start;
+ if (raw_text_len > 0) {
+ error = AppendLogMessagePart(
+ llvm::StringRef(logMessage.c_str() + last_raw_text_start,
+ raw_text_len),
+ /*is_expr=*/false);
+ if (error.Fail()) {
+ NotifyLogMessageError(error.GetCString());
+ return;
+ }
+ }
+
+ // Expression between curly braces.
+ assert(curly_braces_range.second > curly_braces_range.first);
+ size_t expr_len = curly_braces_range.second - curly_braces_range.first - 1;
+ error = AppendLogMessagePart(
+ llvm::StringRef(logMessage.c_str() + curly_braces_range.first + 1,
+ expr_len),
+ /*is_expr=*/true);
+ if (error.Fail()) {
+ NotifyLogMessageError(error.GetCString());
+ return;
+ }
+
+ last_raw_text_start = curly_braces_range.second + 1;
+ }
+ // Trailing raw text after close curly brace.
+ assert(last_raw_text_start >= 0);
+ if (logMessage.size() > (size_t)last_raw_text_start) {
+ error = AppendLogMessagePart(
+ llvm::StringRef(logMessage.c_str() + last_raw_text_start,
+ logMessage.size() - last_raw_text_start),
+ /*is_expr=*/false);
+ if (error.Fail()) {
+ NotifyLogMessageError(error.GetCString());
+ return;
+ }
+ }
+
+ bp.SetCallback(BreakpointHitCallback, this);
+}
+
+void SourceBreakpoint::NotifyLogMessageError(llvm::StringRef error) {
+ std::string message = "Log message has error: ";
+ message += error;
+ g_dap.SendOutput(OutputType::Console, message);
+}
+
+/*static*/
+bool SourceBreakpoint::BreakpointHitCallback(
+ void *baton, lldb::SBProcess &process, lldb::SBThread &thread,
+ lldb::SBBreakpointLocation &location) {
+ if (!baton)
+ return true;
+
+ SourceBreakpoint *bp = (SourceBreakpoint *)baton;
+ lldb::SBFrame frame = thread.GetSelectedFrame();
+
+ std::string output;
+ for (const SourceBreakpoint::LogMessagePart &messagePart :
+ bp->logMessageParts) {
+ if (messagePart.is_expr) {
+      // Try local frame variables first before falling back to expression
+      // evaluation.
+ const std::string &expr_str = messagePart.text;
+ const char *expr = expr_str.c_str();
+ lldb::SBValue value =
+ frame.GetValueForVariablePath(expr, lldb::eDynamicDontRunTarget);
+ if (value.GetError().Fail())
+ value = frame.EvaluateExpression(expr);
+ output += VariableDescription(value).display_value;
+ } else {
+ output += messagePart.text;
+ }
+ }
+ if (!output.empty() && output.back() != '\n')
+ output.push_back('\n'); // Ensure log message has line break.
+ g_dap.SendOutput(OutputType::Console, output.c_str());
+
+ // Do not stop.
+ return false;
}
} // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.h b/lldb/tools/lldb-dap/SourceBreakpoint.h
index f4b54a44fc68..aa3fbe6d0f96 100644
--- a/lldb/tools/lldb-dap/SourceBreakpoint.h
+++ b/lldb/tools/lldb-dap/SourceBreakpoint.h
@@ -9,21 +9,45 @@
#ifndef LLDB_TOOLS_LLDB_DAP_SOURCEBREAKPOINT_H
#define LLDB_TOOLS_LLDB_DAP_SOURCEBREAKPOINT_H
-#include "BreakpointBase.h"
+#include "Breakpoint.h"
#include "llvm/ADT/StringRef.h"
namespace lldb_dap {
-struct SourceBreakpoint : public BreakpointBase {
+struct SourceBreakpoint : public Breakpoint {
+ // logMessage part can be either a raw text or an expression.
+ struct LogMessagePart {
+ LogMessagePart(llvm::StringRef text, bool is_expr)
+ : text(text), is_expr(is_expr) {}
+ std::string text;
+ bool is_expr;
+ };
+ // If this attribute exists and is non-empty, the backend must not 'break'
+ // (stop) but log the message instead. Expressions within {} are
+ // interpolated.
+ std::string logMessage;
+ std::vector<LogMessagePart> logMessageParts;
uint32_t line; ///< The source line of the breakpoint or logpoint
uint32_t column; ///< An optional source column of the breakpoint
- SourceBreakpoint() : BreakpointBase(), line(0), column(0) {}
+ SourceBreakpoint() : Breakpoint(), line(0), column(0) {}
SourceBreakpoint(const llvm::json::Object &obj);
// Set this breakpoint in LLDB as a new breakpoint
void SetBreakpoint(const llvm::StringRef source_path);
+ void UpdateBreakpoint(const SourceBreakpoint &request_bp);
+
+ void SetLogMessage();
+ // Format \param text and return formatted text in \param formatted.
+ // \return any formatting failures.
+ lldb::SBError FormatLogText(llvm::StringRef text, std::string &formatted);
+ lldb::SBError AppendLogMessagePart(llvm::StringRef part, bool is_expr);
+ void NotifyLogMessageError(llvm::StringRef error);
+
+ static bool BreakpointHitCallback(void *baton, lldb::SBProcess &process,
+ lldb::SBThread &thread,
+ lldb::SBBreakpointLocation &location);
};
inline bool operator<(const SourceBreakpoint &lhs,
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 01494dcc7da0..67022347e6d6 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -525,7 +525,8 @@ void EventThreadFunction() {
if (event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged) {
auto event_type =
lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event);
- auto bp = lldb::SBBreakpoint::GetBreakpointFromEvent(event);
+ auto bp =
+ Breakpoint(lldb::SBBreakpoint::GetBreakpointFromEvent(event));
// If the breakpoint was originated from the IDE, it will have the
// BreakpointBase::GetBreakpointLabel() label attached. Regardless
// of wether the locations were added or removed, the breakpoint
@@ -541,7 +542,7 @@ void EventThreadFunction() {
// mapped. Note that CreateBreakpoint doesn't apply source mapping.
// Besides, the current implementation of VSCode ignores the
// "source" element of breakpoint events.
- llvm::json::Value source_bp = CreateBreakpoint(bp);
+ llvm::json::Value source_bp = CreateBreakpoint(&bp);
source_bp.getAsObject()->erase("source");
body.try_emplace("breakpoint", source_bp);
@@ -2345,7 +2346,7 @@ void request_setBreakpoints(const llvm::json::Object &request) {
existing_source_bps->second.find(src_bp.line);
if (existing_bp != existing_source_bps->second.end()) {
existing_bp->second.UpdateBreakpoint(src_bp);
- AppendBreakpoint(existing_bp->second.bp, response_breakpoints, path,
+ AppendBreakpoint(&existing_bp->second, response_breakpoints, path,
src_bp.line);
continue;
}
@@ -2354,7 +2355,7 @@ void request_setBreakpoints(const llvm::json::Object &request) {
g_dap.source_breakpoints[path][src_bp.line] = src_bp;
SourceBreakpoint &new_bp = g_dap.source_breakpoints[path][src_bp.line];
new_bp.SetBreakpoint(path.data());
- AppendBreakpoint(new_bp.bp, response_breakpoints, path, new_bp.line);
+ AppendBreakpoint(&new_bp, response_breakpoints, path, new_bp.line);
}
}
}
@@ -2567,7 +2568,7 @@ void request_setFunctionBreakpoints(const llvm::json::Object &request) {
// handled it here and we don't need to set a new breakpoint below.
request_bps.erase(request_pos);
// Add this breakpoint info to the response
- AppendBreakpoint(pair.second.bp, response_breakpoints);
+ AppendBreakpoint(&pair.second, response_breakpoints);
}
}
// Remove any breakpoints that are no longer in our list
@@ -2581,7 +2582,7 @@ void request_setFunctionBreakpoints(const llvm::json::Object &request) {
g_dap.function_breakpoints[pair.first()] = std::move(pair.second);
FunctionBreakpoint &new_bp = g_dap.function_breakpoints[pair.first()];
new_bp.SetBreakpoint();
- AppendBreakpoint(new_bp.bp, response_breakpoints);
+ AppendBreakpoint(&new_bp, response_breakpoints);
}
llvm::json::Object body;
@@ -3582,8 +3583,8 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) {
FillResponse(request, response);
llvm::json::Array response_breakpoints;
for (uint32_t i = 0; g_dap.target.GetBreakpointAtIndex(i).IsValid(); ++i) {
- auto bp = g_dap.target.GetBreakpointAtIndex(i);
- AppendBreakpoint(bp, response_breakpoints);
+ auto bp = Breakpoint(g_dap.target.GetBreakpointAtIndex(i));
+ AppendBreakpoint(&bp, response_breakpoints);
}
llvm::json::Object body;
body.try_emplace("breakpoints", std::move(response_breakpoints));
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 485c76b8bb93..81f2753a4edd 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -653,9 +653,6 @@ option(LLVM_USE_OPROFILE
option(LLVM_EXTERNALIZE_DEBUGINFO
"Generate dSYM files and strip executables and libraries (Darwin Only)" OFF)
-option(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS
- "Add extra Booleans to ilist_iterators to communicate facts for debug-info" OFF)
-
set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING
"Sign executables and dylibs with the given identity or skip if empty (Darwin Only)")
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 0699a8586fcc..486df22c2c1b 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -140,10 +140,6 @@ if(LLVM_ENABLE_EXPENSIVE_CHECKS)
endif()
endif()
-if(LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS)
- add_compile_definitions(EXPERIMENTAL_DEBUGINFO_ITERATORS)
-endif()
-
if (LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS)
add_compile_definitions(STRICT_FIXED_SIZE_VECTORS)
endif()
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6b2417143ca0..8c126906fbfd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -357,12 +357,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
Add product
names.
- ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* *TBA*
- - tgsplit flat
- - xnack scratch .. TODO::
+ ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* - AMD Instinct MI210 Accelerator
+ - tgsplit flat - *rocm-amdhsa* - AMD Instinct MI250 Accelerator
+ - xnack scratch - *rocm-amdhsa* - AMD Instinct MI250X Accelerator
- kernarg preload - Packed
- work-item Add product
- IDs names.
+ work-item
+ IDs
``gfx90c`` ``amdgcn`` APU - xnack - Absolute - *pal-amdpal* - Ryzen 7 4700G
flat - Ryzen 7 4700GE
@@ -520,6 +520,108 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
=========== =============== ============ ===== ================= =============== =============== ======================
+Generic processors allow execution of a single code object on any of the processors that
+it supports. Such code objects may not perform as well as those for the non-generic processors.
+
+Generic processors are only available on code object V6 and above (see :ref:`amdgpu-elf-code-object`).
+
+Generic processor code objects are versioned (see :ref:`amdgpu-elf-header-e_flags-table-v6-onwards`) between 1 and 255.
+The version of non-generic code objects is always set to 0.
+
+For a generic code object, adding a new supported processor may require the code generated for the generic target to be changed
+so it can continue to execute on the previously supported processors as well as on the new one.
+When this happens, the generic code object version number is incremented at the same time as the generic target is updated.
+
+Each supported processor of a generic target is mapped to the version it was introduced in.
+A generic code object can execute on a supported processor if the version of the code object being loaded is
+greater than or equal to the version in which the processor was added to the generic target.
+
+ .. table:: AMDGPU Generic Processors
+ :name: amdgpu-generic-processor-table
+
+ ==================== ============== ================= ================== ================= =================================
+ Processor Target Supported Target Features Target Properties Target Restrictions
+ Triple Processors Supported
+ Architecture
+
+ ==================== ============== ================= ================== ================= =================================
+ ``gfx9-generic`` ``amdgcn`` - ``gfx900`` - xnack - Absolute flat - ``v_mad_mix`` instructions
+ - ``gfx902`` scratch are not available on
+ - ``gfx904`` ``gfx900``, ``gfx902``,
+ - ``gfx906`` ``gfx909``, ``gfx90c``
+ - ``gfx909`` - ``v_fma_mix`` instructions
+ - ``gfx90c`` are not available on ``gfx904``
+ - sramecc is not available on
+ ``gfx906``
+ - The following instructions
+ are not available on ``gfx906``:
+
+ - ``v_fmac_f32``
+ - ``v_xnor_b32``
+ - ``v_dot4_i32_i8``
+ - ``v_dot8_i32_i4``
+ - ``v_dot2_i32_i16``
+ - ``v_dot2_u32_u16``
+ - ``v_dot4_u32_u8``
+ - ``v_dot8_u32_u4``
+ - ``v_dot2_f32_f16``
+
+
+ ``gfx10.1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are
+ - ``gfx1011`` - wavefrontsize64 scratch not available on ``gfx1011``
+ - ``gfx1012`` - cumode and ``gfx1012``
+ - ``gfx1013``
+ - ``v_dot4_i32_i8``
+ - ``v_dot8_i32_i4``
+ - ``v_dot2_i32_i16``
+ - ``v_dot2_u32_u16``
+ - ``v_dot2c_f32_f16``
+ - ``v_dot4c_i32_i8``
+ - ``v_dot4_u32_u8``
+ - ``v_dot8_u32_u4``
+ - ``v_dot2_f32_f16``
+
+ - BVH Ray Tracing instructions
+ are not available on
+ ``gfx1013``
+
+
+ ``gfx10.3-generic`` ``amdgcn`` - ``gfx1030`` - wavefrontsize64 - Absolute flat No restrictions.
+ - ``gfx1031`` - cumode scratch
+ - ``gfx1032``
+ - ``gfx1033``
+ - ``gfx1034``
+ - ``gfx1035``
+ - ``gfx1036``
+
+
+ ``gfx11-generic`` ``amdgcn`` - ``gfx1100`` - wavefrontsize64 - Architected Various codegen pessimizations
+ - ``gfx1101`` - cumode flat scratch are applied to work around some
+ - ``gfx1102`` - Packed hazards specific to some targets
+ - ``gfx1103`` work-item within this family.
+ - ``gfx1150`` IDs
+ - ``gfx1151`` Not all VGPRs can be used on:
+
+ - ``gfx1100``
+ - ``gfx1101``
+ - ``gfx1151``
+
+ SALU floating point instructions
+ and single-use VGPR hint
+ instructions are not available
+ on:
+
+ - ``gfx1150``
+ - ``gfx1151``
+
+ SGPRs are not supported for src1
+ in dpp instructions for:
+
+ - ``gfx1150``
+ - ``gfx1151``
+ ==================== ============== ================= ================== ================= =================================
+
+
.. _amdgpu-target-features:
Target Features
@@ -533,7 +635,7 @@ generating the code. A mismatch of features may result in incorrect
execution, or a reduction in performance.
The target features supported by each processor is listed in
-:ref:`amdgpu-processor-table`.
+:ref:`amdgpu-processors`.
Target features are controlled by exactly one of the following Clang
options:
@@ -1443,6 +1545,7 @@ The AMDGPU backend uses the following ELF header:
- ``ELFABIVERSION_AMDGPU_HSA_V3``
- ``ELFABIVERSION_AMDGPU_HSA_V4``
- ``ELFABIVERSION_AMDGPU_HSA_V5``
+ - ``ELFABIVERSION_AMDGPU_HSA_V6``
- ``ELFABIVERSION_AMDGPU_PAL``
- ``ELFABIVERSION_AMDGPU_MESA3D``
``e_type`` - ``ET_REL``
@@ -1451,7 +1554,8 @@ The AMDGPU backend uses the following ELF header:
``e_entry`` 0
``e_flags`` See :ref:`amdgpu-elf-header-e_flags-v2-table`,
:ref:`amdgpu-elf-header-e_flags-table-v3`,
- and :ref:`amdgpu-elf-header-e_flags-table-v4-onwards`
+ :ref:`amdgpu-elf-header-e_flags-table-v4-v5`,
+ and :ref:`amdgpu-elf-header-e_flags-table-v6-onwards`
========================== ===============================
..
@@ -1471,6 +1575,7 @@ The AMDGPU backend uses the following ELF header:
``ELFABIVERSION_AMDGPU_HSA_V3`` 1
``ELFABIVERSION_AMDGPU_HSA_V4`` 2
``ELFABIVERSION_AMDGPU_HSA_V5`` 3
+ ``ELFABIVERSION_AMDGPU_HSA_V6`` 4
``ELFABIVERSION_AMDGPU_PAL`` 0
``ELFABIVERSION_AMDGPU_MESA3D`` 0
=============================== =====
@@ -1517,6 +1622,10 @@ The AMDGPU backend uses the following ELF header:
``-mcode-object-version=5``. This is the default code object
version if not specified.
+ * ``ELFABIVERSION_AMDGPU_HSA_V6`` is used to specify the version of AMD HSA
+ runtime ABI for code object V6. Specify using the Clang option
+ ``-mcode-object-version=6``.
+
* ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL
runtime ABI.
@@ -1543,8 +1652,9 @@ The AMDGPU backend uses the following ELF header:
``NT_AMD_HSA_ISA_VERSION`` note record for code object V2 (see
:ref:`amdgpu-note-records-v2`) and in the ``EF_AMDGPU_MACH`` bit field of the
``e_flags`` for code object V3 and above (see
- :ref:`amdgpu-elf-header-e_flags-table-v3` and
- :ref:`amdgpu-elf-header-e_flags-table-v4-onwards`).
+ :ref:`amdgpu-elf-header-e_flags-table-v3`,
+ :ref:`amdgpu-elf-header-e_flags-table-v4-v5` and
+ :ref:`amdgpu-elf-header-e_flags-table-v6-onwards`).
``e_entry``
The entry point is 0 as the entry points for individual kernels must be
@@ -1615,8 +1725,8 @@ The AMDGPU backend uses the following ELF header:
:ref:`amdgpu-target-features`.
================================= ===== =============================
- .. table:: AMDGPU ELF Header ``e_flags`` for Code Object V4 and After
- :name: amdgpu-elf-header-e_flags-table-v4-onwards
+ .. table:: AMDGPU ELF Header ``e_flags`` for Code Object V4 and V5
+ :name: amdgpu-elf-header-e_flags-table-v4-v5
============================================ ===== ===================================
Name Value Description
@@ -1642,80 +1752,119 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_FEATURE_SRAMECC_ON_V4`` 0xc00 SRAMECC enabled.
============================================ ===== ===================================
+ .. table:: AMDGPU ELF Header ``e_flags`` for Code Object V6 and After
+ :name: amdgpu-elf-header-e_flags-table-v6-onwards
+
+ ============================================ ========== =========================================
+ Name Value Description
+ ============================================ ========== =========================================
+ ``EF_AMDGPU_MACH`` 0x0ff AMDGPU processor selection
+ mask for
+ ``EF_AMDGPU_MACH_xxx`` values
+ defined in
+ :ref:`amdgpu-ef-amdgpu-mach-table`.
+ ``EF_AMDGPU_FEATURE_XNACK_V4`` 0x300 XNACK selection mask for
+ ``EF_AMDGPU_FEATURE_XNACK_*_V4``
+ values.
+ ``EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4`` 0x000 XNACK unsupported.
+ ``EF_AMDGPU_FEATURE_XNACK_ANY_V4`` 0x100 XNACK can have any value.
+ ``EF_AMDGPU_FEATURE_XNACK_OFF_V4`` 0x200 XNACK disabled.
+ ``EF_AMDGPU_FEATURE_XNACK_ON_V4`` 0x300 XNACK enabled.
+ ``EF_AMDGPU_FEATURE_SRAMECC_V4`` 0xc00 SRAMECC selection mask for
+ ``EF_AMDGPU_FEATURE_SRAMECC_*_V4``
+ values.
+ ``EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4`` 0x000 SRAMECC unsupported.
+ ``EF_AMDGPU_FEATURE_SRAMECC_ANY_V4`` 0x400 SRAMECC can have any value.
+ ``EF_AMDGPU_FEATURE_SRAMECC_OFF_V4`` 0x800 SRAMECC disabled,
+ ``EF_AMDGPU_FEATURE_SRAMECC_ON_V4`` 0xc00 SRAMECC enabled.
+ ``EF_AMDGPU_GENERIC_VERSION_V`` 0xff000000 Generic code object version selection
+ mask. This is a value between 1 and 255,
+ stored in the most significant byte
+ of EFLAGS.
+ See :ref:`amdgpu-generic-processor-table`
+ ============================================ ========== =========================================
+
.. table:: AMDGPU ``EF_AMDGPU_MACH`` Values
:name: amdgpu-ef-amdgpu-mach-table
- ==================================== ========== =============================
- Name Value Description (see
- :ref:`amdgpu-processor-table`)
- ==================================== ========== =============================
- ``EF_AMDGPU_MACH_NONE`` 0x000 *not specified*
- ``EF_AMDGPU_MACH_R600_R600`` 0x001 ``r600``
- ``EF_AMDGPU_MACH_R600_R630`` 0x002 ``r630``
- ``EF_AMDGPU_MACH_R600_RS880`` 0x003 ``rs880``
- ``EF_AMDGPU_MACH_R600_RV670`` 0x004 ``rv670``
- ``EF_AMDGPU_MACH_R600_RV710`` 0x005 ``rv710``
- ``EF_AMDGPU_MACH_R600_RV730`` 0x006 ``rv730``
- ``EF_AMDGPU_MACH_R600_RV770`` 0x007 ``rv770``
- ``EF_AMDGPU_MACH_R600_CEDAR`` 0x008 ``cedar``
- ``EF_AMDGPU_MACH_R600_CYPRESS`` 0x009 ``cypress``
- ``EF_AMDGPU_MACH_R600_JUNIPER`` 0x00a ``juniper``
- ``EF_AMDGPU_MACH_R600_REDWOOD`` 0x00b ``redwood``
- ``EF_AMDGPU_MACH_R600_SUMO`` 0x00c ``sumo``
- ``EF_AMDGPU_MACH_R600_BARTS`` 0x00d ``barts``
- ``EF_AMDGPU_MACH_R600_CAICOS`` 0x00e ``caicos``
- ``EF_AMDGPU_MACH_R600_CAYMAN`` 0x00f ``cayman``
- ``EF_AMDGPU_MACH_R600_TURKS`` 0x010 ``turks``
- *reserved* 0x011 - Reserved for ``r600``
- 0x01f architecture processors.
- ``EF_AMDGPU_MACH_AMDGCN_GFX600`` 0x020 ``gfx600``
- ``EF_AMDGPU_MACH_AMDGCN_GFX601`` 0x021 ``gfx601``
- ``EF_AMDGPU_MACH_AMDGCN_GFX700`` 0x022 ``gfx700``
- ``EF_AMDGPU_MACH_AMDGCN_GFX701`` 0x023 ``gfx701``
- ``EF_AMDGPU_MACH_AMDGCN_GFX702`` 0x024 ``gfx702``
- ``EF_AMDGPU_MACH_AMDGCN_GFX703`` 0x025 ``gfx703``
- ``EF_AMDGPU_MACH_AMDGCN_GFX704`` 0x026 ``gfx704``
- *reserved* 0x027 Reserved.
- ``EF_AMDGPU_MACH_AMDGCN_GFX801`` 0x028 ``gfx801``
- ``EF_AMDGPU_MACH_AMDGCN_GFX802`` 0x029 ``gfx802``
- ``EF_AMDGPU_MACH_AMDGCN_GFX803`` 0x02a ``gfx803``
- ``EF_AMDGPU_MACH_AMDGCN_GFX810`` 0x02b ``gfx810``
- ``EF_AMDGPU_MACH_AMDGCN_GFX900`` 0x02c ``gfx900``
- ``EF_AMDGPU_MACH_AMDGCN_GFX902`` 0x02d ``gfx902``
- ``EF_AMDGPU_MACH_AMDGCN_GFX904`` 0x02e ``gfx904``
- ``EF_AMDGPU_MACH_AMDGCN_GFX906`` 0x02f ``gfx906``
- ``EF_AMDGPU_MACH_AMDGCN_GFX908`` 0x030 ``gfx908``
- ``EF_AMDGPU_MACH_AMDGCN_GFX909`` 0x031 ``gfx909``
- ``EF_AMDGPU_MACH_AMDGCN_GFX90C`` 0x032 ``gfx90c``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1010`` 0x033 ``gfx1010``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1011`` 0x034 ``gfx1011``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1012`` 0x035 ``gfx1012``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1030`` 0x036 ``gfx1030``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1031`` 0x037 ``gfx1031``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1032`` 0x038 ``gfx1032``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1033`` 0x039 ``gfx1033``
- ``EF_AMDGPU_MACH_AMDGCN_GFX602`` 0x03a ``gfx602``
- ``EF_AMDGPU_MACH_AMDGCN_GFX705`` 0x03b ``gfx705``
- ``EF_AMDGPU_MACH_AMDGCN_GFX805`` 0x03c ``gfx805``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1035`` 0x03d ``gfx1035``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1034`` 0x03e ``gfx1034``
- ``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a``
- ``EF_AMDGPU_MACH_AMDGCN_GFX940`` 0x040 ``gfx940``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1100`` 0x041 ``gfx1100``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1013`` 0x042 ``gfx1013``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1150`` 0x043 ``gfx1150``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1103`` 0x044 ``gfx1103``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1036`` 0x045 ``gfx1036``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1101`` 0x046 ``gfx1101``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1102`` 0x047 ``gfx1102``
- ``EF_AMDGPU_MACH_AMDGCN_GFX1200`` 0x048 ``gfx1200``
- *reserved* 0x049 Reserved.
- ``EF_AMDGPU_MACH_AMDGCN_GFX1151`` 0x04a ``gfx1151``
- ``EF_AMDGPU_MACH_AMDGCN_GFX941`` 0x04b ``gfx941``
- ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942``
- *reserved* 0x04d Reserved.
- ``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201``
- ==================================== ========== =============================
+ ========================================== ========== =============================
+ Name Value Description (see
+ :ref:`amdgpu-processor-table`)
+ ========================================== ========== =============================
+ ``EF_AMDGPU_MACH_NONE`` 0x000 *not specified*
+ ``EF_AMDGPU_MACH_R600_R600`` 0x001 ``r600``
+ ``EF_AMDGPU_MACH_R600_R630`` 0x002 ``r630``
+ ``EF_AMDGPU_MACH_R600_RS880`` 0x003 ``rs880``
+ ``EF_AMDGPU_MACH_R600_RV670`` 0x004 ``rv670``
+ ``EF_AMDGPU_MACH_R600_RV710`` 0x005 ``rv710``
+ ``EF_AMDGPU_MACH_R600_RV730`` 0x006 ``rv730``
+ ``EF_AMDGPU_MACH_R600_RV770`` 0x007 ``rv770``
+ ``EF_AMDGPU_MACH_R600_CEDAR`` 0x008 ``cedar``
+ ``EF_AMDGPU_MACH_R600_CYPRESS`` 0x009 ``cypress``
+ ``EF_AMDGPU_MACH_R600_JUNIPER`` 0x00a ``juniper``
+ ``EF_AMDGPU_MACH_R600_REDWOOD`` 0x00b ``redwood``
+ ``EF_AMDGPU_MACH_R600_SUMO`` 0x00c ``sumo``
+ ``EF_AMDGPU_MACH_R600_BARTS`` 0x00d ``barts``
+ ``EF_AMDGPU_MACH_R600_CAICOS`` 0x00e ``caicos``
+ ``EF_AMDGPU_MACH_R600_CAYMAN`` 0x00f ``cayman``
+ ``EF_AMDGPU_MACH_R600_TURKS`` 0x010 ``turks``
+ *reserved* 0x011 - Reserved for ``r600``
+ 0x01f architecture processors.
+ ``EF_AMDGPU_MACH_AMDGCN_GFX600`` 0x020 ``gfx600``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX601`` 0x021 ``gfx601``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX700`` 0x022 ``gfx700``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX701`` 0x023 ``gfx701``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX702`` 0x024 ``gfx702``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX703`` 0x025 ``gfx703``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX704`` 0x026 ``gfx704``
+ *reserved* 0x027 Reserved.
+ ``EF_AMDGPU_MACH_AMDGCN_GFX801`` 0x028 ``gfx801``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX802`` 0x029 ``gfx802``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX803`` 0x02a ``gfx803``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX810`` 0x02b ``gfx810``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX900`` 0x02c ``gfx900``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX902`` 0x02d ``gfx902``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX904`` 0x02e ``gfx904``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX906`` 0x02f ``gfx906``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX908`` 0x030 ``gfx908``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX909`` 0x031 ``gfx909``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX90C`` 0x032 ``gfx90c``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1010`` 0x033 ``gfx1010``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1011`` 0x034 ``gfx1011``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1012`` 0x035 ``gfx1012``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1030`` 0x036 ``gfx1030``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1031`` 0x037 ``gfx1031``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1032`` 0x038 ``gfx1032``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1033`` 0x039 ``gfx1033``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX602`` 0x03a ``gfx602``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX705`` 0x03b ``gfx705``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX805`` 0x03c ``gfx805``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1035`` 0x03d ``gfx1035``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1034`` 0x03e ``gfx1034``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX940`` 0x040 ``gfx940``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1100`` 0x041 ``gfx1100``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1013`` 0x042 ``gfx1013``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1150`` 0x043 ``gfx1150``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1103`` 0x044 ``gfx1103``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1036`` 0x045 ``gfx1036``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1101`` 0x046 ``gfx1101``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1102`` 0x047 ``gfx1102``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1200`` 0x048 ``gfx1200``
+ *reserved* 0x049 Reserved.
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1151`` 0x04a ``gfx1151``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX941`` 0x04b ``gfx941``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942``
+ *reserved* 0x04d Reserved.
+ ``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201``
+ *reserved* 0x04f Reserved.
+ *reserved* 0x050 Reserved.
+ ``EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC`` 0x051 ``gfx9-generic``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC`` 0x052 ``gfx10.1-generic``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC`` 0x053 ``gfx10.3-generic``
+ ``EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC`` 0x054 ``gfx11-generic``
+ *reserved* 0x055 Reserved.
+ ========================================== ========== =============================
Sections
--------
@@ -5366,7 +5515,10 @@ additional 256 bytes to the kernel_code_entry_byte_offset. This addition
facilitates the incorporation of a prologue to the kernel entry to handle cases
where code designed for kernarg preloading is executed on hardware equipped with
incompatible firmware. If hardware has compatible firmware the 256 bytes at the
-start of the kernel entry will be skipped.
+start of the kernel entry will be skipped. Additionally, the compiler backend
+may insert a trap instruction at the start of the kernel prologue to manage
+situations where kernarg preloading is attempted on hardware with incompatible
+firmware.
.. _amdgpu-amdhsa-kernel-prolog:
@@ -12139,8 +12291,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
before invalidating
the caches.
- 3. buffer_gl0_inv;
- buffer_gl1_inv
+ 3. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
@@ -12169,8 +12321,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
before invalidating
the caches.
- 3. buffer_gl0_inv;
- buffer_gl1_inv
+ 3. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
@@ -12276,8 +12428,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
invalidating the
caches.
- 3. buffer_gl0_inv;
- buffer_gl1_inv
+ 3. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
@@ -12307,8 +12459,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
invalidating the
caches.
- 3. buffer_gl0_inv;
- buffer_gl1_inv
+ 3. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
@@ -12503,8 +12655,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
the
fence-paired-atomic.
- 2. buffer_gl0_inv;
- buffer_gl1_inv
+ 2. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before any
following global/generic
@@ -13217,8 +13369,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
invalidating the
caches.
- 4. buffer_gl0_inv;
- buffer_gl1_inv
+ 4. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
@@ -13292,8 +13444,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
invalidating the
caches.
- 4. buffer_gl0_inv;
- buffer_gl1_inv
+ 4. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
@@ -13520,8 +13672,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
requirements of
release.
- 2. buffer_gl0_inv;
- buffer_gl1_inv
+ 2. buffer_gl1_inv;
+ buffer_gl0_inv
- Must happen before
any following
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 13d1912ceb2a..abef4f810314 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -178,6 +178,8 @@ variable and type on the CMake command line:
$ cmake -DVARIABLE:TYPE=value path/to/llvm/source
+.. _cmake_frequently_used_variables:
+
Frequently-used CMake variables
-------------------------------
@@ -275,7 +277,7 @@ manual, or execute ``cmake --help-variable VARIABLE_NAME``.
**CMAKE_CXX_STANDARD**:STRING
Sets the C++ standard to conform to when building LLVM. Possible values are
- 17 and 20. LLVM Requires C++ 17 or higher. This defaults to 17.
+ 17 and 20. LLVM Requires C++17 or higher. This defaults to 17.
**CMAKE_INSTALL_BINDIR**:PATH
The path to install executables, relative to the *CMAKE_INSTALL_PREFIX*.
diff --git a/llvm/docs/CodeReview.rst b/llvm/docs/CodeReview.rst
index f1d5b6c9c992..e3a74946e593 100644
--- a/llvm/docs/CodeReview.rst
+++ b/llvm/docs/CodeReview.rst
@@ -103,6 +103,12 @@ ready to be committed. Specifically, once a patch is sent out for review, it
needs an explicit approval before it is committed. Do not assume silent
approval, or solicit objections to a patch with a deadline.
+.. note::
+ If you are using a Pull Request for purposes other than review
+ (eg: precommit CI results, convenient web-based reverts, etc)
+ apply the `skip-precommit-approval <https://github.com/llvm/llvm-project/labels?q=skip-precommit-approval>`_
+ label to the PR.
+
Acknowledge All Reviewer Feedback
---------------------------------
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 42d11fa17cca..b823be9e8282 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -544,8 +544,13 @@ options. For GNU :program:`objcopy` compatibility, the values are all bfdnames.
- `elf32-sparc`
- `elf32-sparcel`
-Additionally, all targets except `binary` and `ihex` can have `-freebsd` as a
-suffix.
+The following formats are supported by :program:`llvm-objcopy` for the
+:option:`--output-target` only:
+
+- `srec`
+
+Additionally, all targets except `binary`, `ihex`, and `srec` can have
+`-freebsd` as a suffix.
BINARY INPUT AND OUTPUT
-----------------------
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 316fc6ad86b8..687d1f29b5a1 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -540,75 +540,23 @@ Variables are passed to ``cmake`` on the command line using the format
``-D<variable name>=<value>``. The following variables are some common options
used by people developing LLVM.
-+-------------------------+----------------------------------------------------+
-| Variable | Purpose |
-+=========================+====================================================+
-| CMAKE_C_COMPILER | Tells ``cmake`` which C compiler to use. By |
-| | default, this will be /usr/bin/cc. |
-+-------------------------+----------------------------------------------------+
-| CMAKE_CXX_COMPILER | Tells ``cmake`` which C++ compiler to use. By |
-| | default, this will be /usr/bin/c++. |
-+-------------------------+----------------------------------------------------+
-| CMAKE_BUILD_TYPE | Tells ``cmake`` what type of build you are trying |
-| | to generate files for. Valid options are Debug, |
-| | Release, RelWithDebInfo, and MinSizeRel. Default |
-| | is Debug. |
-+-------------------------+----------------------------------------------------+
-| CMAKE_INSTALL_PREFIX | Specifies the install directory to target when |
-| | running the install action of the build files. |
-+-------------------------+----------------------------------------------------+
-| Python3_EXECUTABLE | Forces CMake to use a specific Python version by |
-| | passing a path to a Python interpreter. By default |
-| | the Python version of the interpreter in your PATH |
-| | is used. |
-+-------------------------+----------------------------------------------------+
-| LLVM_TARGETS_TO_BUILD | A semicolon delimited list controlling which |
-| | targets will be built and linked into llvm. |
-| | The default list is defined as |
-| | ``LLVM_ALL_TARGETS``, and can be set to include |
-| | out-of-tree targets. The default value includes: |
-| | ``AArch64, AMDGPU, ARM, AVR, BPF, Hexagon, Lanai, |
-| | Mips, MSP430, NVPTX, PowerPC, RISCV, Sparc, |
-| | SystemZ, WebAssembly, X86, XCore``. Setting this |
-| | to ``"host"`` will only compile the host |
-| | architecture (e.g. equivalent to specifying ``X86``|
-| | on an x86 host machine) can |
-| | significantly speed up compile and test times. |
-+-------------------------+----------------------------------------------------+
-| LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source |
-| | code This is disabled by default because it is |
-| | slow and generates a lot of output. |
-+-------------------------+----------------------------------------------------+
-| LLVM_ENABLE_PROJECTS | A semicolon-delimited list selecting which of the |
-| | other LLVM subprojects to additionally build. (Only|
-| | effective when using a side-by-side project layout |
-| | e.g. via git). The default list is empty. Can |
-| | include: clang, clang-tools-extra, |
-| | cross-project-tests, flang, libc, libclc, lld, |
-| | lldb, mlir, openmp, polly, or pstl. |
-+-------------------------+----------------------------------------------------+
-| LLVM_ENABLE_RUNTIMES | A semicolon-delimited list selecting which of the |
-| | runtimes to build. (Only effective when using the |
-| | full monorepo layout). The default list is empty. |
-| | Can include: compiler-rt, libc, libcxx, libcxxabi, |
-| | libunwind, or openmp. |
-+-------------------------+----------------------------------------------------+
-| LLVM_ENABLE_SPHINX | Build sphinx-based documentation from the source |
-| | code. This is disabled by default because it is |
-| | slow and generates a lot of output. Sphinx version |
-| | 1.5 or later recommended. |
-+-------------------------+----------------------------------------------------+
-| LLVM_BUILD_LLVM_DYLIB | Generate libLLVM.so. This library contains a |
-| | default set of LLVM components that can be |
-| | overridden with ``LLVM_DYLIB_COMPONENTS``. The |
-| | default contains most of LLVM and is defined in |
-| | ``tools/llvm-shlib/CMakelists.txt``. This option is|
-| | not available on Windows. |
-+-------------------------+----------------------------------------------------+
-| LLVM_OPTIMIZED_TABLEGEN | Builds a release tablegen that gets used during |
-| | the LLVM build. This can dramatically speed up |
-| | debug builds. |
-+-------------------------+----------------------------------------------------+
+* ``CMAKE_C_COMPILER``
+* ``CMAKE_CXX_COMPILER``
+* ``CMAKE_BUILD_TYPE``
+* ``CMAKE_INSTALL_PREFIX``
+* ``Python3_EXECUTABLE``
+* ``LLVM_TARGETS_TO_BUILD``
+* ``LLVM_ENABLE_PROJECTS``
+* ``LLVM_ENABLE_RUNTIMES``
+* ``LLVM_ENABLE_DOXYGEN``
+* ``LLVM_ENABLE_SPHINX``
+* ``LLVM_BUILD_LLVM_DYLIB``
+* ``LLVM_LINK_LLVM_DYLIB``
+* ``LLVM_PARALLEL_LINK_JOBS``
+* ``LLVM_OPTIMIZED_TABLEGEN``
+
+See :ref:`the list of frequently-used CMake variables <cmake_frequently_used_variables>`
+for more information.
To configure LLVM, follow these steps:
diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index c3fbb64c2dd9..51a7310a8a72 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -30,6 +30,12 @@ describes the typical workflow of creating a Pull Request and getting it reviewe
and accepted. This is meant as an overview of the GitHub workflow, for complete
documentation refer to `GitHub's documentation <https://docs.github.com/pull-requests>`_.
+.. note::
+ If you are using a Pull Request for purposes other than review
+ (eg: precommit CI results, convenient web-based reverts, etc)
+ apply the `skip-precommit-approval <https://github.com/llvm/llvm-project/labels?q=skip-precommit-approval>`_
+ label to the PR.
+
GitHub Tools
------------
You can interact with GitHub in several ways: via git command line tools,
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 22acc6c9cb37..6a55b1205a76 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -296,6 +296,10 @@ pipeline, immediately after the link stage. The ``internalize`` pass is also
recommended to remove unused math functions from the resulting PTX. For an
input IR module ``module.bc``, the following compilation flow is recommended:
+The ``NVVMReflect`` pass will attempt to remove dead code even without
+optimizations. This allows potentially incompatible instructions to be avoided
+at all optimization levels by using the ``__CUDA_ARCH`` argument.
+
1. Save list of external functions in ``module.bc``
2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc``
3. Internalize all functions not in list from (1)
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 05d8eea3add4..120e8b637136 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -122,6 +122,9 @@ Changes to the Python bindings
Changes to the C API
--------------------
+* Added ``LLVMGetBlockAddressFunction`` and ``LLVMGetBlockAddressBasicBlock``
+ functions for accessing the values in a blockaddress constant.
+
Changes to the CodeGen infrastructure
-------------------------------------
diff --git a/llvm/docs/RemoveDIsDebugInfo.md b/llvm/docs/RemoveDIsDebugInfo.md
index e871fed43a7c..a0577678e20f 100644
--- a/llvm/docs/RemoveDIsDebugInfo.md
+++ b/llvm/docs/RemoveDIsDebugInfo.md
@@ -1,4 +1,4 @@
-# What's all this then?
+# Debug info migration: From intrinsics to records
We're planning on removing debug info intrinsics from LLVM, as they're slow, unwieldy and can confuse optimisation passes if they're not expecting them. Instead of having a sequence of instructions that looks like this:
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 83530ae7b513..09746bdaf0c9 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -2328,6 +2328,16 @@ LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant,
LLVMValueRef MaskConstant);
LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB);
+/**
+ * Gets the function associated with a given BlockAddress constant value.
+ */
+LLVMValueRef LLVMGetBlockAddressFunction(LLVMValueRef BlockAddr);
+
+/**
+ * Gets the basic block associated with a given BlockAddress constant value.
+ */
+LLVMBasicBlockRef LLVMGetBlockAddressBasicBlock(LLVMValueRef BlockAddr);
+
/** Deprecated: Use LLVMGetInlineAsm instead. */
LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty,
const char *AsmString, const char *Constraints,
diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h
index e0de1afcc941..0eaa77b6dd81 100644
--- a/llvm/include/llvm/ADT/BitVector.h
+++ b/llvm/include/llvm/ADT/BitVector.h
@@ -42,7 +42,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl {
public:
using iterator_category = std::forward_iterator_tag;
- using difference_type = void;
+ using difference_type = std::ptrdiff_t;
using value_type = int;
using pointer = value_type*;
using reference = value_type&;
diff --git a/llvm/include/llvm/ADT/ilist_iterator.h b/llvm/include/llvm/ADT/ilist_iterator.h
index 9047b9b73959..2393c4d2c403 100644
--- a/llvm/include/llvm/ADT/ilist_iterator.h
+++ b/llvm/include/llvm/ADT/ilist_iterator.h
@@ -202,17 +202,12 @@ private:
node_pointer NodePtr = nullptr;
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
- // (Default: Off) Allow extra position-information flags to be stored
- // in iterators, in aid of removing debug-info intrinsics from LLVM.
-
/// Is this position intended to contain any debug-info immediately before
/// the position?
mutable bool HeadInclusiveBit = false;
/// Is this position intended to contain any debug-info immediately after
/// the position?
mutable bool TailInclusiveBit = false;
-#endif
public:
/// Create from an ilist_node.
@@ -231,10 +226,8 @@ public:
const ilist_iterator_w_bits<OptionsT, IsReverse, RHSIsConst> &RHS,
std::enable_if_t<IsConst || !RHSIsConst, void *> = nullptr)
: NodePtr(RHS.NodePtr) {
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
HeadInclusiveBit = RHS.HeadInclusiveBit;
TailInclusiveBit = RHS.TailInclusiveBit;
-#endif
}
// This is templated so that we can allow assigning to a const iterator from
@@ -243,10 +236,8 @@ public:
std::enable_if_t<IsConst || !RHSIsConst, ilist_iterator_w_bits &>
operator=(const ilist_iterator_w_bits<OptionsT, IsReverse, RHSIsConst> &RHS) {
NodePtr = RHS.NodePtr;
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
HeadInclusiveBit = RHS.HeadInclusiveBit;
TailInclusiveBit = RHS.TailInclusiveBit;
-#endif
return *this;
}
@@ -280,10 +271,8 @@ public:
const_cast<typename ilist_iterator_w_bits<OptionsT, IsReverse,
false>::node_reference>(
*NodePtr));
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
New.HeadInclusiveBit = HeadInclusiveBit;
New.TailInclusiveBit = TailInclusiveBit;
-#endif
return New;
}
return ilist_iterator_w_bits<OptionsT, IsReverse, false>();
@@ -309,18 +298,14 @@ public:
// Increment and decrement operators...
ilist_iterator_w_bits &operator--() {
NodePtr = IsReverse ? NodePtr->getNext() : NodePtr->getPrev();
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
HeadInclusiveBit = false;
TailInclusiveBit = false;
-#endif
return *this;
}
ilist_iterator_w_bits &operator++() {
NodePtr = IsReverse ? NodePtr->getPrev() : NodePtr->getNext();
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
HeadInclusiveBit = false;
TailInclusiveBit = false;
-#endif
return *this;
}
ilist_iterator_w_bits operator--(int) {
@@ -340,18 +325,10 @@ public:
/// Check for end. Only valid if ilist_sentinel_tracking<true>.
bool isEnd() const { return NodePtr ? NodePtr->isSentinel() : false; }
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
bool getHeadBit() const { return HeadInclusiveBit; }
bool getTailBit() const { return TailInclusiveBit; }
void setHeadBit(bool SetBit) const { HeadInclusiveBit = SetBit; }
void setTailBit(bool SetBit) const { TailInclusiveBit = SetBit; }
-#else
- // Store and return no information if we're not using this feature.
- bool getHeadBit() const { return false; }
- bool getTailBit() const { return false; }
- void setHeadBit(bool SetBit) const { (void)SetBit; }
- void setTailBit(bool SetBit) const { (void)SetBit; }
-#endif
};
template <typename From> struct simplify_type;
diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
index ca9bb7244d66..d62ec9ce4911 100644
--- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
+++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
@@ -39,6 +39,8 @@ namespace llvm {
M(int64_t, {1}, jump_table_penalty, "Accumulation of costs for jump tables") \
M(int64_t, {1}, case_cluster_penalty, \
"Accumulation of costs for case clusters") \
+ M(int64_t, {1}, switch_default_dest_penalty, \
+ "Accumulation of costs for switch default destination") \
M(int64_t, {1}, switch_penalty, \
"Accumulation of costs for switch statements") \
M(int64_t, {1}, unsimplified_common_instructions, \
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index d9287ae9e5e9..06f94f58ae5e 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -248,6 +248,10 @@ struct KnownFPClass {
/// definitely set or false if the sign bit is definitely unset.
std::optional<bool> SignBit;
+ bool operator==(KnownFPClass Other) const {
+ return KnownFPClasses == Other.KnownFPClasses && SignBit == Other.SignBit;
+ }
+
/// Return true if it's known this can never be one of the mask entries.
bool isKnownNever(FPClassTest Mask) const {
return (KnownFPClasses & Mask) == fcNone;
diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h
index 522ee37da6e8..72461d0d9c31 100644
--- a/llvm/include/llvm/BinaryFormat/COFF.h
+++ b/llvm/include/llvm/BinaryFormat/COFF.h
@@ -716,7 +716,10 @@ enum ImportNameType : unsigned {
IMPORT_NAME_NOPREFIX = 2,
/// The import name is the public symbol name, but skipping the leading ?,
/// @, or optionally _, and truncating at the first @.
- IMPORT_NAME_UNDECORATE = 3
+ IMPORT_NAME_UNDECORATE = 3,
+ /// The import name is specified as a separate string in the import library
+ /// object file.
+ IMPORT_NAME_EXPORTAS = 4
};
enum class GuardFlags : uint32_t {
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index efd41f9812ba..8e0356a8efeb 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -790,11 +790,16 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050,
+ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
+ EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
+ EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
+ EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055,
// clang-format on
// First/last AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
- EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1201,
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC,
// Indicates if the "xnack" target feature is enabled for all code contained
// in the object.
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index fbd198a75a24..a7fbf4aeb744 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -897,7 +897,7 @@ private:
virtual void emitModuleCommandLines(Module &M);
GCMetadataPrinter *getOrCreateGCPrinter(GCStrategy &S);
- void emitGlobalAlias(Module &M, const GlobalAlias &GA);
+ virtual void emitGlobalAlias(const Module &M, const GlobalAlias &GA);
void emitGlobalIFunc(Module &M, const GlobalIFunc &GI);
private:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 5fd80e5f199c..637c2c71b024 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -1084,7 +1084,7 @@ public:
},
[=](const LegalityQuery &Query) {
LLT T = Query.Types[LargeTypeIdx];
- if (T.isVector() && T.getElementType().isPointer())
+ if (T.isPointerVector())
T = T.changeElementType(LLT::scalar(T.getScalarSizeInBits()));
return std::make_pair(TypeIdx, T);
});
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 349d1286c8dc..8cb0bc9fd981 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1179,6 +1179,12 @@ enum NodeType {
/// counter-like register (or other high accuracy low latency clock source).
READCYCLECOUNTER,
+ /// READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
+ /// It has the same semantics as the READCYCLECOUNTER implementation except
+ /// that the result is the content of the architecture-specific fixed
+ /// frequency counter suitable for measuring elapsed time.
+ READSTEADYCOUNTER,
+
/// HANDLENODE node - Used as a handle for various purposes.
HANDLENODE,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b9ec30754f0c..7bb12d8f065c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -668,6 +668,8 @@ public:
bool isTarget = false);
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL,
bool LegalTypes = true);
+ SDValue getShiftAmountConstant(const APInt &Val, EVT VT, const SDLoc &DL,
+ bool LegalTypes = true);
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL,
bool isTarget = false);
@@ -1613,10 +1615,10 @@ public:
/// Expand the specified \c ISD::VACOPY node as the Legalize pass would.
SDValue expandVACopy(SDNode *Node);
- /// Returs an GlobalAddress of the function from the current module with
+ /// Return a GlobalAddress of the function from the current module with
/// name matching the given ExternalSymbol. Additionally can provide the
/// matched function.
- /// Panics the function doesn't exists.
+ /// Panic if the function doesn't exist.
SDValue getSymbolFunctionGlobalAddress(SDValue Op,
Function **TargetFunction = nullptr);
@@ -2255,7 +2257,7 @@ public:
std::pair<EVT, EVT> GetDependentSplitDestVTs(const EVT &VT, const EVT &EnvVT,
bool *HiIsEmpty) const;
- /// Split the vector with EXTRACT_SUBVECTOR using the provides
+ /// Split the vector with EXTRACT_SUBVECTOR using the provided
/// VTs and return the low/high part.
std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL,
const EVT &LoVT, const EVT &HiVT);
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 55baaf867e73..105473830653 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -97,192 +97,193 @@ def v128i4 : VTVec<128, i4, 32>; // 128 x i4 vector value
def v1i8 : VTVec<1, i8, 33>; // 1 x i8 vector value
def v2i8 : VTVec<2, i8, 34>; // 2 x i8 vector value
-def v4i8 : VTVec<4, i8, 35>; // 4 x i8 vector value
-def v8i8 : VTVec<8, i8, 36>; // 8 x i8 vector value
-def v16i8 : VTVec<16, i8, 37>; // 16 x i8 vector value
-def v32i8 : VTVec<32, i8, 38>; // 32 x i8 vector value
-def v64i8 : VTVec<64, i8, 39>; // 64 x i8 vector value
-def v128i8 : VTVec<128, i8, 40>; // 128 x i8 vector value
-def v256i8 : VTVec<256, i8, 41>; // 256 x i8 vector value
-def v512i8 : VTVec<512, i8, 42>; // 512 x i8 vector value
-def v1024i8 : VTVec<1024, i8, 43>; // 1024 x i8 vector value
-
-def v1i16 : VTVec<1, i16, 44>; // 1 x i16 vector value
-def v2i16 : VTVec<2, i16, 45>; // 2 x i16 vector value
-def v3i16 : VTVec<3, i16, 46>; // 3 x i16 vector value
-def v4i16 : VTVec<4, i16, 47>; // 4 x i16 vector value
-def v8i16 : VTVec<8, i16, 48>; // 8 x i16 vector value
-def v16i16 : VTVec<16, i16, 49>; // 16 x i16 vector value
-def v32i16 : VTVec<32, i16, 50>; // 32 x i16 vector value
-def v64i16 : VTVec<64, i16, 51>; // 64 x i16 vector value
-def v128i16 : VTVec<128, i16, 52>; // 128 x i16 vector value
-def v256i16 : VTVec<256, i16, 53>; // 256 x i16 vector value
-def v512i16 : VTVec<512, i16, 54>; // 512 x i16 vector value
-
-def v1i32 : VTVec<1, i32, 55>; // 1 x i32 vector value
-def v2i32 : VTVec<2, i32, 56>; // 2 x i32 vector value
-def v3i32 : VTVec<3, i32, 57>; // 3 x i32 vector value
-def v4i32 : VTVec<4, i32, 58>; // 4 x i32 vector value
-def v5i32 : VTVec<5, i32, 59>; // 5 x i32 vector value
-def v6i32 : VTVec<6, i32, 60>; // 6 x f32 vector value
-def v7i32 : VTVec<7, i32, 61>; // 7 x f32 vector value
-def v8i32 : VTVec<8, i32, 62>; // 8 x i32 vector value
-def v9i32 : VTVec<9, i32, 63>; // 9 x i32 vector value
-def v10i32 : VTVec<10, i32, 64>; // 10 x i32 vector value
-def v11i32 : VTVec<11, i32, 65>; // 11 x i32 vector value
-def v12i32 : VTVec<12, i32, 66>; // 12 x i32 vector value
-def v16i32 : VTVec<16, i32, 67>; // 16 x i32 vector value
-def v32i32 : VTVec<32, i32, 68>; // 32 x i32 vector value
-def v64i32 : VTVec<64, i32, 69>; // 64 x i32 vector value
-def v128i32 : VTVec<128, i32, 70>; // 128 x i32 vector value
-def v256i32 : VTVec<256, i32, 71>; // 256 x i32 vector value
-def v512i32 : VTVec<512, i32, 72>; // 512 x i32 vector value
-def v1024i32 : VTVec<1024, i32, 73>; // 1024 x i32 vector value
-def v2048i32 : VTVec<2048, i32, 74>; // 2048 x i32 vector value
-
-def v1i64 : VTVec<1, i64, 75>; // 1 x i64 vector value
-def v2i64 : VTVec<2, i64, 76>; // 2 x i64 vector value
-def v3i64 : VTVec<3, i64, 77>; // 3 x i64 vector value
-def v4i64 : VTVec<4, i64, 78>; // 4 x i64 vector value
-def v8i64 : VTVec<8, i64, 79>; // 8 x i64 vector value
-def v16i64 : VTVec<16, i64, 80>; // 16 x i64 vector value
-def v32i64 : VTVec<32, i64, 81>; // 32 x i64 vector value
-def v64i64 : VTVec<64, i64, 82>; // 64 x i64 vector value
-def v128i64 : VTVec<128, i64, 83>; // 128 x i64 vector value
-def v256i64 : VTVec<256, i64, 84>; // 256 x i64 vector value
-
-def v1i128 : VTVec<1, i128, 85>; // 1 x i128 vector value
-
-def v1f16 : VTVec<1, f16, 86>; // 1 x f16 vector value
-def v2f16 : VTVec<2, f16, 87>; // 2 x f16 vector value
-def v3f16 : VTVec<3, f16, 88>; // 3 x f16 vector value
-def v4f16 : VTVec<4, f16, 89>; // 4 x f16 vector value
-def v8f16 : VTVec<8, f16, 90>; // 8 x f16 vector value
-def v16f16 : VTVec<16, f16, 91>; // 16 x f16 vector value
-def v32f16 : VTVec<32, f16, 92>; // 32 x f16 vector value
-def v64f16 : VTVec<64, f16, 93>; // 64 x f16 vector value
-def v128f16 : VTVec<128, f16, 94>; // 128 x f16 vector value
-def v256f16 : VTVec<256, f16, 95>; // 256 x f16 vector value
-def v512f16 : VTVec<512, f16, 96>; // 512 x f16 vector value
-
-def v2bf16 : VTVec<2, bf16, 97>; // 2 x bf16 vector value
-def v3bf16 : VTVec<3, bf16, 98>; // 3 x bf16 vector value
-def v4bf16 : VTVec<4, bf16, 99>; // 4 x bf16 vector value
-def v8bf16 : VTVec<8, bf16, 100>; // 8 x bf16 vector value
-def v16bf16 : VTVec<16, bf16, 101>; // 16 x bf16 vector value
-def v32bf16 : VTVec<32, bf16, 102>; // 32 x bf16 vector value
-def v64bf16 : VTVec<64, bf16, 103>; // 64 x bf16 vector value
-def v128bf16 : VTVec<128, bf16, 104>; // 128 x bf16 vector value
-
-def v1f32 : VTVec<1, f32, 105>; // 1 x f32 vector value
-def v2f32 : VTVec<2, f32, 106>; // 2 x f32 vector value
-def v3f32 : VTVec<3, f32, 107>; // 3 x f32 vector value
-def v4f32 : VTVec<4, f32, 108>; // 4 x f32 vector value
-def v5f32 : VTVec<5, f32, 109>; // 5 x f32 vector value
-def v6f32 : VTVec<6, f32, 110>; // 6 x f32 vector value
-def v7f32 : VTVec<7, f32, 111>; // 7 x f32 vector value
-def v8f32 : VTVec<8, f32, 112>; // 8 x f32 vector value
-def v9f32 : VTVec<9, f32, 113>; // 9 x f32 vector value
-def v10f32 : VTVec<10, f32, 114>; // 10 x f32 vector value
-def v11f32 : VTVec<11, f32, 115>; // 11 x f32 vector value
-def v12f32 : VTVec<12, f32, 116>; // 12 x f32 vector value
-def v16f32 : VTVec<16, f32, 117>; // 16 x f32 vector value
-def v32f32 : VTVec<32, f32, 118>; // 32 x f32 vector value
-def v64f32 : VTVec<64, f32, 119>; // 64 x f32 vector value
-def v128f32 : VTVec<128, f32, 120>; // 128 x f32 vector value
-def v256f32 : VTVec<256, f32, 121>; // 256 x f32 vector value
-def v512f32 : VTVec<512, f32, 122>; // 512 x f32 vector value
-def v1024f32 : VTVec<1024, f32, 123>; // 1024 x f32 vector value
-def v2048f32 : VTVec<2048, f32, 124>; // 2048 x f32 vector value
-
-def v1f64 : VTVec<1, f64, 125>; // 1 x f64 vector value
-def v2f64 : VTVec<2, f64, 126>; // 2 x f64 vector value
-def v3f64 : VTVec<3, f64, 127>; // 3 x f64 vector value
-def v4f64 : VTVec<4, f64, 128>; // 4 x f64 vector value
-def v8f64 : VTVec<8, f64, 129>; // 8 x f64 vector value
-def v16f64 : VTVec<16, f64, 130>; // 16 x f64 vector value
-def v32f64 : VTVec<32, f64, 131>; // 32 x f64 vector value
-def v64f64 : VTVec<64, f64, 132>; // 64 x f64 vector value
-def v128f64 : VTVec<128, f64, 133>; // 128 x f64 vector value
-def v256f64 : VTVec<256, f64, 134>; // 256 x f64 vector value
-
-def nxv1i1 : VTScalableVec<1, i1, 135>; // n x 1 x i1 vector value
-def nxv2i1 : VTScalableVec<2, i1, 136>; // n x 2 x i1 vector value
-def nxv4i1 : VTScalableVec<4, i1, 137>; // n x 4 x i1 vector value
-def nxv8i1 : VTScalableVec<8, i1, 138>; // n x 8 x i1 vector value
-def nxv16i1 : VTScalableVec<16, i1, 139>; // n x 16 x i1 vector value
-def nxv32i1 : VTScalableVec<32, i1, 140>; // n x 32 x i1 vector value
-def nxv64i1 : VTScalableVec<64, i1, 141>; // n x 64 x i1 vector value
-
-def nxv1i8 : VTScalableVec<1, i8, 142>; // n x 1 x i8 vector value
-def nxv2i8 : VTScalableVec<2, i8, 143>; // n x 2 x i8 vector value
-def nxv4i8 : VTScalableVec<4, i8, 144>; // n x 4 x i8 vector value
-def nxv8i8 : VTScalableVec<8, i8, 145>; // n x 8 x i8 vector value
-def nxv16i8 : VTScalableVec<16, i8, 146>; // n x 16 x i8 vector value
-def nxv32i8 : VTScalableVec<32, i8, 147>; // n x 32 x i8 vector value
-def nxv64i8 : VTScalableVec<64, i8, 148>; // n x 64 x i8 vector value
-
-def nxv1i16 : VTScalableVec<1, i16, 149>; // n x 1 x i16 vector value
-def nxv2i16 : VTScalableVec<2, i16, 150>; // n x 2 x i16 vector value
-def nxv4i16 : VTScalableVec<4, i16, 151>; // n x 4 x i16 vector value
-def nxv8i16 : VTScalableVec<8, i16, 152>; // n x 8 x i16 vector value
-def nxv16i16 : VTScalableVec<16, i16, 153>; // n x 16 x i16 vector value
-def nxv32i16 : VTScalableVec<32, i16, 154>; // n x 32 x i16 vector value
-
-def nxv1i32 : VTScalableVec<1, i32, 155>; // n x 1 x i32 vector value
-def nxv2i32 : VTScalableVec<2, i32, 156>; // n x 2 x i32 vector value
-def nxv4i32 : VTScalableVec<4, i32, 157>; // n x 4 x i32 vector value
-def nxv8i32 : VTScalableVec<8, i32, 158>; // n x 8 x i32 vector value
-def nxv16i32 : VTScalableVec<16, i32, 159>; // n x 16 x i32 vector value
-def nxv32i32 : VTScalableVec<32, i32, 160>; // n x 32 x i32 vector value
-
-def nxv1i64 : VTScalableVec<1, i64, 161>; // n x 1 x i64 vector value
-def nxv2i64 : VTScalableVec<2, i64, 162>; // n x 2 x i64 vector value
-def nxv4i64 : VTScalableVec<4, i64, 163>; // n x 4 x i64 vector value
-def nxv8i64 : VTScalableVec<8, i64, 164>; // n x 8 x i64 vector value
-def nxv16i64 : VTScalableVec<16, i64, 165>; // n x 16 x i64 vector value
-def nxv32i64 : VTScalableVec<32, i64, 166>; // n x 32 x i64 vector value
-
-def nxv1f16 : VTScalableVec<1, f16, 167>; // n x 1 x f16 vector value
-def nxv2f16 : VTScalableVec<2, f16, 168>; // n x 2 x f16 vector value
-def nxv4f16 : VTScalableVec<4, f16, 169>; // n x 4 x f16 vector value
-def nxv8f16 : VTScalableVec<8, f16, 170>; // n x 8 x f16 vector value
-def nxv16f16 : VTScalableVec<16, f16, 171>; // n x 16 x f16 vector value
-def nxv32f16 : VTScalableVec<32, f16, 172>; // n x 32 x f16 vector value
-
-def nxv1bf16 : VTScalableVec<1, bf16, 173>; // n x 1 x bf16 vector value
-def nxv2bf16 : VTScalableVec<2, bf16, 174>; // n x 2 x bf16 vector value
-def nxv4bf16 : VTScalableVec<4, bf16, 175>; // n x 4 x bf16 vector value
-def nxv8bf16 : VTScalableVec<8, bf16, 176>; // n x 8 x bf16 vector value
-def nxv16bf16 : VTScalableVec<16, bf16, 177>; // n x 16 x bf16 vector value
-def nxv32bf16 : VTScalableVec<32, bf16, 178>; // n x 32 x bf16 vector value
-
-def nxv1f32 : VTScalableVec<1, f32, 179>; // n x 1 x f32 vector value
-def nxv2f32 : VTScalableVec<2, f32, 180>; // n x 2 x f32 vector value
-def nxv4f32 : VTScalableVec<4, f32, 181>; // n x 4 x f32 vector value
-def nxv8f32 : VTScalableVec<8, f32, 182>; // n x 8 x f32 vector value
-def nxv16f32 : VTScalableVec<16, f32, 183>; // n x 16 x f32 vector value
-
-def nxv1f64 : VTScalableVec<1, f64, 184>; // n x 1 x f64 vector value
-def nxv2f64 : VTScalableVec<2, f64, 185>; // n x 2 x f64 vector value
-def nxv4f64 : VTScalableVec<4, f64, 186>; // n x 4 x f64 vector value
-def nxv8f64 : VTScalableVec<8, f64, 187>; // n x 8 x f64 vector value
-
-def x86mmx : ValueType<64, 188>; // X86 MMX value
-def FlagVT : ValueType<0, 189> { // Pre-RA sched glue
+def v3i8 : VTVec<3, i8, 35>; // 3 x i8 vector value
+def v4i8 : VTVec<4, i8, 36>; // 4 x i8 vector value
+def v8i8 : VTVec<8, i8, 37>; // 8 x i8 vector value
+def v16i8 : VTVec<16, i8, 38>; // 16 x i8 vector value
+def v32i8 : VTVec<32, i8, 39>; // 32 x i8 vector value
+def v64i8 : VTVec<64, i8, 40>; // 64 x i8 vector value
+def v128i8 : VTVec<128, i8, 41>; // 128 x i8 vector value
+def v256i8 : VTVec<256, i8, 42>; // 256 x i8 vector value
+def v512i8 : VTVec<512, i8, 43>; // 512 x i8 vector value
+def v1024i8 : VTVec<1024, i8, 44>; // 1024 x i8 vector value
+
+def v1i16 : VTVec<1, i16, 45>; // 1 x i16 vector value
+def v2i16 : VTVec<2, i16, 46>; // 2 x i16 vector value
+def v3i16 : VTVec<3, i16, 47>; // 3 x i16 vector value
+def v4i16 : VTVec<4, i16, 48>; // 4 x i16 vector value
+def v8i16 : VTVec<8, i16, 49>; // 8 x i16 vector value
+def v16i16 : VTVec<16, i16, 50>; // 16 x i16 vector value
+def v32i16 : VTVec<32, i16, 51>; // 32 x i16 vector value
+def v64i16 : VTVec<64, i16, 52>; // 64 x i16 vector value
+def v128i16 : VTVec<128, i16, 53>; // 128 x i16 vector value
+def v256i16 : VTVec<256, i16, 54>; // 256 x i16 vector value
+def v512i16 : VTVec<512, i16, 55>; // 512 x i16 vector value
+
+def v1i32 : VTVec<1, i32, 56>; // 1 x i32 vector value
+def v2i32 : VTVec<2, i32, 57>; // 2 x i32 vector value
+def v3i32 : VTVec<3, i32, 58>; // 3 x i32 vector value
+def v4i32 : VTVec<4, i32, 59>; // 4 x i32 vector value
+def v5i32 : VTVec<5, i32, 60>; // 5 x i32 vector value
+def v6i32 : VTVec<6, i32, 61>; // 6 x i32 vector value
+def v7i32 : VTVec<7, i32, 62>; // 7 x i32 vector value
+def v8i32 : VTVec<8, i32, 63>; // 8 x i32 vector value
+def v9i32 : VTVec<9, i32, 64>; // 9 x i32 vector value
+def v10i32 : VTVec<10, i32, 65>; // 10 x i32 vector value
+def v11i32 : VTVec<11, i32, 66>; // 11 x i32 vector value
+def v12i32 : VTVec<12, i32, 67>; // 12 x i32 vector value
+def v16i32 : VTVec<16, i32, 68>; // 16 x i32 vector value
+def v32i32 : VTVec<32, i32, 69>; // 32 x i32 vector value
+def v64i32 : VTVec<64, i32, 70>; // 64 x i32 vector value
+def v128i32 : VTVec<128, i32, 71>; // 128 x i32 vector value
+def v256i32 : VTVec<256, i32, 72>; // 256 x i32 vector value
+def v512i32 : VTVec<512, i32, 73>; // 512 x i32 vector value
+def v1024i32 : VTVec<1024, i32, 74>; // 1024 x i32 vector value
+def v2048i32 : VTVec<2048, i32, 75>; // 2048 x i32 vector value
+
+def v1i64 : VTVec<1, i64, 76>; // 1 x i64 vector value
+def v2i64 : VTVec<2, i64, 77>; // 2 x i64 vector value
+def v3i64 : VTVec<3, i64, 78>; // 3 x i64 vector value
+def v4i64 : VTVec<4, i64, 79>; // 4 x i64 vector value
+def v8i64 : VTVec<8, i64, 80>; // 8 x i64 vector value
+def v16i64 : VTVec<16, i64, 81>; // 16 x i64 vector value
+def v32i64 : VTVec<32, i64, 82>; // 32 x i64 vector value
+def v64i64 : VTVec<64, i64, 83>; // 64 x i64 vector value
+def v128i64 : VTVec<128, i64, 84>; // 128 x i64 vector value
+def v256i64 : VTVec<256, i64, 85>; // 256 x i64 vector value
+
+def v1i128 : VTVec<1, i128, 86>; // 1 x i128 vector value
+
+def v1f16 : VTVec<1, f16, 87>; // 1 x f16 vector value
+def v2f16 : VTVec<2, f16, 88>; // 2 x f16 vector value
+def v3f16 : VTVec<3, f16, 89>; // 3 x f16 vector value
+def v4f16 : VTVec<4, f16, 90>; // 4 x f16 vector value
+def v8f16 : VTVec<8, f16, 91>; // 8 x f16 vector value
+def v16f16 : VTVec<16, f16, 92>; // 16 x f16 vector value
+def v32f16 : VTVec<32, f16, 93>; // 32 x f16 vector value
+def v64f16 : VTVec<64, f16, 94>; // 64 x f16 vector value
+def v128f16 : VTVec<128, f16, 95>; // 128 x f16 vector value
+def v256f16 : VTVec<256, f16, 96>; // 256 x f16 vector value
+def v512f16 : VTVec<512, f16, 97>; // 512 x f16 vector value
+
+def v2bf16 : VTVec<2, bf16, 98>; // 2 x bf16 vector value
+def v3bf16 : VTVec<3, bf16, 99>; // 3 x bf16 vector value
+def v4bf16 : VTVec<4, bf16, 100>; // 4 x bf16 vector value
+def v8bf16 : VTVec<8, bf16, 101>; // 8 x bf16 vector value
+def v16bf16 : VTVec<16, bf16, 102>; // 16 x bf16 vector value
+def v32bf16 : VTVec<32, bf16, 103>; // 32 x bf16 vector value
+def v64bf16 : VTVec<64, bf16, 104>; // 64 x bf16 vector value
+def v128bf16 : VTVec<128, bf16, 105>; // 128 x bf16 vector value
+
+def v1f32 : VTVec<1, f32, 106>; // 1 x f32 vector value
+def v2f32 : VTVec<2, f32, 107>; // 2 x f32 vector value
+def v3f32 : VTVec<3, f32, 108>; // 3 x f32 vector value
+def v4f32 : VTVec<4, f32, 109>; // 4 x f32 vector value
+def v5f32 : VTVec<5, f32, 110>; // 5 x f32 vector value
+def v6f32 : VTVec<6, f32, 111>; // 6 x f32 vector value
+def v7f32 : VTVec<7, f32, 112>; // 7 x f32 vector value
+def v8f32 : VTVec<8, f32, 113>; // 8 x f32 vector value
+def v9f32 : VTVec<9, f32, 114>; // 9 x f32 vector value
+def v10f32 : VTVec<10, f32, 115>; // 10 x f32 vector value
+def v11f32 : VTVec<11, f32, 116>; // 11 x f32 vector value
+def v12f32 : VTVec<12, f32, 117>; // 12 x f32 vector value
+def v16f32 : VTVec<16, f32, 118>; // 16 x f32 vector value
+def v32f32 : VTVec<32, f32, 119>; // 32 x f32 vector value
+def v64f32 : VTVec<64, f32, 120>; // 64 x f32 vector value
+def v128f32 : VTVec<128, f32, 121>; // 128 x f32 vector value
+def v256f32 : VTVec<256, f32, 122>; // 256 x f32 vector value
+def v512f32 : VTVec<512, f32, 123>; // 512 x f32 vector value
+def v1024f32 : VTVec<1024, f32, 124>; // 1024 x f32 vector value
+def v2048f32 : VTVec<2048, f32, 125>; // 2048 x f32 vector value
+
+def v1f64 : VTVec<1, f64, 126>; // 1 x f64 vector value
+def v2f64 : VTVec<2, f64, 127>; // 2 x f64 vector value
+def v3f64 : VTVec<3, f64, 128>; // 3 x f64 vector value
+def v4f64 : VTVec<4, f64, 129>; // 4 x f64 vector value
+def v8f64 : VTVec<8, f64, 130>; // 8 x f64 vector value
+def v16f64 : VTVec<16, f64, 131>; // 16 x f64 vector value
+def v32f64 : VTVec<32, f64, 132>; // 32 x f64 vector value
+def v64f64 : VTVec<64, f64, 133>; // 64 x f64 vector value
+def v128f64 : VTVec<128, f64, 134>; // 128 x f64 vector value
+def v256f64 : VTVec<256, f64, 135>; // 256 x f64 vector value
+
+def nxv1i1 : VTScalableVec<1, i1, 136>; // n x 1 x i1 vector value
+def nxv2i1 : VTScalableVec<2, i1, 137>; // n x 2 x i1 vector value
+def nxv4i1 : VTScalableVec<4, i1, 138>; // n x 4 x i1 vector value
+def nxv8i1 : VTScalableVec<8, i1, 139>; // n x 8 x i1 vector value
+def nxv16i1 : VTScalableVec<16, i1, 140>; // n x 16 x i1 vector value
+def nxv32i1 : VTScalableVec<32, i1, 141>; // n x 32 x i1 vector value
+def nxv64i1 : VTScalableVec<64, i1, 142>; // n x 64 x i1 vector value
+
+def nxv1i8 : VTScalableVec<1, i8, 143>; // n x 1 x i8 vector value
+def nxv2i8 : VTScalableVec<2, i8, 144>; // n x 2 x i8 vector value
+def nxv4i8 : VTScalableVec<4, i8, 145>; // n x 4 x i8 vector value
+def nxv8i8 : VTScalableVec<8, i8, 146>; // n x 8 x i8 vector value
+def nxv16i8 : VTScalableVec<16, i8, 147>; // n x 16 x i8 vector value
+def nxv32i8 : VTScalableVec<32, i8, 148>; // n x 32 x i8 vector value
+def nxv64i8 : VTScalableVec<64, i8, 149>; // n x 64 x i8 vector value
+
+def nxv1i16 : VTScalableVec<1, i16, 150>; // n x 1 x i16 vector value
+def nxv2i16 : VTScalableVec<2, i16, 151>; // n x 2 x i16 vector value
+def nxv4i16 : VTScalableVec<4, i16, 152>; // n x 4 x i16 vector value
+def nxv8i16 : VTScalableVec<8, i16, 153>; // n x 8 x i16 vector value
+def nxv16i16 : VTScalableVec<16, i16, 154>; // n x 16 x i16 vector value
+def nxv32i16 : VTScalableVec<32, i16, 155>; // n x 32 x i16 vector value
+
+def nxv1i32 : VTScalableVec<1, i32, 156>; // n x 1 x i32 vector value
+def nxv2i32 : VTScalableVec<2, i32, 157>; // n x 2 x i32 vector value
+def nxv4i32 : VTScalableVec<4, i32, 158>; // n x 4 x i32 vector value
+def nxv8i32 : VTScalableVec<8, i32, 159>; // n x 8 x i32 vector value
+def nxv16i32 : VTScalableVec<16, i32, 160>; // n x 16 x i32 vector value
+def nxv32i32 : VTScalableVec<32, i32, 161>; // n x 32 x i32 vector value
+
+def nxv1i64 : VTScalableVec<1, i64, 162>; // n x 1 x i64 vector value
+def nxv2i64 : VTScalableVec<2, i64, 163>; // n x 2 x i64 vector value
+def nxv4i64 : VTScalableVec<4, i64, 164>; // n x 4 x i64 vector value
+def nxv8i64 : VTScalableVec<8, i64, 165>; // n x 8 x i64 vector value
+def nxv16i64 : VTScalableVec<16, i64, 166>; // n x 16 x i64 vector value
+def nxv32i64 : VTScalableVec<32, i64, 167>; // n x 32 x i64 vector value
+
+def nxv1f16 : VTScalableVec<1, f16, 168>; // n x 1 x f16 vector value
+def nxv2f16 : VTScalableVec<2, f16, 169>; // n x 2 x f16 vector value
+def nxv4f16 : VTScalableVec<4, f16, 170>; // n x 4 x f16 vector value
+def nxv8f16 : VTScalableVec<8, f16, 171>; // n x 8 x f16 vector value
+def nxv16f16 : VTScalableVec<16, f16, 172>; // n x 16 x f16 vector value
+def nxv32f16 : VTScalableVec<32, f16, 173>; // n x 32 x f16 vector value
+
+def nxv1bf16 : VTScalableVec<1, bf16, 174>; // n x 1 x bf16 vector value
+def nxv2bf16 : VTScalableVec<2, bf16, 175>; // n x 2 x bf16 vector value
+def nxv4bf16 : VTScalableVec<4, bf16, 176>; // n x 4 x bf16 vector value
+def nxv8bf16 : VTScalableVec<8, bf16, 177>; // n x 8 x bf16 vector value
+def nxv16bf16 : VTScalableVec<16, bf16, 178>; // n x 16 x bf16 vector value
+def nxv32bf16 : VTScalableVec<32, bf16, 179>; // n x 32 x bf16 vector value
+
+def nxv1f32 : VTScalableVec<1, f32, 180>; // n x 1 x f32 vector value
+def nxv2f32 : VTScalableVec<2, f32, 181>; // n x 2 x f32 vector value
+def nxv4f32 : VTScalableVec<4, f32, 182>; // n x 4 x f32 vector value
+def nxv8f32 : VTScalableVec<8, f32, 183>; // n x 8 x f32 vector value
+def nxv16f32 : VTScalableVec<16, f32, 184>; // n x 16 x f32 vector value
+
+def nxv1f64 : VTScalableVec<1, f64, 185>; // n x 1 x f64 vector value
+def nxv2f64 : VTScalableVec<2, f64, 186>; // n x 2 x f64 vector value
+def nxv4f64 : VTScalableVec<4, f64, 187>; // n x 4 x f64 vector value
+def nxv8f64 : VTScalableVec<8, f64, 188>; // n x 8 x f64 vector value
+
+def x86mmx : ValueType<64, 189>; // X86 MMX value
+def FlagVT : ValueType<0, 190> { // Pre-RA sched glue
let LLVMName = "Glue";
}
-def isVoid : ValueType<0, 190>; // Produces no value
-def untyped : ValueType<8, 191> { // Produces an untyped value
+def isVoid : ValueType<0, 191>; // Produces no value
+def untyped : ValueType<8, 192> { // Produces an untyped value
let LLVMName = "Untyped";
}
-def funcref : ValueType<0, 192>; // WebAssembly's funcref type
-def externref : ValueType<0, 193>; // WebAssembly's externref type
-def x86amx : ValueType<8192, 194>; // X86 AMX value
-def i64x8 : ValueType<512, 195>; // 8 Consecutive GPRs (AArch64)
+def funcref : ValueType<0, 193>; // WebAssembly's funcref type
+def externref : ValueType<0, 194>; // WebAssembly's externref type
+def x86amx : ValueType<8192, 195>; // X86 AMX value
+def i64x8 : ValueType<512, 196>; // 8 Consecutive GPRs (AArch64)
def aarch64svcount
- : ValueType<16, 196>; // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 197>; // SPIR-V's builtin type
+ : ValueType<16, 197>; // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 198>; // SPIR-V's builtin type
def token : ValueType<0, 248>; // TokenTy
def MetadataVT : ValueType<0, 249> { // Metadata
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index cc33152e06f4..5a16cffb80b3 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -134,15 +134,17 @@ public:
explicit LLT(MVT VT);
- constexpr bool isValid() const { return IsScalar || RawData != 0; }
+ constexpr bool isValid() const { return IsScalar || IsPointer || IsVector; }
constexpr bool isScalar() const { return IsScalar; }
- constexpr bool isPointer() const {
- return isValid() && IsPointer && !IsVector;
- }
+ constexpr bool isPointer() const { return IsPointer && !IsVector; }
+
+ constexpr bool isPointerVector() const { return IsPointer && IsVector; }
+
+ constexpr bool isPointerOrPointerVector() const { return IsPointer; }
- constexpr bool isVector() const { return isValid() && IsVector; }
+ constexpr bool isVector() const { return IsVector; }
/// Returns the number of elements in a vector LLT. Must only be called on
/// vector types.
@@ -209,7 +211,7 @@ public:
/// but the new element size. Otherwise, return the new element type. Invalid
/// for pointer types. For pointer types, use changeElementType.
constexpr LLT changeElementSize(unsigned NewEltSize) const {
- assert(!getScalarType().isPointer() &&
+ assert(!isPointerOrPointerVector() &&
"invalid to directly change element size for pointers");
return isVector() ? LLT::vector(getElementCount(), NewEltSize)
: LLT::scalar(NewEltSize);
diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h
index 0bfd06e05bd8..5cdff5ff0e82 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h
@@ -136,11 +136,13 @@ enum class CPUType : uint16_t {
HybridX86ARM64 = 0xf7,
ARM64EC = 0xf8,
ARM64X = 0xf9,
+ Unknown = 0xff,
D3D11_Shader = 0x100,
};
/// These values correspond to the CV_CFL_LANG enumeration in the Microsoft
-/// Debug Interface Access SDK
+/// Debug Interface Access SDK, and are documented here:
+/// https://learn.microsoft.com/en-us/visualstudio/debugger/debug-interface-access/cv-cfl-lang
enum SourceLanguage : uint8_t {
C = 0x00,
Cpp = 0x01,
@@ -161,13 +163,17 @@ enum SourceLanguage : uint8_t {
HLSL = 0x10,
ObjC = 0x11,
ObjCpp = 0x12,
-
+ Swift = 0x13,
+ AliasObj = 0x14,
Rust = 0x15,
+ Go = 0x16,
- /// The DMD & Swift compilers emit 'D' and 'S', respectively, for the CV
- /// source language. Microsoft does not have enumerators for them yet.
+ /// The DMD compiler emits 'D' for the CV source language. Microsoft does not
+ /// have an enumerator for it yet.
D = 'D',
- Swift = 'S',
+ /// The Swift compiler used to emit 'S' for the CV source language, but
+ /// current versions emit the enumerator defined above.
+ OldSwift = 'S',
};
/// These values correspond to the CV_call_e enumeration, and are documented
@@ -200,7 +206,8 @@ enum class CallingConvention : uint8_t {
ClrCall = 0x16, // clr call
Inline =
0x17, // Marker for routines always inlined and thus lacking a convention
- NearVector = 0x18 // near left to right push with regs, callee pops stack
+ NearVector = 0x18, // near left to right push with regs, callee pops stack
+ Swift = 0x19, // Swift call
};
enum class ClassOptions : uint16_t {
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index ea73664b1e46..c2365a4c7cf6 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -360,9 +360,9 @@ public:
///
/// \returns true if the .debug_line verifies successfully, false otherwise.
bool handleDebugStrOffsets();
- bool verifyDebugStrOffsets(
- StringRef SectionName, const DWARFSection &Section, StringRef StrData,
- void (DWARFObject::*)(function_ref<void(const DWARFSection &)>) const);
+ bool verifyDebugStrOffsets(std::optional<dwarf::DwarfFormat> LegacyFormat,
+ StringRef SectionName, const DWARFSection &Section,
+ StringRef StrData);
/// Emits any aggregate information collected, depending on the dump options
void summarize();
diff --git a/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h b/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h
index fd2433a677b8..3abf3bacabdd 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h
@@ -22,6 +22,7 @@ namespace gsym {
struct CUInfo;
struct FunctionInfo;
class GsymCreator;
+class OutputAggregator;
/// A class that transforms the DWARF in a DWARFContext into GSYM information
/// by populating the GsymCreator object that it is constructed with. This
@@ -52,9 +53,9 @@ public:
///
/// \returns An error indicating any fatal issues that happen when parsing
/// the DWARF, or Error::success() if all goes well.
- llvm::Error convert(uint32_t NumThreads, raw_ostream *OS);
+ llvm::Error convert(uint32_t NumThreads, OutputAggregator &OS);
- llvm::Error verify(StringRef GsymPath, raw_ostream &OS);
+ llvm::Error verify(StringRef GsymPath, OutputAggregator &OS);
private:
@@ -79,7 +80,7 @@ private:
/// information.
///
/// \param Die The DWARF debug info entry to parse.
- void handleDie(raw_ostream *Strm, CUInfo &CUI, DWARFDie Die);
+ void handleDie(OutputAggregator &Strm, CUInfo &CUI, DWARFDie Die);
DWARFContext &DICtx;
GsymCreator &Gsym;
diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h b/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h
index bb52f3f2cd56..855a275725c4 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h
@@ -28,6 +28,7 @@ namespace llvm {
namespace gsym {
class FileWriter;
+class OutputAggregator;
/// GsymCreator is used to emit GSYM data to a stand alone file or section
/// within a file.
@@ -360,7 +361,7 @@ public:
/// function infos, and function infos that were merged or removed.
/// \returns An error object that indicates success or failure of the
/// finalize.
- llvm::Error finalize(llvm::raw_ostream &OS);
+ llvm::Error finalize(OutputAggregator &OS);
/// Set the UUID value.
///
diff --git a/llvm/include/llvm/DebugInfo/GSYM/ObjectFileTransformer.h b/llvm/include/llvm/DebugInfo/GSYM/ObjectFileTransformer.h
index 2018e0962ca7..fe44e82e0dd2 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/ObjectFileTransformer.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/ObjectFileTransformer.h
@@ -13,8 +13,6 @@
namespace llvm {
-class raw_ostream;
-
namespace object {
class ObjectFile;
}
@@ -22,6 +20,7 @@ class ObjectFile;
namespace gsym {
class GsymCreator;
+class OutputAggregator;
class ObjectFileTransformer {
public:
@@ -40,8 +39,8 @@ public:
///
/// \returns An error indicating any fatal issues that happen when parsing
/// the DWARF, or Error::success() if all goes well.
- static llvm::Error convert(const object::ObjectFile &Obj, raw_ostream *Log,
- GsymCreator &Gsym);
+ static llvm::Error convert(const object::ObjectFile &Obj,
+ OutputAggregator &Output, GsymCreator &Gsym);
};
} // namespace gsym
diff --git a/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h b/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h
new file mode 100644
index 000000000000..8deea3bff1ed
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h
@@ -0,0 +1,75 @@
+//===- OutputAggregator.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_GSYM_OUTPUTAGGREGATOR_H
+#define LLVM_DEBUGINFO_GSYM_OUTPUTAGGREGATOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/GSYM/ExtractRanges.h"
+
+#include <map>
+#include <string>
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace gsym {
+
+class OutputAggregator {
+protected:
+ // A std::map is preferable over an llvm::StringMap for presenting results
+ // in a predictable order.
+ std::map<std::string, unsigned> Aggregation;
+ raw_ostream *Out;
+
+public:
+ OutputAggregator(raw_ostream *out) : Out(out) {}
+
+ size_t GetNumCategories() const { return Aggregation.size(); }
+
+ void Report(StringRef s, std::function<void(raw_ostream &o)> detailCallback) {
+ Aggregation[std::string(s)]++;
+ if (GetOS())
+ detailCallback(*Out);
+ }
+
+ void EnumerateResults(
+ std::function<void(StringRef, unsigned)> handleCounts) const {
+ for (auto &&[name, count] : Aggregation)
+ handleCounts(name, count);
+ }
+
+ raw_ostream *GetOS() const { return Out; }
+
+ // You can just use the stream, and if it's null, nothing happens.
+ // Don't do a lot of stuff like this, but it's convenient for silly stuff.
+ // It doesn't work with things that have custom insertion operators, though.
+ template <typename T> OutputAggregator &operator<<(T &&value) {
+ if (Out != nullptr)
+ *Out << value;
+ return *this;
+ }
+
+ // For multi-threaded usage, we can collect stuff in another aggregator,
+ // then merge it in here
+ void Merge(const OutputAggregator &other) {
+ for (auto &&[name, count] : other.Aggregation) {
+ auto it = Aggregation.find(name);
+ if (it == Aggregation.end())
+ Aggregation.emplace(name, count);
+ else
+ it->second += count;
+ }
+ }
+};
+
+} // namespace gsym
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_GSYM_OUTPUTAGGREGATOR_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h
index 6ee2deef04d0..887147a77026 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h
@@ -51,11 +51,32 @@ public:
/// Looks up symbols within the given dylib.
Expected<std::vector<ExecutorSymbolDef>>
- lookup(tpctypes::DylibHandle H, const SymbolLookupSet &Lookup);
+ lookup(tpctypes::DylibHandle H, const SymbolLookupSet &Lookup) {
+ std::promise<MSVCPExpected<std::vector<ExecutorSymbolDef>>> RP;
+ auto RF = RP.get_future();
+ lookupAsync(H, Lookup, [&RP](auto R) { RP.set_value(std::move(R)); });
+ return RF.get();
+ }
/// Looks up symbols within the given dylib.
Expected<std::vector<ExecutorSymbolDef>>
- lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &Lookup);
+ lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &Lookup) {
+ std::promise<MSVCPExpected<std::vector<ExecutorSymbolDef>>> RP;
+ auto RF = RP.get_future();
+ lookupAsync(H, Lookup, [&RP](auto R) { RP.set_value(std::move(R)); });
+ return RF.get();
+ }
+
+ using SymbolLookupCompleteFn =
+ unique_function<void(Expected<std::vector<ExecutorSymbolDef>>)>;
+
+ /// Looks up symbols within the given dylib.
+ void lookupAsync(tpctypes::DylibHandle H, const SymbolLookupSet &Lookup,
+ SymbolLookupCompleteFn Complete);
+
+ /// Looks up symbols within the given dylib.
+ void lookupAsync(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &Lookup,
+ SymbolLookupCompleteFn Complete);
private:
ExecutorProcessControl &EPC;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
index 9e42d6dd615d..6468f2dfc11a 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
@@ -290,8 +290,26 @@ public:
/// that correspond to the lookup order. If a required symbol is not
/// found then this method will return an error. If a weakly referenced
/// symbol is not found then it be assigned a '0' value.
- virtual Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(ArrayRef<LookupRequest> Request) = 0;
+ Expected<std::vector<tpctypes::LookupResult>>
+ lookupSymbols(ArrayRef<LookupRequest> Request) {
+ std::promise<MSVCPExpected<std::vector<tpctypes::LookupResult>>> RP;
+ auto RF = RP.get_future();
+ lookupSymbolsAsync(Request,
+ [&RP](auto Result) { RP.set_value(std::move(Result)); });
+ return RF.get();
+ }
+
+ using SymbolLookupCompleteFn =
+ unique_function<void(Expected<std::vector<tpctypes::LookupResult>>)>;
+
+ /// Search for symbols in the target process.
+ ///
+ /// The result of the lookup is a 2-dimensional array of target addresses
+ /// that correspond to the lookup order. If a required symbol is not
+ /// found then this method will return an error. If a weakly referenced
+ /// symbol is not found then it will be assigned a '0' value.
+ virtual void lookupSymbolsAsync(ArrayRef<LookupRequest> Request,
+ SymbolLookupCompleteFn F) = 0;
/// Run function with a main-like signature.
virtual Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
@@ -462,8 +480,8 @@ public:
llvm_unreachable("Unsupported");
}
- Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(ArrayRef<LookupRequest> Request) override {
+ void lookupSymbolsAsync(ArrayRef<LookupRequest> Request,
+ SymbolLookupCompleteFn F) override {
llvm_unreachable("Unsupported");
}
@@ -510,8 +528,8 @@ public:
Expected<tpctypes::DylibHandle> loadDylib(const char *DylibPath) override;
- Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(ArrayRef<LookupRequest> Request) override;
+ void lookupSymbolsAsync(ArrayRef<LookupRequest> Request,
+ SymbolLookupCompleteFn F) override;
Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
ArrayRef<std::string> Args) override;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h
index 25b79be48810..c10b8df01cc0 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h
@@ -71,8 +71,8 @@ public:
Expected<tpctypes::DylibHandle> loadDylib(const char *DylibPath) override;
- Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(ArrayRef<LookupRequest> Request) override;
+ void lookupSymbolsAsync(ArrayRef<LookupRequest> Request,
+ SymbolLookupCompleteFn F) override;
Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
ArrayRef<std::string> Args) override;
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 4b5a4423b768..1db4ff2f09bc 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -49,6 +49,7 @@ class DataLayout;
class StringRef;
class Type;
class Value;
+class UnreachableInst;
//===----------------------------------------------------------------------===//
// AllocaInst Class
@@ -3505,6 +3506,12 @@ public:
return cast<BasicBlock>(getOperand(1));
}
+ /// Returns true if the default branch must result in immediate undefined
+ /// behavior, false otherwise.
+ bool defaultDestUndefined() const {
+ return isa<UnreachableInst>(getDefaultDest()->getFirstNonPHIOrDbg());
+ }
+
void setDefaultDest(BasicBlock *DefaultCase) {
setOperand(1, reinterpret_cast<Value*>(DefaultCase));
}
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 3c19c7b06365..4becdd71cd44 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -870,6 +870,8 @@ def int_pcmarker : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>;
def int_readcyclecounter : DefaultAttrsIntrinsic<[llvm_i64_ty]>;
+def int_readsteadycounter : DefaultAttrsIntrinsic<[llvm_i64_ty]>;
+
// The assume intrinsic is marked InaccessibleMemOnly so that proper control
// dependencies will be maintained.
def int_assume : DefaultAttrsIntrinsic<
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index d825dc821564..726cea004606 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1625,7 +1625,8 @@ def int_nvvm_compiler_warn :
Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">;
def int_nvvm_reflect :
- Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.reflect">;
+ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem], "llvm.nvvm.reflect">,
+ ClangBuiltin<"__nvvm_reflect">;
// isspacep.{const, global, local, shared}
def int_nvvm_isspacep_const
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index ea0074d22a44..057dc64e88c2 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -38,4 +38,8 @@ let TargetPrefix = "spv" in {
// Expect, Assume Intrinsics
def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
def int_spv_expect : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
+
+ // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support.
+ def int_spv_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
+ Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
}
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index db1f44fea3b4..9d0b8782e8bf 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -844,6 +844,11 @@ struct AAMDNodes {
/// together. Different from `merge`, where different locations should
/// overlap each other, `concat` puts non-overlapping locations together.
AAMDNodes concat(const AAMDNodes &Other) const;
+
+ /// Create a new AAMDNode for accessing \p AccessSize bytes of this AAMDNode.
+ /// If this AAMDNode has !tbaa.struct and \p AccessSize matches the size of the
+ /// field at offset 0, get the TBAA tag describing the accessed field.
+ AAMDNodes adjustForAccess(unsigned AccessSize);
};
// Specialize DenseMapInfo for AAMDNodes.
@@ -1730,7 +1735,7 @@ class NamedMDNode : public ilist_node<NamedMDNode> {
explicit NamedMDNode(const Twine &N);
- template <class T1, class T2> class op_iterator_impl {
+ template <class T1> class op_iterator_impl {
friend class NamedMDNode;
const NamedMDNode *Node = nullptr;
@@ -1740,10 +1745,10 @@ class NamedMDNode : public ilist_node<NamedMDNode> {
public:
using iterator_category = std::bidirectional_iterator_tag;
- using value_type = T2;
+ using value_type = T1;
using difference_type = std::ptrdiff_t;
using pointer = value_type *;
- using reference = value_type &;
+ using reference = value_type;
op_iterator_impl() = default;
@@ -1804,12 +1809,12 @@ public:
// ---------------------------------------------------------------------------
// Operand Iterator interface...
//
- using op_iterator = op_iterator_impl<MDNode *, MDNode>;
+ using op_iterator = op_iterator_impl<MDNode *>;
op_iterator op_begin() { return op_iterator(this, 0); }
op_iterator op_end() { return op_iterator(this, getNumOperands()); }
- using const_op_iterator = op_iterator_impl<const MDNode *, MDNode>;
+ using const_op_iterator = op_iterator_impl<const MDNode *>;
const_op_iterator op_begin() const { return const_op_iterator(this, 0); }
const_op_iterator op_end() const { return const_op_iterator(this, getNumOperands()); }
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 3155e7dc38b6..fed552414298 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1614,6 +1614,21 @@ struct m_SplatOrUndefMask {
}
};
+template <typename PointerOpTy, typename OffsetOpTy> struct PtrAdd_match {
+ PointerOpTy PointerOp;
+ OffsetOpTy OffsetOp;
+
+ PtrAdd_match(const PointerOpTy &PointerOp, const OffsetOpTy &OffsetOp)
+ : PointerOp(PointerOp), OffsetOp(OffsetOp) {}
+
+ template <typename OpTy> bool match(OpTy *V) {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ return GEP && GEP->getSourceElementType()->isIntegerTy(8) &&
+ PointerOp.match(GEP->getPointerOperand()) &&
+ OffsetOp.match(GEP->idx_begin()->get());
+ }
+};
+
/// Matches ShuffleVectorInst independently of mask value.
template <typename V1_t, typename V2_t>
inline TwoOps_match<V1_t, V2_t, Instruction::ShuffleVector>
@@ -1647,6 +1662,13 @@ inline auto m_GEP(const OperandTypes &...Ops) {
return AnyOps_match<Instruction::GetElementPtr, OperandTypes...>(Ops...);
}
+/// Matches GEP with i8 source element type
+template <typename PointerOpTy, typename OffsetOpTy>
+inline PtrAdd_match<PointerOpTy, OffsetOpTy>
+m_PtrAdd(const PointerOpTy &PointerOp, const OffsetOpTy &OffsetOp) {
+ return PtrAdd_match<PointerOpTy, OffsetOpTy>(PointerOp, OffsetOp);
+}
+
//===----------------------------------------------------------------------===//
// Matchers for CastInst classes
//
diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h
index e52f0a4ff86e..fb4d11ec1d4d 100644
--- a/llvm/include/llvm/MC/MCRegisterInfo.h
+++ b/llvm/include/llvm/MC/MCRegisterInfo.h
@@ -572,9 +572,10 @@ public:
bool isValid() const { return SRIter.isValid(); }
/// Moves to the next position.
- void operator++() {
+ MCSubRegIndexIterator &operator++() {
++SRIter;
++SRIndex;
+ return *this;
}
};
@@ -688,9 +689,10 @@ public:
bool isValid() const { return RUIter.isValid(); }
/// Moves to the next position.
- void operator++() {
+ MCRegUnitMaskIterator &operator++() {
++MaskListIter;
++RUIter;
+ return *this;
}
};
@@ -728,10 +730,11 @@ public:
}
/// Preincrement to move to the next root register.
- void operator++() {
+ MCRegUnitRootIterator &operator++() {
assert(isValid() && "Cannot move off the end of the list.");
Reg0 = Reg1;
Reg1 = 0;
+ return *this;
}
};
@@ -788,10 +791,11 @@ public:
}
}
- void operator++() {
+ MCRegAliasIterator &operator++() {
assert(isValid() && "Cannot move off the end of the list.");
do advance();
while (!IncludeSelf && isValid() && *SI == Reg);
+ return *this;
}
};
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 0d9320ec2efd..383395941475 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -27,12 +27,7 @@
namespace llvm {
namespace objcopy {
-enum class FileFormat {
- Unspecified,
- ELF,
- Binary,
- IHex,
-};
+enum class FileFormat { Unspecified, ELF, Binary, IHex, SREC };
// This type keeps track of the machine info for various architectures. This
// lets us map architecture names to ELF types and the e_machine value of the
diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h
index a548b2c15c5f..2a5c3d8913b1 100644
--- a/llvm/include/llvm/Object/COFF.h
+++ b/llvm/include/llvm/Object/COFF.h
@@ -1362,6 +1362,47 @@ public:
SectionStrippedError() { setErrorCode(object_error::section_stripped); }
};
+inline std::optional<std::string>
+getArm64ECMangledFunctionName(StringRef Name) {
+ bool IsCppFn = Name[0] == '?';
+ if (IsCppFn && Name.find("$$h") != std::string::npos)
+ return std::nullopt;
+ if (!IsCppFn && Name[0] == '#')
+ return std::nullopt;
+
+ StringRef Prefix = "$$h";
+ size_t InsertIdx = 0;
+ if (IsCppFn) {
+ InsertIdx = Name.find("@@");
+ size_t ThreeAtSignsIdx = Name.find("@@@");
+ if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) {
+ InsertIdx += 2;
+ } else {
+ InsertIdx = Name.find("@");
+ if (InsertIdx != std::string::npos)
+ InsertIdx++;
+ }
+ } else {
+ Prefix = "#";
+ }
+
+ return std::optional<std::string>(
+ (Name.substr(0, InsertIdx) + Prefix + Name.substr(InsertIdx)).str());
+}
+
+inline std::optional<std::string>
+getArm64ECDemangledFunctionName(StringRef Name) {
+ if (Name[0] == '#')
+ return std::string(Name.substr(1));
+ if (Name[0] != '?')
+ return std::nullopt;
+
+ std::pair<StringRef, StringRef> Pair = Name.split("$$h");
+ if (Pair.second.empty())
+ return std::nullopt;
+ return (Pair.first + Pair.second).str();
+}
+
} // end namespace object
} // end namespace llvm
diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h
index 45a4a795fd19..402ded0d64fe 100644
--- a/llvm/include/llvm/Object/COFFImportFile.h
+++ b/llvm/include/llvm/Object/COFFImportFile.h
@@ -27,6 +27,9 @@ namespace llvm {
namespace object {
class COFFImportFile : public SymbolicFile {
+private:
+ enum SymbolIndex { ImpSymbol, ThunkSymbol, ECAuxSymbol, ECThunkSymbol };
+
public:
COFFImportFile(MemoryBufferRef Source)
: SymbolicFile(ID_COFFImportFile, Source) {}
@@ -36,9 +39,23 @@ public:
void moveSymbolNext(DataRefImpl &Symb) const override { ++Symb.p; }
Error printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override {
- if (Symb.p == 0)
+ switch (Symb.p) {
+ case ImpSymbol:
OS << "__imp_";
- OS << StringRef(Data.getBufferStart() + sizeof(coff_import_header));
+ break;
+ case ECAuxSymbol:
+ OS << "__imp_aux_";
+ break;
+ }
+ const char *Name = Data.getBufferStart() + sizeof(coff_import_header);
+ if (Symb.p != ECThunkSymbol && COFF::isArm64EC(getMachine())) {
+ if (std::optional<std::string> DemangledName =
+ getArm64ECDemangledFunctionName(Name)) {
+ OS << StringRef(*DemangledName);
+ return Error::success();
+ }
+ }
+ OS << StringRef(Name);
return Error::success();
}
@@ -52,7 +69,12 @@ public:
basic_symbol_iterator symbol_end() const override {
DataRefImpl Symb;
- Symb.p = isData() ? 1 : 2;
+ if (isData())
+ Symb.p = ImpSymbol + 1;
+ else if (COFF::isArm64EC(getMachine()))
+ Symb.p = ECThunkSymbol + 1;
+ else
+ Symb.p = ThunkSymbol + 1;
return BasicSymbolRef(Symb, this);
}
@@ -92,6 +114,10 @@ struct COFFShortExport {
/// file, this is "baz" in "EXPORTS\nfoo = bar == baz".
std::string AliasTarget;
+ /// Specifies EXPORTAS name. In a .def file, this is "bar" in
+ /// "EXPORTS\nfoo EXPORTAS bar".
+ std::string ExportAs;
+
uint16_t Ordinal = 0;
bool Noname = false;
bool Data = false;
@@ -109,9 +135,20 @@ struct COFFShortExport {
}
};
-Error writeImportLibrary(StringRef ImportName, StringRef Path,
- ArrayRef<COFFShortExport> Exports,
- COFF::MachineTypes Machine, bool MinGW);
+/// Writes a COFF import library containing entries described by the Exports
+/// array.
+///
+/// For hybrid targets such as ARM64EC, additional native entry points can be
+/// exposed using the NativeExports parameter. When NativeExports is used, the
+/// output import library will expose these native ARM64 imports alongside the
+/// entries described in the Exports array. Such a library can be used for
+/// linking both ARM64EC and pure ARM64 objects, and the linker will pick only
+/// the exports relevant to the target platform. For non-hybrid targets,
+/// the NativeExports parameter should not be used.
+Error writeImportLibrary(
+ StringRef ImportName, StringRef Path, ArrayRef<COFFShortExport> Exports,
+ COFF::MachineTypes Machine, bool MinGW,
+ ArrayRef<COFFShortExport> NativeExports = std::nullopt);
} // namespace object
} // namespace llvm
diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h
index 13d9a17e24c3..b8f7bb45d73a 100644
--- a/llvm/include/llvm/Object/Wasm.h
+++ b/llvm/include/llvm/Object/Wasm.h
@@ -179,6 +179,7 @@ public:
Expected<SymbolRef::Type> getSymbolType(DataRefImpl Symb) const override;
Expected<section_iterator> getSymbolSection(DataRefImpl Symb) const override;
uint32_t getSymbolSectionId(SymbolRef Sym) const;
+ uint32_t getSymbolSize(SymbolRef Sym) const;
// Overrides from SectionRef.
void moveSectionNext(DataRefImpl &Sec) const override;
diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
index ee421b2efc72..5e1b88f4fef6 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
@@ -42,6 +42,7 @@ Error emitDebugAddr(raw_ostream &OS, const Data &DI);
Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI);
Error emitDebugRnglists(raw_ostream &OS, const Data &DI);
Error emitDebugLoclists(raw_ostream &OS, const Data &DI);
+Error emitDebugNames(raw_ostream &OS, const Data &DI);
std::function<Error(raw_ostream &, const Data &)>
getDWARFEmitterByName(StringRef SecName);
diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index a70ddf3a180a..0b3bea786d31 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -118,6 +118,28 @@ struct Unit {
std::vector<Entry> Entries;
};
+struct IdxForm {
+ dwarf::Index Idx;
+ dwarf::Form Form;
+};
+
+struct DebugNameAbbreviation {
+ yaml::Hex64 Code;
+ dwarf::Tag Tag;
+ std::vector<IdxForm> Indices;
+};
+
+struct DebugNameEntry {
+ yaml::Hex32 NameStrp;
+ yaml::Hex64 Code;
+ std::vector<yaml::Hex64> Values;
+};
+
+struct DebugNamesSection {
+ std::vector<DebugNameAbbreviation> Abbrevs;
+ std::vector<DebugNameEntry> Entries;
+};
+
struct File {
StringRef Name;
uint64_t DirIdx;
@@ -228,6 +250,7 @@ struct Data {
std::vector<LineTable> DebugLines;
std::optional<std::vector<ListTable<RnglistEntry>>> DebugRnglists;
std::optional<std::vector<ListTable<LoclistEntry>>> DebugLoclists;
+ std::optional<DebugNamesSection> DebugNames;
bool isEmpty() const;
@@ -276,6 +299,9 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(
llvm::DWARFYAML::ListEntries<DWARFYAML::LoclistEntry>)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::LoclistEntry)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::DWARFOperation)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::DebugNameEntry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::DebugNameAbbreviation)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::IdxForm)
namespace llvm {
namespace yaml {
@@ -324,6 +350,19 @@ template <> struct MappingTraits<DWARFYAML::Unit> {
static void mapping(IO &IO, DWARFYAML::Unit &Unit);
};
+template <> struct MappingTraits<DWARFYAML::DebugNamesSection> {
+ static void mapping(IO &IO, DWARFYAML::DebugNamesSection &);
+};
+template <> struct MappingTraits<DWARFYAML::DebugNameEntry> {
+ static void mapping(IO &IO, DWARFYAML::DebugNameEntry &);
+};
+template <> struct MappingTraits<DWARFYAML::DebugNameAbbreviation> {
+ static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &);
+};
+template <> struct MappingTraits<DWARFYAML::IdxForm> {
+ static void mapping(IO &IO, DWARFYAML::IdxForm &);
+};
+
template <> struct MappingTraits<DWARFYAML::Entry> {
static void mapping(IO &IO, DWARFYAML::Entry &Entry);
};
@@ -437,6 +476,16 @@ template <> struct ScalarEnumerationTraits<dwarf::Form> {
}
};
+#define HANDLE_DW_IDX(unused, name) \
+ io.enumCase(value, "DW_IDX_" #name, dwarf::DW_IDX_##name);
+
+template <> struct ScalarEnumerationTraits<dwarf::Index> {
+ static void enumeration(IO &io, dwarf::Index &value) {
+#include "llvm/BinaryFormat/Dwarf.def"
+ io.enumFallback<Hex16>(value);
+ }
+};
+
#define HANDLE_DW_UT(unused, name) \
io.enumCase(value, "DW_UT_" #name, dwarf::DW_UT_##name);
diff --git a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h
index f1e821fe5fa3..dd359ac8e53d 100644
--- a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h
@@ -121,6 +121,9 @@ struct CsectAuxEnt : AuxSymbolEnt {
// Common fields for both XCOFF32 and XCOFF64.
std::optional<uint32_t> ParameterHashIndex;
std::optional<uint16_t> TypeChkSectNum;
+ std::optional<XCOFF::SymbolType> SymbolType;
+ std::optional<uint8_t> SymbolAlignment;
+ // The two previous values can be encoded as a single value.
std::optional<uint8_t> SymbolAlignmentAndType;
std::optional<XCOFF::StorageMappingClass> StorageMappingClass;
@@ -237,6 +240,10 @@ template <> struct ScalarEnumerationTraits<XCOFF::StorageMappingClass> {
static void enumeration(IO &IO, XCOFF::StorageMappingClass &Value);
};
+template <> struct ScalarEnumerationTraits<XCOFF::SymbolType> {
+ static void enumeration(IO &IO, XCOFF::SymbolType &Value);
+};
+
template <> struct ScalarEnumerationTraits<XCOFF::CFileStringType> {
static void enumeration(IO &IO, XCOFF::CFileStringType &Type);
};
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index fa6dbd4a4973..80bbfb75185a 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -7,8 +7,7 @@
//===----------------------------------------------------------------------===//
/// \file
///
-/// Interfaces for registering analysis passes, producing common pass manager
-/// configurations, and parsing of pass pipelines.
+/// Interfaces for producing common pass manager configurations.
///
//===----------------------------------------------------------------------===//
@@ -137,16 +136,6 @@ public:
raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
CodeGenFileType FileType) const;
- void registerModuleAnalyses(ModuleAnalysisManager &) const;
- void registerFunctionAnalyses(FunctionAnalysisManager &) const;
- void registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &) const;
-
- void registerAnalyses(MachineFunctionAnalysisManager &MFAM) const {
- registerModuleAnalyses(*MFAM.MAM);
- registerFunctionAnalyses(*MFAM.FAM);
- registerMachineFunctionAnalyses(MFAM);
- }
-
PassInstrumentationCallbacks *getPassInstrumentationCallbacks() const {
return PIC;
}
@@ -239,14 +228,6 @@ protected:
CGPassBuilderOption Opt;
PassInstrumentationCallbacks *PIC;
- /// Target override these hooks to parse target-specific analyses.
- void registerTargetAnalysis(ModuleAnalysisManager &) const {}
- void registerTargetAnalysis(FunctionAnalysisManager &) const {}
- void registerTargetAnalysis(MachineFunctionAnalysisManager &) const {}
- std::pair<StringRef, bool> getTargetPassNameFromLegacyName(StringRef) const {
- return {"", false};
- }
-
template <typename TMC> TMC &getTM() const { return static_cast<TMC &>(TM); }
CodeGenOptLevel getOptLevel() const { return TM.getOptLevel(); }
@@ -577,52 +558,6 @@ Error CodeGenPassBuilder<Derived>::verifyStartStop(
return Error::success();
}
-static inline AAManager registerAAAnalyses() {
- AAManager AA;
-
- // The order in which these are registered determines their priority when
- // being queried.
-
- // Basic AliasAnalysis support.
- // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
- // BasicAliasAnalysis wins if they disagree. This is intended to help
- // support "obvious" type-punning idioms.
- AA.registerFunctionAnalysis<TypeBasedAA>();
- AA.registerFunctionAnalysis<ScopedNoAliasAA>();
- AA.registerFunctionAnalysis<BasicAA>();
-
- return AA;
-}
-
-template <typename Derived>
-void CodeGenPassBuilder<Derived>::registerModuleAnalyses(
- ModuleAnalysisManager &MAM) const {
-#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
- MAM.registerPass([&] { return CREATE_PASS; });
-#include "MachinePassRegistry.def"
- derived().registerTargetAnalysis(MAM);
-}
-
-template <typename Derived>
-void CodeGenPassBuilder<Derived>::registerFunctionAnalyses(
- FunctionAnalysisManager &FAM) const {
- FAM.registerPass([this] { return registerAAAnalyses(); });
-
-#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
- FAM.registerPass([&] { return CREATE_PASS; });
-#include "MachinePassRegistry.def"
- derived().registerTargetAnalysis(FAM);
-}
-
-template <typename Derived>
-void CodeGenPassBuilder<Derived>::registerMachineFunctionAnalyses(
- MachineFunctionAnalysisManager &MFAM) const {
-#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
- MFAM.registerPass([&] { return CREATE_PASS; });
-#include "MachinePassRegistry.def"
- derived().registerTargetAnalysis(MFAM);
-}
-
template <typename Derived>
void CodeGenPassBuilder<Derived>::addISelPasses(AddIRPass &addPass) const {
derived().addGlobalMergePass(addPass);
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 88ec60c7aa33..c6fbdb512b80 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -23,6 +23,7 @@
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Object/BuildID.h"
+#include "llvm/ProfileData/Coverage/MCDCTypes.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Compiler.h"
@@ -249,19 +250,6 @@ struct CounterMappingRegion {
MCDCBranchRegion
};
- using MCDCConditionID = unsigned int;
- struct MCDCParameters {
- /// Byte Index of Bitmap Coverage Object for a Decision Region.
- unsigned BitmapIdx = 0;
-
- /// Number of Conditions used for a Decision Region.
- unsigned NumConditions = 0;
-
- /// IDs used to represent a branch region and other branch regions
- /// evaluated based on True and False branches.
- MCDCConditionID ID = 0, TrueID = 0, FalseID = 0;
- };
-
/// Primary Counter that is also used for Branch Regions (TrueCount).
Counter Count;
@@ -269,7 +257,25 @@ struct CounterMappingRegion {
Counter FalseCount;
/// Parameters used for Modified Condition/Decision Coverage
- MCDCParameters MCDCParams;
+ mcdc::Parameters MCDCParams;
+
+ template <class MaybeConstInnerParameters, class MaybeConstMCDCParameters>
+ static auto &getParams(MaybeConstMCDCParameters &MCDCParams) {
+ using InnerParameters =
+ typename std::remove_const<MaybeConstInnerParameters>::type;
+ MaybeConstInnerParameters *Params =
+ std::get_if<InnerParameters>(&MCDCParams);
+ assert(Params && "InnerParameters unavailable");
+ return *Params;
+ }
+
+ const auto &getDecisionParams() const {
+ return getParams<const mcdc::DecisionParameters>(MCDCParams);
+ }
+
+ const auto &getBranchParams() const {
+ return getParams<const mcdc::BranchParameters>(MCDCParams);
+ }
unsigned FileID = 0;
unsigned ExpandedFileID = 0;
@@ -284,19 +290,20 @@ struct CounterMappingRegion {
LineStart(LineStart), ColumnStart(ColumnStart), LineEnd(LineEnd),
ColumnEnd(ColumnEnd), Kind(Kind) {}
- CounterMappingRegion(Counter Count, Counter FalseCount,
- MCDCParameters MCDCParams, unsigned FileID,
+ CounterMappingRegion(Counter Count, Counter FalseCount, unsigned FileID,
unsigned ExpandedFileID, unsigned LineStart,
unsigned ColumnStart, unsigned LineEnd,
- unsigned ColumnEnd, RegionKind Kind)
+ unsigned ColumnEnd, RegionKind Kind,
+ const mcdc::Parameters &MCDCParams = std::monostate())
: Count(Count), FalseCount(FalseCount), MCDCParams(MCDCParams),
FileID(FileID), ExpandedFileID(ExpandedFileID), LineStart(LineStart),
ColumnStart(ColumnStart), LineEnd(LineEnd), ColumnEnd(ColumnEnd),
Kind(Kind) {}
- CounterMappingRegion(MCDCParameters MCDCParams, unsigned FileID,
- unsigned LineStart, unsigned ColumnStart,
- unsigned LineEnd, unsigned ColumnEnd, RegionKind Kind)
+ CounterMappingRegion(const mcdc::DecisionParameters &MCDCParams,
+ unsigned FileID, unsigned LineStart,
+ unsigned ColumnStart, unsigned LineEnd,
+ unsigned ColumnEnd, RegionKind Kind)
: MCDCParams(MCDCParams), FileID(FileID), LineStart(LineStart),
ColumnStart(ColumnStart), LineEnd(LineEnd), ColumnEnd(ColumnEnd),
Kind(Kind) {}
@@ -333,26 +340,20 @@ struct CounterMappingRegion {
static CounterMappingRegion
makeBranchRegion(Counter Count, Counter FalseCount, unsigned FileID,
unsigned LineStart, unsigned ColumnStart, unsigned LineEnd,
- unsigned ColumnEnd) {
- return CounterMappingRegion(Count, FalseCount, MCDCParameters(), FileID, 0,
- LineStart, ColumnStart, LineEnd, ColumnEnd,
- BranchRegion);
+ unsigned ColumnEnd,
+ const mcdc::Parameters &MCDCParams = std::monostate()) {
+ return CounterMappingRegion(
+ Count, FalseCount, FileID, 0, LineStart, ColumnStart, LineEnd,
+ ColumnEnd,
+ (std::get_if<mcdc::BranchParameters>(&MCDCParams) ? MCDCBranchRegion
+ : BranchRegion),
+ MCDCParams);
}
static CounterMappingRegion
- makeBranchRegion(Counter Count, Counter FalseCount, MCDCParameters MCDCParams,
- unsigned FileID, unsigned LineStart, unsigned ColumnStart,
- unsigned LineEnd, unsigned ColumnEnd) {
- return CounterMappingRegion(Count, FalseCount, MCDCParams, FileID, 0,
- LineStart, ColumnStart, LineEnd, ColumnEnd,
- MCDCParams.ID == 0 ? BranchRegion
- : MCDCBranchRegion);
- }
-
- static CounterMappingRegion
- makeDecisionRegion(MCDCParameters MCDCParams, unsigned FileID,
- unsigned LineStart, unsigned ColumnStart, unsigned LineEnd,
- unsigned ColumnEnd) {
+ makeDecisionRegion(const mcdc::DecisionParameters &MCDCParams,
+ unsigned FileID, unsigned LineStart, unsigned ColumnStart,
+ unsigned LineEnd, unsigned ColumnEnd) {
return CounterMappingRegion(MCDCParams, FileID, LineStart, ColumnStart,
LineEnd, ColumnEnd, MCDCDecisionRegion);
}
@@ -407,17 +408,20 @@ private:
LineColPairMap CondLoc;
public:
- MCDCRecord(CounterMappingRegion Region, TestVectors TV,
- TVPairMap IndependencePairs, BoolVector Folded, CondIDMap PosToID,
- LineColPairMap CondLoc)
- : Region(Region), TV(TV), IndependencePairs(IndependencePairs),
- Folded(Folded), PosToID(PosToID), CondLoc(CondLoc){};
+ MCDCRecord(const CounterMappingRegion &Region, TestVectors &&TV,
+ TVPairMap &&IndependencePairs, BoolVector &&Folded,
+ CondIDMap &&PosToID, LineColPairMap &&CondLoc)
+ : Region(Region), TV(std::move(TV)),
+ IndependencePairs(std::move(IndependencePairs)),
+ Folded(std::move(Folded)), PosToID(std::move(PosToID)),
+ CondLoc(std::move(CondLoc)){};
CounterMappingRegion getDecisionRegion() const { return Region; }
unsigned getNumConditions() const {
- assert(Region.MCDCParams.NumConditions != 0 &&
+ unsigned NumConditions = Region.getDecisionParams().NumConditions;
+ assert(NumConditions != 0 &&
"In MC/DC, NumConditions should never be zero!");
- return Region.MCDCParams.NumConditions;
+ return NumConditions;
}
unsigned getNumTestVectors() const { return TV.size(); }
bool isCondFolded(unsigned Condition) const { return Folded[Condition]; }
@@ -614,7 +618,9 @@ struct FunctionRecord {
FunctionRecord(FunctionRecord &&FR) = default;
FunctionRecord &operator=(FunctionRecord &&) = default;
- void pushMCDCRecord(MCDCRecord Record) { MCDCRecords.push_back(Record); }
+ void pushMCDCRecord(MCDCRecord &&Record) {
+ MCDCRecords.push_back(std::move(Record));
+ }
void pushRegion(CounterMappingRegion Region, uint64_t Count,
uint64_t FalseCount) {
diff --git a/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h b/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h
new file mode 100644
index 000000000000..d7520fa08242
--- /dev/null
+++ b/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h
@@ -0,0 +1,51 @@
+//===- MCDCTypes.h - Types related to MC/DC Coverage ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Types related to MC/DC Coverage.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_COVERAGE_MCDCTYPES_H
+#define LLVM_PROFILEDATA_COVERAGE_MCDCTYPES_H
+
+#include <variant>
+
+namespace llvm::coverage::mcdc {
+
+/// The ID for MCDCBranch.
+using ConditionID = unsigned int;
+
+struct DecisionParameters {
+ /// Byte Index of Bitmap Coverage Object for a Decision Region.
+ unsigned BitmapIdx;
+
+ /// Number of Conditions used for a Decision Region.
+ unsigned NumConditions;
+
+ DecisionParameters() = delete;
+ DecisionParameters(unsigned BitmapIdx, unsigned NumConditions)
+ : BitmapIdx(BitmapIdx), NumConditions(NumConditions) {}
+};
+
+struct BranchParameters {
+ /// IDs used to represent a branch region and other branch regions
+ /// evaluated based on True and False branches.
+ ConditionID ID, TrueID, FalseID;
+
+ BranchParameters() = delete;
+ BranchParameters(ConditionID ID, ConditionID TrueID, ConditionID FalseID)
+ : ID(ID), TrueID(TrueID), FalseID(FalseID) {}
+};
+
+/// The type of MC/DC-specific parameters.
+using Parameters =
+ std::variant<std::monostate, DecisionParameters, BranchParameters>;
+
+} // namespace llvm::coverage::mcdc
+
+#endif // LLVM_PROFILEDATA_COVERAGE_MCDCTYPES_H
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 53108a093bf4..91151eb5b102 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -487,6 +487,17 @@ private:
return "** External Symbol **";
}
+ // Returns the canonial name of the given PGOName. In a canonical name, all
+ // suffixes that begins with "." except ".__uniq." are stripped.
+ // FIXME: Unify this with `FunctionSamples::getCanonicalFnName`.
+ static StringRef getCanonicalName(StringRef PGOName);
+
+ // Add the function into the symbol table, by creating the following
+ // map entries:
+ // name-set = {PGOFuncName} + {getCanonicalName(PGOFuncName)} if the canonical
+ // name is different from pgo name
+ // - In MD5NameMap: <MD5Hash(name), name> for name in name-set
+ // - In MD5FuncMap: <MD5Hash(name), &F> for name in name-set
Error addFuncWithName(Function &F, StringRef PGOFuncName);
// If the symtab is created by a series of calls to \c addFuncName, \c
diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h
index d5e0f4031b0f..76ac7ab74a32 100644
--- a/llvm/include/llvm/Support/AMDGPUMetadata.h
+++ b/llvm/include/llvm/Support/AMDGPUMetadata.h
@@ -47,7 +47,7 @@ constexpr uint32_t VersionMinorV5 = 2;
/// HSA metadata major version for code object V6.
constexpr uint32_t VersionMajorV6 = 1;
/// HSA metadata minor version for code object V6.
-constexpr uint32_t VersionMinorV6 = 3;
+constexpr uint32_t VersionMinorV6 = 2;
/// Old HSA metadata beginning assembler directive for V2. This is only used for
/// diagnostics now.
diff --git a/llvm/include/llvm/Support/PGOOptions.h b/llvm/include/llvm/Support/PGOOptions.h
index 87eb29a8de48..de981abf1870 100644
--- a/llvm/include/llvm/Support/PGOOptions.h
+++ b/llvm/include/llvm/Support/PGOOptions.h
@@ -27,10 +27,12 @@ class FileSystem;
struct PGOOptions {
enum PGOAction { NoAction, IRInstr, IRUse, SampleUse };
enum CSPGOAction { NoCSAction, CSIRInstr, CSIRUse };
+ enum class ColdFuncOpt { Default, OptSize, MinSize, OptNone };
PGOOptions(std::string ProfileFile, std::string CSProfileGenFile,
std::string ProfileRemappingFile, std::string MemoryProfile,
IntrusiveRefCntPtr<vfs::FileSystem> FS,
PGOAction Action = NoAction, CSPGOAction CSAction = NoCSAction,
+ ColdFuncOpt ColdType = ColdFuncOpt::Default,
bool DebugInfoForProfiling = false,
bool PseudoProbeForProfiling = false,
bool AtomicCounterUpdate = false);
@@ -44,6 +46,7 @@ struct PGOOptions {
std::string MemoryProfile;
PGOAction Action;
CSPGOAction CSAction;
+ ColdFuncOpt ColdOptType;
bool DebugInfoForProfiling;
bool PseudoProbeForProfiling;
bool AtomicCounterUpdate;
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index abb237083d25..42cb854d9505 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -352,6 +352,9 @@ HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUNDEVEN)
/// INTRINSIC readcyclecounter
HANDLE_TARGET_OPCODE(G_READCYCLECOUNTER)
+/// INTRINSIC readsteadycounter
+HANDLE_TARGET_OPCODE(G_READSTEADYCOUNTER)
+
/// Generic load (including anyext load)
HANDLE_TARGET_OPCODE(G_LOAD)
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 2c73b67f9e1a..19197f50d9df 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1101,6 +1101,12 @@ def G_READCYCLECOUNTER : GenericInstruction {
let hasSideEffects = true;
}
+def G_READSTEADYCOUNTER : GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins);
+ let hasSideEffects = true;
+}
+
//------------------------------------------------------------------------------
// Memory ops
//------------------------------------------------------------------------------
diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index f792237203b4..b1f3c500a1b6 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -130,6 +130,7 @@ let IfConvergent = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS in {
}
def : GINodeEquiv<G_BR, br>;
+def : GINodeEquiv<G_BRCOND, brcond>;
def : GINodeEquiv<G_BSWAP, bswap>;
def : GINodeEquiv<G_BITREVERSE, bitreverse>;
def : GINodeEquiv<G_FSHL, fshl>;
@@ -167,6 +168,7 @@ def : GINodeEquiv<G_FMAXNUM_IEEE, fmaxnum_ieee>;
def : GINodeEquiv<G_FMAXIMUM, fmaximum>;
def : GINodeEquiv<G_FMINIMUM, fminimum>;
def : GINodeEquiv<G_READCYCLECOUNTER, readcyclecounter>;
+def : GINodeEquiv<G_READSTEADYCOUNTER, readsteadycounter>;
def : GINodeEquiv<G_ROTR, rotr>;
def : GINodeEquiv<G_ROTL, rotl>;
def : GINodeEquiv<G_LROUND, lround>;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 22360353790d..5f8bf0d44810 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -657,6 +657,9 @@ def prefetch : SDNode<"ISD::PREFETCH" , SDTPrefetch,
def readcyclecounter : SDNode<"ISD::READCYCLECOUNTER", SDTIntLeaf,
[SDNPHasChain, SDNPSideEffect]>;
+def readsteadycounter : SDNode<"ISD::READSTEADYCOUNTER", SDTIntLeaf,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def membarrier : SDNode<"ISD::MEMBARRIER", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index cce9d6db260d..ed9944bcef23 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -805,6 +805,12 @@ inline constexpr CpuInfo CpuInfos[] = {
{AArch64::AEK_FP16, AArch64::AEK_RAND, AArch64::AEK_SM4,
AArch64::AEK_SHA3, AArch64::AEK_SHA2, AArch64::AEK_AES,
AArch64::AEK_MTE, AArch64::AEK_SB, AArch64::AEK_SSBS}))},
+ {"ampere1b", ARMV8_7A,
+ (AArch64::ExtensionBitset({AArch64::AEK_FP16, AArch64::AEK_RAND,
+ AArch64::AEK_SM4, AArch64::AEK_SHA3,
+ AArch64::AEK_SHA2, AArch64::AEK_AES,
+ AArch64::AEK_MTE, AArch64::AEK_SB,
+ AArch64::AEK_SSBS, AArch64::AEK_CSSC}))},
};
// An alias for a CPU.
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h
index c42d66f048fc..ec3817134a5a 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h
@@ -258,6 +258,7 @@ uint64_t parseArchExt(StringRef ArchExt);
ArchKind parseCPUArch(StringRef CPU);
ProfileKind parseArchProfile(StringRef Arch);
unsigned parseArchVersion(StringRef Arch);
+unsigned parseArchMinorVersion(StringRef Arch);
void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index 6464285980f0..7da11701fefb 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -111,6 +111,14 @@ enum GPUKind : uint32_t {
GK_AMDGCN_FIRST = GK_GFX600,
GK_AMDGCN_LAST = GK_GFX1201,
+
+ GK_GFX9_GENERIC = 192,
+ GK_GFX10_1_GENERIC = 193,
+ GK_GFX10_3_GENERIC = 194,
+ GK_GFX11_GENERIC = 195,
+
+ GK_AMDGCN_GENERIC_FIRST = GK_GFX9_GENERIC,
+ GK_AMDGCN_GENERIC_LAST = GK_GFX11_GENERIC,
};
/// Instruction set architecture version.
@@ -147,6 +155,8 @@ enum ArchFeatureKind : uint32_t {
FEATURE_WGP = 1 << 9,
};
+StringRef getArchFamilyNameAMDGCN(GPUKind AK);
+
StringRef getArchNameAMDGCN(GPUKind AK);
StringRef getArchNameR600(GPUKind AK);
StringRef getCanonicalArchName(const Triple &T, StringRef Arch);
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 98d8490cc9f7..e73207021392 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -1035,10 +1035,7 @@ public:
/// True if the target supports both general-dynamic and TLSDESC, and TLSDESC
/// is enabled by default.
- bool hasDefaultTLSDESC() const {
- // TODO: Improve check for other platforms, like Android, and RISC-V
- return false;
- }
+ bool hasDefaultTLSDESC() const { return isAndroid() && isRISCV64(); }
/// Tests whether the target uses -data-sections as default.
bool hasDefaultDataSections() const {
diff --git a/llvm/include/llvm/TextAPI/FileTypes.h b/llvm/include/llvm/TextAPI/FileTypes.h
new file mode 100644
index 000000000000..5876e9d5a530
--- /dev/null
+++ b/llvm/include/llvm/TextAPI/FileTypes.h
@@ -0,0 +1,49 @@
+//===- llvm/TextAPI/FileTypes.h - TAPI Interface File -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TEXTAPI_FILETYPES_H
+#define LLVM_TEXTAPI_FILETYPES_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+namespace llvm::MachO {
+/// Defines the file type TextAPI files can represent.
+enum FileType : unsigned {
+ /// Invalid file type.
+ Invalid = 0U,
+
+ /// \brief MachO Dynamic Library file.
+ MachO_DynamicLibrary = 1U << 0,
+
+ /// \brief MachO Dynamic Library Stub file.
+ MachO_DynamicLibrary_Stub = 1U << 1,
+
+ /// \brief MachO Bundle file.
+ MachO_Bundle = 1U << 2,
+
+ /// Text-based stub file (.tbd) version 1.0
+ TBD_V1 = 1U << 3,
+
+ /// Text-based stub file (.tbd) version 2.0
+ TBD_V2 = 1U << 4,
+
+ /// Text-based stub file (.tbd) version 3.0
+ TBD_V3 = 1U << 5,
+
+ /// Text-based stub file (.tbd) version 4.0
+ TBD_V4 = 1U << 6,
+
+ /// Text-based stub file (.tbd) version 5.0
+ TBD_V5 = 1U << 7,
+
+ All = ~0U,
+
+ LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/All),
+};
+
+} // namespace llvm::MachO
+#endif // LLVM_TEXTAPI_FILETYPES_H
diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h
index 7984a8008771..10a37e3a0c2f 100644
--- a/llvm/include/llvm/TextAPI/InterfaceFile.h
+++ b/llvm/include/llvm/TextAPI/InterfaceFile.h
@@ -14,14 +14,15 @@
#ifndef LLVM_TEXTAPI_INTERFACEFILE_H
#define LLVM_TEXTAPI_INTERFACEFILE_H
-#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/Support/Allocator.h"
#include "llvm/TextAPI/ArchitectureSet.h"
+#include "llvm/TextAPI/FileTypes.h"
#include "llvm/TextAPI/PackedVersion.h"
#include "llvm/TextAPI/Platform.h"
+#include "llvm/TextAPI/RecordsSlice.h"
#include "llvm/TextAPI/Symbol.h"
#include "llvm/TextAPI/SymbolSet.h"
#include "llvm/TextAPI/Target.h"
@@ -47,44 +48,6 @@ enum class ObjCConstraintType : unsigned {
GC = 4,
};
-// clang-format off
-
-/// Defines the file type this file represents.
-enum FileType : unsigned {
- /// Invalid file type.
- Invalid = 0U,
-
- /// \brief MachO Dynamic Library file.
- MachO_DynamicLibrary = 1U << 0,
-
- /// \brief MachO Dynamic Library Stub file.
- MachO_DynamicLibrary_Stub = 1U << 1,
-
- /// \brief MachO Bundle file.
- MachO_Bundle = 1U << 2,
-
- /// Text-based stub file (.tbd) version 1.0
- TBD_V1 = 1U << 3,
-
- /// Text-based stub file (.tbd) version 2.0
- TBD_V2 = 1U << 4,
-
- /// Text-based stub file (.tbd) version 3.0
- TBD_V3 = 1U << 5,
-
- /// Text-based stub file (.tbd) version 4.0
- TBD_V4 = 1U << 6,
-
- /// Text-based stub file (.tbd) version 5.0
- TBD_V5 = 1U << 7,
-
- All = ~0U,
-
- LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/All),
-};
-
-// clang-format on
-
/// Reference to an interface file.
class InterfaceFileRef {
public:
@@ -437,6 +400,14 @@ public:
void inlineLibrary(std::shared_ptr<InterfaceFile> Library,
bool Overwrite = false);
+ /// Set InterfaceFile properties from pre-gathered binary attributes,
+ /// if they are not set already.
+ ///
+ /// \param BA Attributes typically represented in load commands.
+ /// \param Targ MachO Target slice to add attributes to.
+ void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA,
+ const Target &Targ);
+
/// The equality is determined by attributes that impact linking
/// compatibilities. Path, & FileKind are irrelevant since these by
/// itself should not impact linking.
diff --git a/llvm/include/llvm/TextAPI/RecordsSlice.h b/llvm/include/llvm/TextAPI/RecordsSlice.h
index f3db72121e68..5b214d0bfff5 100644
--- a/llvm/include/llvm/TextAPI/RecordsSlice.h
+++ b/llvm/include/llvm/TextAPI/RecordsSlice.h
@@ -15,7 +15,7 @@
#define LLVM_TEXTAPI_RECORDSLICE_H
#include "llvm/Support/Allocator.h"
-#include "llvm/TextAPI/InterfaceFile.h"
+#include "llvm/TextAPI/FileTypes.h"
#include "llvm/TextAPI/PackedVersion.h"
#include "llvm/TextAPI/Record.h"
#include "llvm/TextAPI/RecordVisitor.h"
@@ -191,6 +191,7 @@ private:
};
using Records = llvm::SmallVector<std::shared_ptr<RecordsSlice>, 4>;
+class InterfaceFile;
std::unique_ptr<InterfaceFile> convertToInterfaceFile(const Records &Slices);
} // namespace MachO
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h b/llvm/include/llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h
new file mode 100644
index 000000000000..785448e0dec4
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h
@@ -0,0 +1,29 @@
+//===- PGOForceFunctionAttrs.h - --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_PGOFORCEFUNCTIONATTRS_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_PGOFORCEFUNCTIONATTRS_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/PGOOptions.h"
+
+namespace llvm {
+
+struct PGOForceFunctionAttrsPass
+ : public PassInfoMixin<PGOForceFunctionAttrsPass> {
+ PGOForceFunctionAttrsPass(PGOOptions::ColdFuncOpt ColdType)
+ : ColdType(ColdType) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+ PGOOptions::ColdFuncOpt ColdType;
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_INSTRUMENTATION_PGOFORCEFUNCTIONATTRS_H
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
new file mode 100644
index 000000000000..61786227d7a3
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
@@ -0,0 +1,24 @@
+//===- JumpTableToSwitch.h - ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
+#define LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Function;
+
+struct JumpTableToSwitchPass : PassInfoMixin<JumpTableToSwitchPass> {
+ /// Run the pass over the function.
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_JUMP_TABLE_TO_SWITCH_H
diff --git a/llvm/include/llvm/Transforms/Scalar/LoopReroll.h b/llvm/include/llvm/Transforms/Scalar/LoopReroll.h
deleted file mode 100644
index 496e8df85ea0..000000000000
--- a/llvm/include/llvm/Transforms/Scalar/LoopReroll.h
+++ /dev/null
@@ -1,25 +0,0 @@
-//===- LoopReroll.h - Loop rerolling pass ---------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H
-#define LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H
-
-#include "llvm/IR/PassManager.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-
-namespace llvm {
-
-class LoopRerollPass : public PassInfoMixin<LoopRerollPass> {
-public:
- PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &U);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_SCALAR_LOOPREROLL_H
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 19c4393add6a..8dfc4b2a1cb1 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -188,6 +188,12 @@ static bool isObjectSize(const Value *V, TypeSize Size, const DataLayout &DL,
return ObjectSize && *ObjectSize == Size;
}
+/// Return true if both V1 and V2 are VScale
+static bool areBothVScale(const Value *V1, const Value *V2) {
+ return PatternMatch::match(V1, PatternMatch::m_VScale()) &&
+ PatternMatch::match(V2, PatternMatch::m_VScale());
+}
+
//===----------------------------------------------------------------------===//
// CaptureInfo implementations
//===----------------------------------------------------------------------===//
@@ -679,7 +685,8 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
// A[x][x] -> x*16 + x*4 -> x*20
// This also ensures that 'x' only appears in the index list once.
for (unsigned i = 0, e = Decomposed.VarIndices.size(); i != e; ++i) {
- if (Decomposed.VarIndices[i].Val.V == LE.Val.V &&
+ if ((Decomposed.VarIndices[i].Val.V == LE.Val.V ||
+ areBothVScale(Decomposed.VarIndices[i].Val.V, LE.Val.V)) &&
Decomposed.VarIndices[i].Val.hasSameCastsAs(LE.Val)) {
Scale += Decomposed.VarIndices[i].Scale;
LE.IsNSW = false; // We cannot guarantee nsw for the merge.
@@ -1170,6 +1177,37 @@ AliasResult BasicAAResult::aliasGEP(
}
}
+ // VScale Alias Analysis - Given one scalable offset between accesses and a
+ // scalable typesize, we can divide each side by vscale, treating both values
+ // as a constant. We prove that Offset/vscale >= TypeSize/vscale.
+ if (DecompGEP1.VarIndices.size() == 1 &&
+ DecompGEP1.VarIndices[0].Val.TruncBits == 0 &&
+ DecompGEP1.Offset.isZero() &&
+ PatternMatch::match(DecompGEP1.VarIndices[0].Val.V,
+ PatternMatch::m_VScale())) {
+ const VariableGEPIndex &ScalableVar = DecompGEP1.VarIndices[0];
+ APInt Scale =
+ ScalableVar.IsNegated ? -ScalableVar.Scale : ScalableVar.Scale;
+ LocationSize VLeftSize = Scale.isNegative() ? V1Size : V2Size;
+
+ // Check if the offset is known to not overflow, if it does then attempt to
+ // prove it with the known values of vscale_range.
+ bool Overflows = !DecompGEP1.VarIndices[0].IsNSW;
+ if (Overflows) {
+ ConstantRange CR = getVScaleRange(&F, Scale.getBitWidth());
+ (void)CR.getSignedMax().smul_ov(Scale, Overflows);
+ }
+
+ if (!Overflows) {
+ // Note that we do not check that the typesize is scalable, as vscale >= 1
+ // so noalias still holds so long as the dependency distance is at least
+ // as big as the typesize.
+ if (VLeftSize.hasValue() &&
+ Scale.abs().uge(VLeftSize.getValue().getKnownMinValue()))
+ return AliasResult::NoAlias;
+ }
+ }
+
// Bail on analysing scalable LocationSize
if (V1Size.isScalable() || V2Size.isScalable())
return AliasResult::MayAlias;
@@ -1761,7 +1799,8 @@ void BasicAAResult::subtractDecomposedGEPs(DecomposedGEP &DestGEP,
bool Found = false;
for (auto I : enumerate(DestGEP.VarIndices)) {
VariableGEPIndex &Dest = I.value();
- if (!isValueEqualInPotentialCycles(Dest.Val.V, Src.Val.V, AAQI) ||
+ if ((!isValueEqualInPotentialCycles(Dest.Val.V, Src.Val.V, AAQI) &&
+ !areBothVScale(Dest.Val.V, Src.Val.V)) ||
!Dest.Val.hasSameCastsAs(Src.Val))
continue;
diff --git a/llvm/lib/Analysis/DomConditionCache.cpp b/llvm/lib/Analysis/DomConditionCache.cpp
index c7f4cab41588..c07a8a76f111 100644
--- a/llvm/lib/Analysis/DomConditionCache.cpp
+++ b/llvm/lib/Analysis/DomConditionCache.cpp
@@ -34,23 +34,44 @@ static void findAffectedValues(Value *Cond,
}
};
- ICmpInst::Predicate Pred;
- Value *A;
- if (match(Cond, m_ICmp(Pred, m_Value(A), m_Constant()))) {
- AddAffected(A);
+ bool TopLevelIsAnd = match(Cond, m_LogicalAnd());
+ SmallVector<Value *, 8> Worklist;
+ SmallPtrSet<Value *, 8> Visited;
+ Worklist.push_back(Cond);
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ if (!Visited.insert(V).second)
+ continue;
- if (ICmpInst::isEquality(Pred)) {
- Value *X;
- // (X & C) or (X | C) or (X ^ C).
- // (X << C) or (X >>_s C) or (X >>_u C).
- if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) ||
- match(A, m_Shift(m_Value(X), m_ConstantInt())))
- AddAffected(X);
- } else {
- Value *X;
- // Handle (A + C1) u< C2, which is the canonical form of A > C3 && A < C4.
- if (match(A, m_Add(m_Value(X), m_ConstantInt())))
- AddAffected(X);
+ CmpInst::Predicate Pred;
+ Value *A, *B;
+ // Only recurse into and/or if it matches the top-level and/or type.
+ if (TopLevelIsAnd ? match(V, m_LogicalAnd(m_Value(A), m_Value(B)))
+ : match(V, m_LogicalOr(m_Value(A), m_Value(B)))) {
+ Worklist.push_back(A);
+ Worklist.push_back(B);
+ } else if (match(V, m_ICmp(Pred, m_Value(A), m_Constant()))) {
+ AddAffected(A);
+
+ if (ICmpInst::isEquality(Pred)) {
+ Value *X;
+ // (X & C) or (X | C) or (X ^ C).
+ // (X << C) or (X >>_s C) or (X >>_u C).
+ if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) ||
+ match(A, m_Shift(m_Value(X), m_ConstantInt())))
+ AddAffected(X);
+ } else {
+ Value *X;
+ // Handle (A + C1) u< C2, which is the canonical form of
+ // A > C3 && A < C4.
+ if (match(A, m_Add(m_Value(X), m_ConstantInt())))
+ AddAffected(X);
+ }
+ } else if (match(Cond, m_CombineOr(m_FCmp(Pred, m_Value(A), m_Constant()),
+ m_Intrinsic<Intrinsic::is_fpclass>(
+ m_Value(A), m_Constant())))) {
+ // Handle patterns that computeKnownFPClass() support.
+ AddAffected(A);
}
}
}
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 5b780b520747..e55eaa55f8e9 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -336,8 +336,8 @@ protected:
/// Called at the end of processing a switch instruction, with the given
/// number of case clusters.
- virtual void onFinalizeSwitch(unsigned JumpTableSize,
- unsigned NumCaseCluster) {}
+ virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
+ bool DefaultDestUndefined) {}
/// Called to account for any other instruction not specifically accounted
/// for.
@@ -699,15 +699,16 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
CallPenalty));
}
- void onFinalizeSwitch(unsigned JumpTableSize,
- unsigned NumCaseCluster) override {
+ void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
+ bool DefaultDestUndefined) override {
+ if (!DefaultDestUndefined)
+ addCost(2 * InstrCost);
// If suitable for a jump table, consider the cost for the table size and
// branch to destination.
// Maximum valid cost increased in this function.
if (JumpTableSize) {
int64_t JTCost =
static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
-
addCost(JTCost);
return;
}
@@ -1153,6 +1154,7 @@ private:
// heuristics in the ML inliner.
static constexpr int JTCostMultiplier = 4;
static constexpr int CaseClusterCostMultiplier = 2;
+ static constexpr int SwitchDefaultDestCostMultiplier = 2;
static constexpr int SwitchCostMultiplier = 2;
// FIXME: These are taken from the heuristic-based cost visitor: we should
@@ -1231,8 +1233,11 @@ private:
}
}
- void onFinalizeSwitch(unsigned JumpTableSize,
- unsigned NumCaseCluster) override {
+ void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
+ bool DefaultDestUndefined) override {
+ if (!DefaultDestUndefined)
+ increment(InlineCostFeatureIndex::switch_default_dest_penalty,
+ SwitchDefaultDestCostMultiplier * InstrCost);
if (JumpTableSize) {
int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
@@ -2461,7 +2466,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
unsigned NumCaseCluster =
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
- onFinalizeSwitch(JumpTableSize, NumCaseCluster);
+ onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
return false;
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 01b017142cfc..51e258d69e9e 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1853,35 +1853,36 @@ static Value *simplifyAndOrOfFCmps(const SimplifyQuery &Q, FCmpInst *LHS,
return nullptr;
FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
- (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
- // (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y
- // (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X
- // (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y
- // (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X
- // (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y
- // (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X
- // (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y
- // (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X
+ if ((PredL == FCmpInst::FCMP_ORD || PredL == FCmpInst::FCMP_UNO) &&
+ ((FCmpInst::isOrdered(PredR) && IsAnd) ||
+ (FCmpInst::isUnordered(PredR) && !IsAnd))) {
+ // (fcmp ord X, NNAN) & (fcmp o** X, Y) --> fcmp o** X, Y
+ // (fcmp uno X, NNAN) & (fcmp o** X, Y) --> false
+ // (fcmp uno X, NNAN) | (fcmp u** X, Y) --> fcmp u** X, Y
+ // (fcmp ord X, NNAN) | (fcmp u** X, Y) --> true
if (((LHS1 == RHS0 || LHS1 == RHS1) &&
isKnownNeverNaN(LHS0, /*Depth=*/0, Q)) ||
((LHS0 == RHS0 || LHS0 == RHS1) &&
isKnownNeverNaN(LHS1, /*Depth=*/0, Q)))
- return RHS;
-
- // (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y
- // (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X
- // (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y
- // (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X
- // (fcmp uno X, Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y
- // (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X
- // (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y
- // (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X
+ return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR)
+ ? static_cast<Value *>(RHS)
+ : ConstantInt::getBool(LHS->getType(), !IsAnd);
+ }
+
+ if ((PredR == FCmpInst::FCMP_ORD || PredR == FCmpInst::FCMP_UNO) &&
+ ((FCmpInst::isOrdered(PredL) && IsAnd) ||
+ (FCmpInst::isUnordered(PredL) && !IsAnd))) {
+ // (fcmp o** X, Y) & (fcmp ord X, NNAN) --> fcmp o** X, Y
+ // (fcmp o** X, Y) & (fcmp uno X, NNAN) --> false
+ // (fcmp u** X, Y) | (fcmp uno X, NNAN) --> fcmp u** X, Y
+ // (fcmp u** X, Y) | (fcmp ord X, NNAN) --> true
if (((RHS1 == LHS0 || RHS1 == LHS1) &&
isKnownNeverNaN(RHS0, /*Depth=*/0, Q)) ||
((RHS0 == LHS0 || RHS0 == LHS1) &&
isKnownNeverNaN(RHS1, /*Depth=*/0, Q)))
- return LHS;
+ return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR)
+ ? static_cast<Value *>(LHS)
+ : ConstantInt::getBool(LHS->getType(), !IsAnd);
}
return nullptr;
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index e4dc1a867f6f..d05f42552e81 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -817,3 +817,18 @@ MDNode *AAMDNodes::extendToTBAA(MDNode *MD, ssize_t Len) {
ConstantAsMetadata::get(ConstantInt::get(PreviousSize->getType(), Len));
return MDNode::get(MD->getContext(), NextNodes);
}
+
+AAMDNodes AAMDNodes::adjustForAccess(unsigned AccessSize) {
+ AAMDNodes New = *this;
+ MDNode *M = New.TBAAStruct;
+ New.TBAAStruct = nullptr;
+ if (M && M->getNumOperands() == 3 && M->getOperand(0) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
+ M->getOperand(1) && mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+ AccessSize &&
+ M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+ New.TBAA = cast<MDNode>(M->getOperand(2));
+ return New;
+}
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 58db81f47013..92c9162a1f8f 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -706,28 +706,40 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred,
}
}
+static void computeKnownBitsFromCond(const Value *V, Value *Cond,
+ KnownBits &Known, unsigned Depth,
+ const SimplifyQuery &SQ, bool Invert) {
+ Value *A, *B;
+ if (Depth < MaxAnalysisRecursionDepth &&
+ (Invert ? match(Cond, m_LogicalOr(m_Value(A), m_Value(B)))
+ : match(Cond, m_LogicalAnd(m_Value(A), m_Value(B))))) {
+ computeKnownBitsFromCond(V, A, Known, Depth + 1, SQ, Invert);
+ computeKnownBitsFromCond(V, B, Known, Depth + 1, SQ, Invert);
+ }
+
+ if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
+ computeKnownBitsFromCmp(
+ V, Invert ? Cmp->getInversePredicate() : Cmp->getPredicate(),
+ Cmp->getOperand(0), Cmp->getOperand(1), Known, SQ);
+}
+
void llvm::computeKnownBitsFromContext(const Value *V, KnownBits &Known,
- unsigned Depth, const SimplifyQuery &Q) {
+ unsigned Depth, const SimplifyQuery &Q) {
if (!Q.CxtI)
return;
if (Q.DC && Q.DT) {
// Handle dominating conditions.
for (BranchInst *BI : Q.DC->conditionsFor(V)) {
- auto *Cmp = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cmp)
- continue;
-
BasicBlockEdge Edge0(BI->getParent(), BI->getSuccessor(0));
if (Q.DT->dominates(Edge0, Q.CxtI->getParent()))
- computeKnownBitsFromCmp(V, Cmp->getPredicate(), Cmp->getOperand(0),
- Cmp->getOperand(1), Known, Q);
+ computeKnownBitsFromCond(V, BI->getCondition(), Known, Depth, Q,
+ /*Invert*/ false);
BasicBlockEdge Edge1(BI->getParent(), BI->getSuccessor(1));
if (Q.DT->dominates(Edge1, Q.CxtI->getParent()))
- computeKnownBitsFromCmp(V, Cmp->getInversePredicate(),
- Cmp->getOperand(0), Cmp->getOperand(1), Known,
- Q);
+ computeKnownBitsFromCond(V, BI->getCondition(), Known, Depth, Q,
+ /*Invert*/ true);
}
if (Known.hasConflict())
@@ -4213,9 +4225,53 @@ llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
return fcmpImpliesClass(Pred, F, LHS, *ConstRHS, LookThroughSrc);
}
-static FPClassTest computeKnownFPClassFromAssumes(const Value *V,
- const SimplifyQuery &Q) {
- FPClassTest KnownFromAssume = fcAllFlags;
+static void computeKnownFPClassFromCond(const Value *V, Value *Cond,
+ bool CondIsTrue,
+ const Instruction *CxtI,
+ KnownFPClass &KnownFromContext) {
+ CmpInst::Predicate Pred;
+ Value *LHS;
+ uint64_t ClassVal = 0;
+ const APFloat *CRHS;
+ // TODO: handle sign-bit check idiom
+ if (match(Cond, m_FCmp(Pred, m_Value(LHS), m_APFloat(CRHS)))) {
+ auto [CmpVal, MaskIfTrue, MaskIfFalse] = fcmpImpliesClass(
+ Pred, *CxtI->getParent()->getParent(), LHS, *CRHS, LHS != V);
+ if (CmpVal == V)
+ KnownFromContext.knownNot(~(CondIsTrue ? MaskIfTrue : MaskIfFalse));
+ } else if (match(Cond, m_Intrinsic<Intrinsic::is_fpclass>(
+ m_Value(LHS), m_ConstantInt(ClassVal)))) {
+ FPClassTest Mask = static_cast<FPClassTest>(ClassVal);
+ KnownFromContext.knownNot(CondIsTrue ? ~Mask : Mask);
+ }
+}
+
+static KnownFPClass computeKnownFPClassFromContext(const Value *V,
+ const SimplifyQuery &Q) {
+ KnownFPClass KnownFromContext;
+
+ if (!Q.CxtI)
+ return KnownFromContext;
+
+ if (Q.DC && Q.DT) {
+ // Handle dominating conditions.
+ for (BranchInst *BI : Q.DC->conditionsFor(V)) {
+ Value *Cond = BI->getCondition();
+
+ BasicBlockEdge Edge0(BI->getParent(), BI->getSuccessor(0));
+ if (Q.DT->dominates(Edge0, Q.CxtI->getParent()))
+ computeKnownFPClassFromCond(V, Cond, /*CondIsTrue=*/true, Q.CxtI,
+ KnownFromContext);
+
+ BasicBlockEdge Edge1(BI->getParent(), BI->getSuccessor(1));
+ if (Q.DT->dominates(Edge1, Q.CxtI->getParent()))
+ computeKnownFPClassFromCond(V, Cond, /*CondIsTrue=*/false, Q.CxtI,
+ KnownFromContext);
+ }
+ }
+
+ if (!Q.AC)
+ return KnownFromContext;
// Try to restrict the floating-point classes based on information from
// assumptions.
@@ -4223,9 +4279,8 @@ static FPClassTest computeKnownFPClassFromAssumes(const Value *V,
if (!AssumeVH)
continue;
CallInst *I = cast<CallInst>(AssumeVH);
- const Function *F = I->getFunction();
- assert(F == Q.CxtI->getParent()->getParent() &&
+ assert(I->getFunction() == Q.CxtI->getParent()->getParent() &&
"Got assumption for the wrong function!");
assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume &&
"must be an assume intrinsic");
@@ -4233,25 +4288,11 @@ static FPClassTest computeKnownFPClassFromAssumes(const Value *V,
if (!isValidAssumeForContext(I, Q.CxtI, Q.DT))
continue;
- CmpInst::Predicate Pred;
- Value *LHS, *RHS;
- uint64_t ClassVal = 0;
- if (match(I->getArgOperand(0), m_FCmp(Pred, m_Value(LHS), m_Value(RHS)))) {
- const APFloat *CRHS;
- if (match(RHS, m_APFloat(CRHS))) {
- auto [CmpVal, MaskIfTrue, MaskIfFalse] =
- fcmpImpliesClass(Pred, *F, LHS, *CRHS, LHS != V);
- if (CmpVal == V)
- KnownFromAssume &= MaskIfTrue;
- }
- } else if (match(I->getArgOperand(0),
- m_Intrinsic<Intrinsic::is_fpclass>(
- m_Value(LHS), m_ConstantInt(ClassVal)))) {
- KnownFromAssume &= static_cast<FPClassTest>(ClassVal);
- }
+ computeKnownFPClassFromCond(V, I->getArgOperand(0), /*CondIsTrue=*/true,
+ Q.CxtI, KnownFromContext);
}
- return KnownFromAssume;
+ return KnownFromContext;
}
void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
@@ -4359,10 +4400,8 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
KnownNotFromFlags |= fcInf;
}
- if (Q.AC) {
- FPClassTest AssumedClasses = computeKnownFPClassFromAssumes(V, Q);
- KnownNotFromFlags |= ~AssumedClasses;
- }
+ KnownFPClass AssumedClasses = computeKnownFPClassFromContext(V, Q);
+ KnownNotFromFlags |= ~AssumedClasses.KnownFPClasses;
// We no longer need to find out about these bits from inputs if we can
// assume this from flags/attributes.
@@ -4370,6 +4409,12 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
auto ClearClassesFromFlags = make_scope_exit([=, &Known] {
Known.knownNot(KnownNotFromFlags);
+ if (!Known.SignBit && AssumedClasses.SignBit) {
+ if (*AssumedClasses.SignBit)
+ Known.signBitMustBeOne();
+ else
+ Known.signBitMustBeZero();
+ }
});
if (!Op)
@@ -5271,7 +5316,8 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
bool First = true;
- for (Value *IncValue : P->incoming_values()) {
+ for (const Use &U : P->operands()) {
+ Value *IncValue = U.get();
// Skip direct self references.
if (IncValue == P)
continue;
@@ -5280,8 +5326,10 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
// Recurse, but cap the recursion to two levels, because we don't want
// to waste time spinning around in loops. We need at least depth 2 to
// detect known sign bits.
- computeKnownFPClass(IncValue, DemandedElts, InterestedClasses, KnownSrc,
- PhiRecursionLimit, Q);
+ computeKnownFPClass(
+ IncValue, DemandedElts, InterestedClasses, KnownSrc,
+ PhiRecursionLimit,
+ Q.getWithInstruction(P->getIncomingBlock(U)->getTerminator()));
if (First) {
Known = KnownSrc;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index b961fc2c338a..e89a1c26c23e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2127,7 +2127,7 @@ void AsmPrinter::emitGlobalGOTEquivs() {
emitGlobalVariable(GV);
}
-void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) {
+void AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) {
MCSymbol *Name = getSymbol(&GA);
bool IsFunction = GA.getValueType()->isFunctionTy();
// Treat bitcasts of functions as functions also. This is important at least
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 5383b15c1c7f..32a25b49b4e4 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2522,8 +2522,40 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
return false;
}
+static bool isIntrinsicOrLFToBeTailCalled(const TargetLibraryInfo *TLInfo,
+ const CallInst *CI) {
+ assert(CI && CI->use_empty());
+
+ if (const auto *II = dyn_cast<IntrinsicInst>(CI))
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ return true;
+ default:
+ return false;
+ }
+
+ LibFunc LF;
+ Function *Callee = CI->getCalledFunction();
+ if (Callee && TLInfo && TLInfo->getLibFunc(*Callee, LF))
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ return true;
+ default:
+ return false;
+ }
+
+ return false;
+}
+
/// Look for opportunities to duplicate return instructions to the predecessor
-/// to enable tail call optimizations. The case it is currently looking for is:
+/// to enable tail call optimizations. The case it is currently looking for is
+/// the following one. Known intrinsics or library function that may be tail
+/// called are taken into account as well.
/// @code
/// bb0:
/// %tmp0 = tail call i32 @f0()
@@ -2580,8 +2612,6 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
}
PN = dyn_cast<PHINode>(V);
- if (!PN)
- return false;
}
if (PN && PN->getParent() != BB)
@@ -2620,8 +2650,30 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
// Make sure the phi value is indeed produced by the tail call.
if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
TLI->mayBeEmittedAsTailCall(CI) &&
- attributesPermitTailCall(F, CI, RetI, *TLI))
+ attributesPermitTailCall(F, CI, RetI, *TLI)) {
TailCallBBs.push_back(PredBB);
+ } else {
+ // Consider the cases in which the phi value is indirectly produced by
+ // the tail call, for example when encountering memset(), memmove(),
+ // strcpy(), whose return value may have been optimized out. In such
+ // cases, the value needs to be the first function argument.
+ //
+ // bb0:
+ // tail call void @llvm.memset.p0.i64(ptr %0, i8 0, i64 %1)
+ // br label %return
+ // return:
+ // %phi = phi ptr [ %0, %bb0 ], [ %2, %entry ]
+ if (PredBB && PredBB->getSingleSuccessor() == BB)
+ CI = dyn_cast_or_null<CallInst>(
+ PredBB->getTerminator()->getPrevNonDebugInstruction(true));
+
+ if (CI && CI->use_empty() &&
+ isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
+ IncomingVal == CI->getArgOperand(0) &&
+ TLI->mayBeEmittedAsTailCall(CI) &&
+ attributesPermitTailCall(F, CI, RetI, *TLI))
+ TailCallBBs.push_back(PredBB);
+ }
}
} else {
SmallPtrSet<BasicBlock *, 4> VisitedBBs;
@@ -2631,8 +2683,14 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
if (Instruction *I = Pred->rbegin()->getPrevNonDebugInstruction(true)) {
CallInst *CI = dyn_cast<CallInst>(I);
if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
- attributesPermitTailCall(F, CI, RetI, *TLI))
- TailCallBBs.push_back(Pred);
+ attributesPermitTailCall(F, CI, RetI, *TLI)) {
+ // Either we return void or the return value must be the first
+ // argument of a known intrinsic or library function.
+ if (!V || (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
+ V == CI->getArgOperand(0))) {
+ TailCallBBs.push_back(Pred);
+ }
+ }
}
}
}
@@ -8455,7 +8513,8 @@ bool CodeGenPrepare::fixupDPValuesOnInst(Instruction &I) {
// FIXME: should updating debug-info really cause the "changed" flag to fire,
// which can cause a function to be reprocessed?
bool CodeGenPrepare::fixupDPValue(DPValue &DPV) {
- if (DPV.Type != DPValue::LocationType::Value)
+ if (DPV.Type != DPValue::LocationType::Value &&
+ DPV.Type != DPValue::LocationType::Assign)
return false;
// Does this DPValue refer to a sunk address calculation?
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index dd38317c26bf..311dd9d9739a 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1885,6 +1885,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
return TargetOpcode::G_INTRINSIC_TRUNC;
case Intrinsic::readcyclecounter:
return TargetOpcode::G_READCYCLECOUNTER;
+ case Intrinsic::readsteadycounter:
+ return TargetOpcode::G_READSTEADYCOUNTER;
case Intrinsic::ptrmask:
return TargetOpcode::G_PTRMASK;
case Intrinsic::lrint:
@@ -2120,6 +2122,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
ListSize, Alignment));
return true;
}
+ case Intrinsic::dbg_assign:
+ // A dbg.assign is a dbg.value with more information about stack locations,
+ // typically produced during optimisation of variables with leaked
+ // addresses. We can treat it like a normal dbg_value intrinsic here; to
+ // benefit from the full analysis of stack/SSA locations, GlobalISel would
+ // need to register for and use the AssignmentTrackingAnalysis pass.
+ LLVM_FALLTHROUGH;
case Intrinsic::dbg_value: {
// This form of DBG_VALUE is target-independent.
const DbgValueInst &DI = cast<DbgValueInst>(CI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 464ff0864d14..e39fdae1ccbe 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1716,8 +1716,7 @@ Register LegalizerHelper::coerceToScalar(Register Val) {
Register NewVal = Val;
assert(Ty.isVector());
- LLT EltTy = Ty.getElementType();
- if (EltTy.isPointer())
+ if (Ty.isPointerVector())
NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
}
@@ -7964,7 +7963,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
MI.getFirst4RegLLTs();
- bool IsEltPtr = DstTy.getScalarType().isPointer();
+ bool IsEltPtr = DstTy.isPointerOrPointerVector();
if (IsEltPtr) {
LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index a5827c26c04f..d58b62871817 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -199,7 +199,7 @@ void MachineIRBuilder::validateShiftOp(const LLT Res, const LLT Op0,
MachineInstrBuilder
MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0,
const SrcOp &Op1, std::optional<unsigned> Flags) {
- assert(Res.getLLTTy(*getMRI()).getScalarType().isPointer() &&
+ assert(Res.getLLTTy(*getMRI()).isPointerOrPointerVector() &&
Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch");
assert(Op1.getLLTTy(*getMRI()).getScalarType().isScalar() && "invalid offset type");
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 61920a0e04ab..fe450cba4a33 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -312,6 +312,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));
break;
}
+ case Intrinsic::readsteadycounter: {
+ errs() << "WARNING: this target does not support the llvm.readsteadycounter"
+ << " intrinsic. It is being lowered to a constant 0\n";
+ CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));
+ break;
+ }
case Intrinsic::dbg_declare:
case Intrinsic::dbg_label:
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index c65e91741533..2632b5b9feac 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1288,10 +1288,10 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (!DstTy.isValid() || !PtrTy.isValid() || !OffsetTy.isValid())
break;
- if (!PtrTy.getScalarType().isPointer())
+ if (!PtrTy.isPointerOrPointerVector())
report("gep first operand must be a pointer", MI);
- if (OffsetTy.getScalarType().isPointer())
+ if (OffsetTy.isPointerOrPointerVector())
report("gep offset operand must not be a pointer", MI);
// TODO: Is the offset allowed to be a scalar with a vector?
@@ -1304,7 +1304,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (!DstTy.isValid() || !SrcTy.isValid() || !MaskTy.isValid())
break;
- if (!DstTy.getScalarType().isPointer())
+ if (!DstTy.isPointerOrPointerVector())
report("ptrmask result type must be a pointer", MI);
if (!MaskTy.getScalarType().isScalar())
@@ -1330,15 +1330,13 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (!DstTy.isValid() || !SrcTy.isValid())
break;
- LLT DstElTy = DstTy.getScalarType();
- LLT SrcElTy = SrcTy.getScalarType();
- if (DstElTy.isPointer() || SrcElTy.isPointer())
+ if (DstTy.isPointerOrPointerVector() || SrcTy.isPointerOrPointerVector())
report("Generic extend/truncate can not operate on pointers", MI);
verifyVectorElementMatch(DstTy, SrcTy, MI);
- unsigned DstSize = DstElTy.getSizeInBits();
- unsigned SrcSize = SrcElTy.getSizeInBits();
+ unsigned DstSize = DstTy.getScalarSizeInBits();
+ unsigned SrcSize = SrcTy.getScalarSizeInBits();
switch (MI->getOpcode()) {
default:
if (DstSize <= SrcSize)
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index cbb1a74049fb..7e9c992031f8 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -236,7 +236,8 @@ namespace {
/// was successfully coalesced away. If it is not currently possible to
/// coalesce this interval, but it may be possible if other things get
/// coalesced, then it returns true by reference in 'Again'.
- bool joinCopy(MachineInstr *CopyMI, bool &Again);
+ bool joinCopy(MachineInstr *CopyMI, bool &Again,
+ SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs);
/// Attempt to join these two intervals. On failure, this
/// returns false. The output "SrcInt" will not have been modified, so we
@@ -1964,7 +1965,9 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
LIS->shrinkToUses(&LI);
}
-bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
+bool RegisterCoalescer::joinCopy(
+ MachineInstr *CopyMI, bool &Again,
+ SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) {
Again = false;
LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
@@ -2156,7 +2159,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// CopyMI has been erased by joinIntervals at this point. Remove it from
// ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
// to the work list. This keeps ErasedInstrs from growing needlessly.
- ErasedInstrs.erase(CopyMI);
+ if (ErasedInstrs.erase(CopyMI))
+ // But we may encounter the instruction again in this iteration.
+ CurrentErasedInstrs.insert(CopyMI);
// Rewrite all SrcReg operands to DstReg.
// Also update DstReg operands to include DstIdx if it is set.
@@ -3982,21 +3987,33 @@ void RegisterCoalescer::lateLiveIntervalUpdate() {
bool RegisterCoalescer::
copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
bool Progress = false;
+ SmallPtrSet<MachineInstr *, 4> CurrentErasedInstrs;
for (MachineInstr *&MI : CurrList) {
if (!MI)
continue;
// Skip instruction pointers that have already been erased, for example by
// dead code elimination.
- if (ErasedInstrs.count(MI)) {
+ if (ErasedInstrs.count(MI) || CurrentErasedInstrs.count(MI)) {
MI = nullptr;
continue;
}
bool Again = false;
- bool Success = joinCopy(MI, Again);
+ bool Success = joinCopy(MI, Again, CurrentErasedInstrs);
Progress |= Success;
if (Success || !Again)
MI = nullptr;
}
+ // Clear instructions not recorded in `ErasedInstrs` but erased.
+ if (!CurrentErasedInstrs.empty()) {
+ for (MachineInstr *&MI : CurrList) {
+ if (MI && CurrentErasedInstrs.count(MI))
+ MI = nullptr;
+ }
+ for (MachineInstr *&MI : WorkList) {
+ if (MI && CurrentErasedInstrs.count(MI))
+ MI = nullptr;
+ }
+ }
return Progress;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4adea020011b..52011e593f2e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9252,7 +9252,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
// Transfer chain users from old loads to the new load.
for (LoadSDNode *L : Loads)
- DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
+ DAG.makeEquivalentMemoryOrdering(L, NewLoad);
if (!NeedsBswap)
return NewLoad;
@@ -12739,12 +12739,12 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
/// Vector extends are not folded if operations are legal; this is to
/// avoid introducing illegal build_vector dag nodes.
-static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
+static SDValue tryToFoldExtendOfConstant(SDNode *N, const SDLoc &DL,
+ const TargetLowering &TLI,
SelectionDAG &DAG, bool LegalTypes) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- SDLoc DL(N);
assert((ISD::isExtOpcode(Opcode) || ISD::isExtVecInRegOpcode(Opcode)) &&
"Expected EXTEND dag node in input!");
@@ -13400,7 +13400,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
- if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes))
return Res;
// fold (sext (sext x)) -> (sext x)
@@ -13669,7 +13669,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
- if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes))
return Res;
// fold (zext (zext x)) -> (zext x)
@@ -13937,12 +13937,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
// aext(undef) = undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
- if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes))
return Res;
// fold (aext (aext x)) -> (aext x)
@@ -13951,7 +13952,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)
- return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
// fold (aext (aext_extend_vector_inreg x)) -> (aext_extend_vector_inreg x)
// fold (aext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
@@ -13959,7 +13960,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
- return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
// fold (aext (truncate (load x))) -> (aext (smaller load x))
// fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
@@ -13977,7 +13978,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// fold (aext (truncate x))
if (N0.getOpcode() == ISD::TRUNCATE)
- return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
+ return DAG.getAnyExtOrTrunc(N0.getOperand(0), DL, VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
@@ -13985,7 +13986,6 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
!TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType())) {
- SDLoc DL(N);
SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1));
assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!");
@@ -14011,9 +14011,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
- LN0->getChain(), LN0->getBasePtr(),
- N0.getValueType(), LN0->getMemOperand());
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceTrunc = N0.hasOneUse();
@@ -14039,9 +14039,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
ISD::LoadExtType ExtType = LN0->getExtensionType();
EVT MemVT = LN0->getMemoryVT();
if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
- SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
- VT, LN0->getChain(), LN0->getBasePtr(),
- MemVT, LN0->getMemOperand());
+ SDValue ExtLoad =
+ DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), LN0->getBasePtr(),
+ MemVT, LN0->getMemOperand());
CombineTo(N, ExtLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
recursivelyDeleteUnusedNodes(LN0);
@@ -14069,23 +14069,20 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == N00VT.getSizeInBits())
- return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getSetCC(DL, VT, N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/any extend
EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
- SDValue VsetCC =
- DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
+ SDValue VsetCC = DAG.getSetCC(
+ DL, MatchingVectorType, N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getAnyExtOrTrunc(VsetCC, DL, VT);
}
// aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
- SDLoc DL(N);
if (SDValue SCC = SimplifySelectCC(
DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT),
@@ -14637,10 +14634,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return SDValue();
}
-static SDValue
-foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI,
- SelectionDAG &DAG,
- bool LegalOperations) {
+static SDValue foldExtendVectorInregToExtendOfSubvector(
+ SDNode *N, const SDLoc &DL, const TargetLowering &TLI, SelectionDAG &DAG,
+ bool LegalOperations) {
unsigned InregOpcode = N->getOpcode();
unsigned Opcode = DAG.getOpcode_EXTEND(InregOpcode);
@@ -14667,28 +14663,29 @@ foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI,
if (LegalOperations && !TLI.isOperationLegal(Opcode, VT))
return SDValue();
- return DAG.getNode(Opcode, SDLoc(N), VT, Src);
+ return DAG.getNode(Opcode, DL, VT, Src);
}
SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
if (N0.isUndef()) {
// aext_vector_inreg(undef) = undef because the top bits are undefined.
// {s/z}ext_vector_inreg(undef) = 0 because the top bits must be the same.
return N->getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG
? DAG.getUNDEF(VT)
- : DAG.getConstant(0, SDLoc(N), VT);
+ : DAG.getConstant(0, DL, VT);
}
- if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ if (SDValue Res = tryToFoldExtendOfConstant(N, DL, TLI, DAG, LegalTypes))
return Res;
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
- if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, TLI, DAG,
+ if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, DL, TLI, DAG,
LegalOperations))
return R;
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 4df79f474e8d..f8756527da87 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1197,7 +1197,8 @@ void FastISel::handleDbgInfo(const Instruction *II) {
V = DPV.getVariableLocationOp(0);
bool Res = false;
- if (DPV.getType() == DPValue::LocationType::Value) {
+ if (DPV.getType() == DPValue::LocationType::Value ||
+ DPV.getType() == DPValue::LocationType::Assign) {
Res = lowerDbgValue(V, DPV.getExpression(), DPV.getVariable(),
DPV.getDebugLoc());
} else {
@@ -1393,6 +1394,13 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
return true;
}
+ case Intrinsic::dbg_assign:
+ // A dbg.assign is a dbg.value with more information, typically produced
+ // during optimisation. If one reaches fastisel then something odd has
+ // happened (such as an optimised function being always-inlined into an
+ // optnone function). We will not be using the extra information in the
+ // dbg.assign in that case, just use its dbg.value fields.
+ LLVM_FALLTHROUGH;
case Intrinsic::dbg_value: {
// This form of DBG_VALUE is target-independent.
const DbgValueInst *DI = cast<DbgValueInst>(II);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 892bfbd62f0d..252b6e9997a7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1127,8 +1127,9 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Action = TargetLowering::Custom;
break;
case ISD::READCYCLECOUNTER:
- // READCYCLECOUNTER returns an i64, even if type legalization might have
- // expanded that to several smaller types.
+ case ISD::READSTEADYCOUNTER:
+ // READCYCLECOUNTER and READSTEADYCOUNTER return a i64, even if type
+ // legalization might have expanded that to several smaller types.
Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64);
break;
case ISD::READ_REGISTER:
@@ -3080,6 +3081,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Node->getOperand(0));
break;
case ISD::READCYCLECOUNTER:
+ case ISD::READSTEADYCOUNTER:
// If the target didn't expand this, just return 'zero' and preserve the
// chain.
Results.append(Node->getNumValues() - 1,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 39b7e0615541..a4ba261686c6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2648,7 +2648,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::LLRINT: ExpandIntRes_XROUND_XRINT(N, Lo, Hi); break;
case ISD::LOAD: ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break;
case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break;
- case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break;
+ case ISD::READCYCLECOUNTER:
+ case ISD::READSTEADYCOUNTER: ExpandIntRes_READCOUNTER(N, Lo, Hi); break;
case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break;
case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
@@ -2824,25 +2825,26 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
EVT NVT = InL.getValueType();
unsigned VTBits = N->getValueType(0).getSizeInBits();
unsigned NVTBits = NVT.getSizeInBits();
- EVT ShTy = N->getOperand(1).getValueType();
if (N->getOpcode() == ISD::SHL) {
if (Amt.uge(VTBits)) {
Lo = Hi = DAG.getConstant(0, DL, NVT);
} else if (Amt.ugt(NVTBits)) {
Lo = DAG.getConstant(0, DL, NVT);
- Hi = DAG.getNode(ISD::SHL, DL,
- NVT, InL, DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ Hi = DAG.getNode(ISD::SHL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(Amt - NVTBits, NVT, DL));
} else if (Amt == NVTBits) {
Lo = DAG.getConstant(0, DL, NVT);
Hi = InL;
} else {
- Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy));
- Hi = DAG.getNode(ISD::OR, DL, NVT,
- DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getConstant(Amt, DL, ShTy)),
- DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ Lo = DAG.getNode(ISD::SHL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL));
+ Hi = DAG.getNode(
+ ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(Amt, NVT, DL)),
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
}
return;
}
@@ -2851,19 +2853,21 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
if (Amt.uge(VTBits)) {
Lo = Hi = DAG.getConstant(0, DL, NVT);
} else if (Amt.ugt(NVTBits)) {
- Lo = DAG.getNode(ISD::SRL, DL,
- NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ Lo = DAG.getNode(ISD::SRL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(Amt - NVTBits, NVT, DL));
Hi = DAG.getConstant(0, DL, NVT);
} else if (Amt == NVTBits) {
Lo = InH;
Hi = DAG.getConstant(0, DL, NVT);
} else {
- Lo = DAG.getNode(ISD::OR, DL, NVT,
- DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getConstant(Amt, DL, ShTy)),
- DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
- Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
+ Lo = DAG.getNode(
+ ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ Hi = DAG.getNode(ISD::SRL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(Amt, NVT, DL));
}
return;
}
@@ -2871,23 +2875,25 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
if (Amt.uge(VTBits)) {
Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
- DAG.getConstant(NVTBits - 1, DL, ShTy));
+ DAG.getShiftAmountConstant(NVTBits - 1, NVT, DL));
} else if (Amt.ugt(NVTBits)) {
Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
- DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ DAG.getShiftAmountConstant(Amt - NVTBits, NVT, DL));
Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
- DAG.getConstant(NVTBits - 1, DL, ShTy));
+ DAG.getShiftAmountConstant(NVTBits - 1, NVT, DL));
} else if (Amt == NVTBits) {
Lo = InH;
Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
- DAG.getConstant(NVTBits - 1, DL, ShTy));
+ DAG.getShiftAmountConstant(NVTBits - 1, NVT, DL));
} else {
- Lo = DAG.getNode(ISD::OR, DL, NVT,
- DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getConstant(Amt, DL, ShTy)),
- DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
- Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
+ Lo = DAG.getNode(
+ ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getShiftAmountConstant(Amt, NVT, DL));
}
}
@@ -4026,8 +4032,8 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
Lo, Hi);
}
-void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
+void DAGTypeLegalizer::ExpandIntRes_READCOUNTER(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
SDLoc DL(N);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 09f0bca8b861..911498716285 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -439,7 +439,7 @@ private:
void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
- void ExpandIntRes_READCYCLECOUNTER (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_READCOUNTER (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 55eee780d512..421bb516ad24 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1734,6 +1734,12 @@ SDValue SelectionDAG::getShiftAmountConstant(uint64_t Val, EVT VT,
return getConstant(Val, DL, ShiftVT);
}
+SDValue SelectionDAG::getShiftAmountConstant(const APInt &Val, EVT VT,
+ const SDLoc &DL, bool LegalTypes) {
+ assert(Val.ult(VT.getScalarSizeInBits()) && "Out of range shift");
+ return getShiftAmountConstant(Val.getZExtValue(), VT, DL, LegalTypes);
+}
+
SDValue SelectionDAG::getVectorIdxConstant(uint64_t Val, const SDLoc &DL,
bool isTarget) {
return getConstant(Val, DL, TLI->getVectorIdxTy(getDataLayout()), isTarget);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 5ce1013f30fd..28664b2ed905 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6781,6 +6781,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.setRoot(Res.getValue(1));
return;
}
+ case Intrinsic::readsteadycounter: {
+ SDValue Op = getRoot();
+ Res = DAG.getNode(ISD::READSTEADYCOUNTER, sdl,
+ DAG.getVTList(MVT::i64, MVT::Other), Op);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
case Intrinsic::bitreverse:
setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl,
getValue(I.getArgOperand(0)).getValueType(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index a28d834f0522..0fbd999694f1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -104,6 +104,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::ATOMIC_STORE: return "AtomicStore";
case ISD::PCMARKER: return "PCMarker";
case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
+ case ISD::READSTEADYCOUNTER: return "ReadSteadyCounter";
case ISD::SRCVALUE: return "SrcValue";
case ISD::MDNODE_SDNODE: return "MDNode";
case ISD::EntryToken: return "EntryToken";
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 1320479915e3..d7f4d1c89375 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -1287,7 +1287,7 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
if (SD.isUndef() && SD.getValueType().getSizeInBits() <= 64) {
// Lowering relocate(undef) as arbitrary constant. Current constant value
// is chosen such that it's unlikely to be a valid pointer.
- setValue(&Relocate, DAG.getTargetConstant(0xFEFEFEFE, SDLoc(SD), MVT::i64));
+ setValue(&Relocate, DAG.getConstant(0xFEFEFEFE, SDLoc(SD), MVT::i64));
return;
}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 16cd14ba3de9..dc766928e5dc 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -122,6 +122,46 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
+ // Use the f128 variants of math functions on x86_64
+ if (TT.getArch() == Triple::ArchType::x86_64) {
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
+ setLibcallName(RTLIB::FMA_F128, "fmaf128");
+ setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
+ setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
+ setLibcallName(RTLIB::EXP10_F128, "exp10f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
+ setLibcallName(RTLIB::CEIL_F128, "ceilf128");
+ setLibcallName(RTLIB::TRUNC_F128, "truncf128");
+ setLibcallName(RTLIB::RINT_F128, "rintf128");
+ setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
+ setLibcallName(RTLIB::ROUND_F128, "roundf128");
+ setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
+ setLibcallName(RTLIB::FLOOR_F128, "floorf128");
+ setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::LROUND_F128, "lroundf128");
+ setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
+ setLibcallName(RTLIB::LRINT_F128, "lrintf128");
+ setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
+ setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
+ setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+ }
+
// For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
if (TT.isPPC()) {
setLibcallName(RTLIB::ADD_F128, "__addkf3");
@@ -924,6 +964,9 @@ void TargetLoweringBase::initActions() {
// Most targets also ignore the @llvm.readcyclecounter intrinsic.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Expand);
+ // Most targets also ignore the @llvm.readsteadycounter intrinsic.
+ setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Expand);
+
// ConstantFP nodes default to expand. Targets can either change this to
// Legal, in which case all fp constants are legal, or use isFPImmLegal()
// to optimize expansions for certain constants.
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 053caf518bd1..7a3bc6c2043f 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -937,6 +937,8 @@ bool TypePromotionImpl::run(Function &F, const TargetMachine *TM,
return 0;
EVT PromotedVT = TLI->getTypeToTransformTo(*Ctx, SrcVT);
+ if (TLI->isSExtCheaperThanZExt(SrcVT, PromotedVT))
+ return 0;
if (RegisterBitWidth < PromotedVT.getFixedSizeInBits()) {
LLVM_DEBUG(dbgs() << "IR Promotion: Couldn't find target register "
<< "for promoted type\n");
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index ba3b9e00e34e..731fcabaee40 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -264,6 +264,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
return FixedVectorType::get(Type::getInt8Ty(Context), 1);
case MVT::v2i8:
return FixedVectorType::get(Type::getInt8Ty(Context), 2);
+ case MVT::v3i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 3);
case MVT::v4i8:
return FixedVectorType::get(Type::getInt8Ty(Context), 4);
case MVT::v8i8:
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
index 527c7a0ec364..7c17c5b79c7c 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
@@ -351,6 +351,9 @@ protected:
/// \defgroup Data members accessed sequentially.
///
/// @{
+ /// Data global for the whole linking process.
+ LinkingGlobalData GlobalData;
+
/// DwarfStringPoolEntries for .debug_str section.
StringEntryToDwarfStringPoolEntryMap DebugStrStrings;
@@ -368,9 +371,6 @@ protected:
/// Overall compile units number.
uint64_t OverallNumberOfCU = 0;
-
- /// Data global for the whole linking process.
- LinkingGlobalData GlobalData;
/// @}
};
diff --git a/llvm/lib/DebugInfo/CodeView/EnumTables.cpp b/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
index 7e3087373bfa..1ff34bd27e36 100644
--- a/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
+++ b/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -95,17 +95,19 @@ static const EnumEntry<uint8_t> FrameCookieKinds[] = {
};
static const EnumEntry<codeview::SourceLanguage> SourceLanguages[] = {
- CV_ENUM_ENT(SourceLanguage, C), CV_ENUM_ENT(SourceLanguage, Cpp),
- CV_ENUM_ENT(SourceLanguage, Fortran), CV_ENUM_ENT(SourceLanguage, Masm),
- CV_ENUM_ENT(SourceLanguage, Pascal), CV_ENUM_ENT(SourceLanguage, Basic),
- CV_ENUM_ENT(SourceLanguage, Cobol), CV_ENUM_ENT(SourceLanguage, Link),
- CV_ENUM_ENT(SourceLanguage, Cvtres), CV_ENUM_ENT(SourceLanguage, Cvtpgd),
- CV_ENUM_ENT(SourceLanguage, CSharp), CV_ENUM_ENT(SourceLanguage, VB),
- CV_ENUM_ENT(SourceLanguage, ILAsm), CV_ENUM_ENT(SourceLanguage, Java),
- CV_ENUM_ENT(SourceLanguage, JScript), CV_ENUM_ENT(SourceLanguage, MSIL),
- CV_ENUM_ENT(SourceLanguage, HLSL), CV_ENUM_ENT(SourceLanguage, D),
- CV_ENUM_ENT(SourceLanguage, Swift), CV_ENUM_ENT(SourceLanguage, Rust),
- CV_ENUM_ENT(SourceLanguage, ObjC), CV_ENUM_ENT(SourceLanguage, ObjCpp),
+ CV_ENUM_ENT(SourceLanguage, C), CV_ENUM_ENT(SourceLanguage, Cpp),
+ CV_ENUM_ENT(SourceLanguage, Fortran), CV_ENUM_ENT(SourceLanguage, Masm),
+ CV_ENUM_ENT(SourceLanguage, Pascal), CV_ENUM_ENT(SourceLanguage, Basic),
+ CV_ENUM_ENT(SourceLanguage, Cobol), CV_ENUM_ENT(SourceLanguage, Link),
+ CV_ENUM_ENT(SourceLanguage, Cvtres), CV_ENUM_ENT(SourceLanguage, Cvtpgd),
+ CV_ENUM_ENT(SourceLanguage, CSharp), CV_ENUM_ENT(SourceLanguage, VB),
+ CV_ENUM_ENT(SourceLanguage, ILAsm), CV_ENUM_ENT(SourceLanguage, Java),
+ CV_ENUM_ENT(SourceLanguage, JScript), CV_ENUM_ENT(SourceLanguage, MSIL),
+ CV_ENUM_ENT(SourceLanguage, HLSL), CV_ENUM_ENT(SourceLanguage, D),
+ CV_ENUM_ENT(SourceLanguage, Swift), CV_ENUM_ENT(SourceLanguage, Rust),
+ CV_ENUM_ENT(SourceLanguage, ObjC), CV_ENUM_ENT(SourceLanguage, ObjCpp),
+ CV_ENUM_ENT(SourceLanguage, AliasObj), CV_ENUM_ENT(SourceLanguage, Go),
+ {"Swift", SourceLanguage::OldSwift},
};
static const EnumEntry<uint32_t> CompileSym2FlagNames[] = {
@@ -205,6 +207,7 @@ static const EnumEntry<unsigned> CPUTypeNames[] = {
CV_ENUM_CLASS_ENT(CPUType, HybridX86ARM64),
CV_ENUM_CLASS_ENT(CPUType, ARM64EC),
CV_ENUM_CLASS_ENT(CPUType, ARM64X),
+ CV_ENUM_CLASS_ENT(CPUType, Unknown),
CV_ENUM_CLASS_ENT(CPUType, D3D11_Shader),
};
@@ -421,6 +424,7 @@ static const EnumEntry<uint8_t> CallingConventions[] = {
CV_ENUM_CLASS_ENT(CallingConvention, ClrCall),
CV_ENUM_CLASS_ENT(CallingConvention, Inline),
CV_ENUM_CLASS_ENT(CallingConvention, NearVector),
+ CV_ENUM_CLASS_ENT(CallingConvention, Swift),
};
static const EnumEntry<uint8_t> FunctionOptionEnum[] = {
diff --git a/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index df7e42df1afc..776676410e78 100644
--- a/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -133,6 +133,7 @@ static const EnumEntry<uint8_t> CallingConventions[] = {
ENUM_ENTRY(CallingConvention, ClrCall),
ENUM_ENTRY(CallingConvention, Inline),
ENUM_ENTRY(CallingConvention, NearVector),
+ ENUM_ENTRY(CallingConvention, Swift),
};
static const EnumEntry<uint8_t> FunctionOptionEnum[] = {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 2124ff835c57..20ef59e7b442 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -1025,6 +1025,11 @@ void DWARFVerifier::verifyDebugLineRows() {
FileIndex++;
}
+ // Nothing to verify in a line table with a single row containing the end
+ // sequence.
+ if (LineTable->Rows.size() == 1 && LineTable->Rows.front().EndSequence)
+ continue;
+
// Verify rows.
uint64_t PrevAddress = 0;
uint32_t RowIndex = 0;
@@ -1048,13 +1053,7 @@ void DWARFVerifier::verifyDebugLineRows() {
});
}
- // If the prologue contains no file names and the line table has only one
- // row, do not verify the file index, this is a line table of an empty
- // file with an end_sequence, but the DWARF standard sets the file number
- // to 1 by default, otherwise verify file index.
- if ((LineTable->Prologue.FileNames.size() ||
- LineTable->Rows.size() != 1) &&
- !LineTable->hasFileAtIndex(Row.File)) {
+ if (!LineTable->hasFileAtIndex(Row.File)) {
++NumDebugLineErrors;
ErrorCategory.Report("Invalid file index in debug_line", [&]() {
error() << ".debug_line["
@@ -1881,33 +1880,38 @@ bool DWARFVerifier::handleDebugStrOffsets() {
OS << "Verifying .debug_str_offsets...\n";
const DWARFObject &DObj = DCtx.getDWARFObj();
bool Success = true;
+
+ // dwo sections may contain the legacy debug_str_offsets format (and they
+ // can't be mixed with dwarf 5's format). This section format contains no
+ // header.
+ // As such, check the version from debug_info and, if we are in the legacy
+ // mode (Dwarf <= 4), extract Dwarf32/Dwarf64.
+ std::optional<DwarfFormat> DwoLegacyDwarf4Format;
+ DObj.forEachInfoDWOSections([&](const DWARFSection &S) {
+ if (DwoLegacyDwarf4Format)
+ return;
+ DWARFDataExtractor DebugInfoData(DObj, S, DCtx.isLittleEndian(), 0);
+ uint64_t Offset = 0;
+ DwarfFormat InfoFormat = DebugInfoData.getInitialLength(&Offset).second;
+ if (uint16_t InfoVersion = DebugInfoData.getU16(&Offset); InfoVersion <= 4)
+ DwoLegacyDwarf4Format = InfoFormat;
+ });
+
Success &= verifyDebugStrOffsets(
- ".debug_str_offsets.dwo", DObj.getStrOffsetsDWOSection(),
- DObj.getStrDWOSection(), &DWARFObject::forEachInfoDWOSections);
+ DwoLegacyDwarf4Format, ".debug_str_offsets.dwo",
+ DObj.getStrOffsetsDWOSection(), DObj.getStrDWOSection());
Success &= verifyDebugStrOffsets(
- ".debug_str_offsets", DObj.getStrOffsetsSection(), DObj.getStrSection(),
- &DWARFObject::forEachInfoSections);
+ /*LegacyFormat=*/std::nullopt, ".debug_str_offsets",
+ DObj.getStrOffsetsSection(), DObj.getStrSection());
return Success;
}
bool DWARFVerifier::verifyDebugStrOffsets(
- StringRef SectionName, const DWARFSection &Section, StringRef StrData,
- void (DWARFObject::*VisitInfoSections)(
- function_ref<void(const DWARFSection &)>) const) {
+ std::optional<DwarfFormat> LegacyFormat, StringRef SectionName,
+ const DWARFSection &Section, StringRef StrData) {
const DWARFObject &DObj = DCtx.getDWARFObj();
- uint16_t InfoVersion = 0;
- DwarfFormat InfoFormat = DwarfFormat::DWARF32;
- (DObj.*VisitInfoSections)([&](const DWARFSection &S) {
- if (InfoVersion)
- return;
- DWARFDataExtractor DebugInfoData(DObj, S, DCtx.isLittleEndian(), 0);
- uint64_t Offset = 0;
- InfoFormat = DebugInfoData.getInitialLength(&Offset).second;
- InfoVersion = DebugInfoData.getU16(&Offset);
- });
DWARFDataExtractor DA(DObj, Section, DCtx.isLittleEndian(), 0);
-
DataExtractor::Cursor C(0);
uint64_t NextUnit = 0;
bool Success = true;
@@ -1915,8 +1919,8 @@ bool DWARFVerifier::verifyDebugStrOffsets(
DwarfFormat Format;
uint64_t Length;
uint64_t StartOffset = C.tell();
- if (InfoVersion == 4) {
- Format = InfoFormat;
+ if (LegacyFormat) {
+ Format = *LegacyFormat;
Length = DA.getData().size();
NextUnit = C.tell() + Length;
} else {
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
index 6268c7845a14..3a28cd412de9 100644
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -21,6 +21,8 @@
#include "llvm/DebugInfo/GSYM/GsymCreator.h"
#include "llvm/DebugInfo/GSYM/GsymReader.h"
#include "llvm/DebugInfo/GSYM/InlineInfo.h"
+#include "llvm/DebugInfo/GSYM/OutputAggregator.h"
+
#include <optional>
using namespace llvm;
@@ -215,9 +217,9 @@ ConvertDWARFRanges(const DWARFAddressRangesVector &DwarfRanges) {
return Ranges;
}
-static void parseInlineInfo(GsymCreator &Gsym, raw_ostream *Log, CUInfo &CUI,
- DWARFDie Die, uint32_t Depth, FunctionInfo &FI,
- InlineInfo &Parent,
+static void parseInlineInfo(GsymCreator &Gsym, OutputAggregator &Out,
+ CUInfo &CUI, DWARFDie Die, uint32_t Depth,
+ FunctionInfo &FI, InlineInfo &Parent,
const AddressRanges &AllParentRanges,
bool &WarnIfEmpty) {
if (!hasInlineInfo(Die, Depth))
@@ -250,14 +252,18 @@ static void parseInlineInfo(GsymCreator &Gsym, raw_ostream *Log, CUInfo &CUI,
// parsing for.
if (AllParentRanges.contains(InlineRange)) {
WarnIfEmpty = false;
- } else if (Log) {
- *Log << "error: inlined function DIE at "
- << HEX32(Die.getOffset()) << " has a range ["
- << HEX64(InlineRange.start()) << " - "
- << HEX64(InlineRange.end()) << ") that isn't contained in "
- << "any parent address ranges, this inline range will be "
- "removed.\n";
- }
+ } else
+ Out.Report("Function DIE has uncontained address range",
+ [&](raw_ostream &OS) {
+ OS << "error: inlined function DIE at "
+ << HEX32(Die.getOffset()) << " has a range ["
+ << HEX64(InlineRange.start()) << " - "
+ << HEX64(InlineRange.end())
+ << ") that isn't contained in "
+ << "any parent address ranges, this inline range "
+ "will be "
+ "removed.\n";
+ });
}
}
}
@@ -281,26 +287,30 @@ static void parseInlineInfo(GsymCreator &Gsym, raw_ostream *Log, CUInfo &CUI,
II.CallLine = dwarf::toUnsigned(Die.find(dwarf::DW_AT_call_line), 0);
// parse all children and append to parent
for (DWARFDie ChildDie : Die.children())
- parseInlineInfo(Gsym, Log, CUI, ChildDie, Depth + 1, FI, II,
+ parseInlineInfo(Gsym, Out, CUI, ChildDie, Depth + 1, FI, II,
AllInlineRanges, WarnIfEmpty);
Parent.Children.emplace_back(std::move(II));
- } else if (Log) {
- *Log << "error: inlined function DIE at " << HEX32(Die.getOffset())
- << " has an invalid file index " << DwarfFileIdx
- << " in its DW_AT_call_file attribute, this inline entry and all "
- << "children will be removed.\n";
- }
+ } else
+ Out.Report(
+ "Inlined function die has invlaid file index in DW_AT_call_file",
+ [&](raw_ostream &OS) {
+ OS << "error: inlined function DIE at " << HEX32(Die.getOffset())
+ << " has an invalid file index " << DwarfFileIdx
+ << " in its DW_AT_call_file attribute, this inline entry and "
+ "all "
+ << "children will be removed.\n";
+ });
return;
}
if (Tag == dwarf::DW_TAG_subprogram || Tag == dwarf::DW_TAG_lexical_block) {
// skip this Die and just recurse down
for (DWARFDie ChildDie : Die.children())
- parseInlineInfo(Gsym, Log, CUI, ChildDie, Depth + 1, FI, Parent,
+ parseInlineInfo(Gsym, Out, CUI, ChildDie, Depth + 1, FI, Parent,
AllParentRanges, WarnIfEmpty);
}
}
-static void convertFunctionLineTable(raw_ostream *Log, CUInfo &CUI,
+static void convertFunctionLineTable(OutputAggregator &Out, CUInfo &CUI,
DWARFDie Die, GsymCreator &Gsym,
FunctionInfo &FI) {
std::vector<uint32_t> RowVector;
@@ -319,15 +329,15 @@ static void convertFunctionLineTable(raw_ostream *Log, CUInfo &CUI,
if (FilePath.empty()) {
// If we had a DW_AT_decl_file, but got no file then we need to emit a
// warning.
- if (Log) {
+ Out.Report("Invalid file index in DW_AT_decl_file", [&](raw_ostream &OS) {
const uint64_t DwarfFileIdx = dwarf::toUnsigned(
Die.findRecursively(dwarf::DW_AT_decl_file), UINT32_MAX);
- *Log << "error: function DIE at " << HEX32(Die.getOffset())
- << " has an invalid file index " << DwarfFileIdx
- << " in its DW_AT_decl_file attribute, unable to create a single "
- << "line entry from the DW_AT_decl_file/DW_AT_decl_line "
- << "attributes.\n";
- }
+ OS << "error: function DIE at " << HEX32(Die.getOffset())
+ << " has an invalid file index " << DwarfFileIdx
+ << " in its DW_AT_decl_file attribute, unable to create a single "
+ << "line entry from the DW_AT_decl_file/DW_AT_decl_line "
+ << "attributes.\n";
+ });
return;
}
if (auto Line =
@@ -347,14 +357,15 @@ static void convertFunctionLineTable(raw_ostream *Log, CUInfo &CUI,
std::optional<uint32_t> OptFileIdx =
CUI.DWARFToGSYMFileIndex(Gsym, Row.File);
if (!OptFileIdx) {
- if (Log) {
- *Log << "error: function DIE at " << HEX32(Die.getOffset()) << " has "
- << "a line entry with invalid DWARF file index, this entry will "
- << "be removed:\n";
- Row.dumpTableHeader(*Log, /*Indent=*/0);
- Row.dump(*Log);
- *Log << "\n";
- }
+ Out.Report(
+ "Invalid file index in DWARF line table", [&](raw_ostream &OS) {
+ OS << "error: function DIE at " << HEX32(Die.getOffset()) << " has "
+ << "a line entry with invalid DWARF file index, this entry will "
+ << "be removed:\n";
+ Row.dumpTableHeader(OS, /*Indent=*/0);
+ Row.dump(OS);
+ OS << "\n";
+ });
continue;
}
const uint32_t FileIdx = OptFileIdx.value();
@@ -367,12 +378,15 @@ static void convertFunctionLineTable(raw_ostream *Log, CUInfo &CUI,
// an error, but not worth stopping the creation of the GSYM.
if (!FI.Range.contains(RowAddress)) {
if (RowAddress < FI.Range.start()) {
- if (Log) {
- *Log << "error: DIE has a start address whose LowPC is between the "
- "line table Row[" << RowIndex << "] with address "
- << HEX64(RowAddress) << " and the next one.\n";
- Die.dump(*Log, 0, DIDumpOptions::getForSingleDIE());
- }
+ Out.Report("Start address lies between valid Row table entries",
+ [&](raw_ostream &OS) {
+ OS << "error: DIE has a start address whose LowPC is "
+ "between the "
+ "line table Row["
+ << RowIndex << "] with address " << HEX64(RowAddress)
+ << " and the next one.\n";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ });
RowAddress = FI.Range.start();
} else {
continue;
@@ -388,20 +402,21 @@ static void convertFunctionLineTable(raw_ostream *Log, CUInfo &CUI,
// line entry we already have in our "function_info.Lines". If
// so break out after printing a warning.
auto FirstLE = FI.OptLineTable->first();
- if (FirstLE && *FirstLE == LE) {
- if (Log && !Gsym.isQuiet()) {
- *Log << "warning: duplicate line table detected for DIE:\n";
- Die.dump(*Log, 0, DIDumpOptions::getForSingleDIE());
- }
- } else {
- if (Log) {
- *Log << "error: line table has addresses that do not "
- << "monotonically increase:\n";
- for (uint32_t RowIndex2 : RowVector)
- CUI.LineTable->Rows[RowIndex2].dump(*Log);
- Die.dump(*Log, 0, DIDumpOptions::getForSingleDIE());
- }
- }
+ if (FirstLE && *FirstLE == LE)
+ // if (Log && !Gsym.isQuiet()) { TODO <-- This looks weird
+ Out.Report("Duplicate line table detected", [&](raw_ostream &OS) {
+ OS << "warning: duplicate line table detected for DIE:\n";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ });
+ else
+ Out.Report("Non-monotonically increasing addresses",
+ [&](raw_ostream &OS) {
+ OS << "error: line table has addresses that do not "
+ << "monotonically increase:\n";
+ for (uint32_t RowIndex2 : RowVector)
+ CUI.LineTable->Rows[RowIndex2].dump(OS);
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ });
break;
}
@@ -429,7 +444,8 @@ static void convertFunctionLineTable(raw_ostream *Log, CUInfo &CUI,
FI.OptLineTable = std::nullopt;
}
-void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
+void DwarfTransformer::handleDie(OutputAggregator &Out, CUInfo &CUI,
+ DWARFDie Die) {
switch (Die.getTag()) {
case dwarf::DW_TAG_subprogram: {
Expected<DWARFAddressRangesVector> RangesOrError = Die.getAddressRanges();
@@ -442,11 +458,11 @@ void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
break;
auto NameIndex = getQualifiedNameIndex(Die, CUI.Language, Gsym);
if (!NameIndex) {
- if (OS) {
- *OS << "error: function at " << HEX64(Die.getOffset())
- << " has no name\n ";
- Die.dump(*OS, 0, DIDumpOptions::getForSingleDIE());
- }
+ Out.Report("Function has no name", [&](raw_ostream &OS) {
+ OS << "error: function at " << HEX64(Die.getOffset())
+ << " has no name\n ";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ });
break;
}
// All ranges for the subprogram DIE in case it has multiple. We need to
@@ -472,8 +488,8 @@ void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
// Many linkers can't remove DWARF and might set the LowPC to zero. Since
// high PC can be an offset from the low PC in more recent DWARF versions
- // we need to watch for a zero'ed low pc which we do using
- // ValidTextRanges below.
+ // we need to watch for a zero'ed low pc which we do using ValidTextRanges
+ // below.
if (!Gsym.IsValidTextAddress(Range.LowPC)) {
// We expect zero and -1 to be invalid addresses in DWARF depending
// on the linker of the DWARF. This indicates a function was stripped
@@ -482,13 +498,15 @@ void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
if (Range.LowPC != 0) {
if (!Gsym.isQuiet()) {
// Unexpected invalid address, emit a warning
- if (OS) {
- *OS << "warning: DIE has an address range whose start address "
- "is not in any executable sections ("
- << *Gsym.GetValidTextRanges()
- << ") and will not be processed:\n";
- Die.dump(*OS, 0, DIDumpOptions::getForSingleDIE());
- }
+ Out.Report("Address range starts outside executable section",
+ [&](raw_ostream &OS) {
+ OS << "warning: DIE has an address range whose "
+ "start address "
+ "is not in any executable sections ("
+ << *Gsym.GetValidTextRanges()
+ << ") and will not be processed:\n";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ });
}
}
break;
@@ -498,14 +516,14 @@ void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
FI.Range = {Range.LowPC, Range.HighPC};
FI.Name = *NameIndex;
if (CUI.LineTable)
- convertFunctionLineTable(OS, CUI, Die, Gsym, FI);
+ convertFunctionLineTable(Out, CUI, Die, Gsym, FI);
if (hasInlineInfo(Die, 0)) {
FI.Inline = InlineInfo();
FI.Inline->Name = *NameIndex;
FI.Inline->Ranges.insert(FI.Range);
bool WarnIfEmpty = true;
- parseInlineInfo(Gsym, OS, CUI, Die, 0, FI, *FI.Inline,
+ parseInlineInfo(Gsym, Out, CUI, Die, 0, FI, *FI.Inline,
AllSubprogramRanges, WarnIfEmpty);
// Make sure we at least got some valid inline info other than just
// the top level function. If we didn't then remove the inline info
@@ -517,11 +535,14 @@ void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
// information object, we will know if we got anything valid from the
// debug info.
if (FI.Inline->Children.empty()) {
- if (WarnIfEmpty && OS && !Gsym.isQuiet()) {
- *OS << "warning: DIE contains inline function information that has "
- "no valid ranges, removing inline information:\n";
- Die.dump(*OS, 0, DIDumpOptions::getForSingleDIE());
- }
+ if (WarnIfEmpty && !Gsym.isQuiet())
+ Out.Report("DIE contains inline functions with no valid ranges",
+ [&](raw_ostream &OS) {
+ OS << "warning: DIE contains inline function "
+ "information that has no valid ranges, removing "
+ "inline information:\n";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ });
FI.Inline = std::nullopt;
}
}
@@ -532,33 +553,28 @@ void DwarfTransformer::handleDie(raw_ostream *OS, CUInfo &CUI, DWARFDie Die) {
break;
}
for (DWARFDie ChildDie : Die.children())
- handleDie(OS, CUI, ChildDie);
+ handleDie(Out, CUI, ChildDie);
}
-Error DwarfTransformer::convert(uint32_t NumThreads, raw_ostream *OS) {
+Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) {
size_t NumBefore = Gsym.getNumFunctionInfos();
- std::once_flag flag;
auto getDie = [&](DWARFUnit &DwarfUnit) -> DWARFDie {
DWARFDie ReturnDie = DwarfUnit.getUnitDIE(false);
if (DwarfUnit.getDWOId()) {
DWARFUnit *DWOCU = DwarfUnit.getNonSkeletonUnitDIE(false).getDwarfUnit();
- if (!DWOCU->isDWOUnit()) {
- if (OS) {
- std::string DWOName = dwarf::toString(
- DwarfUnit.getUnitDIE().find(
- {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}),
- "");
- *OS << "warning: Unable to retrieve DWO .debug_info section for "
- << DWOName << "\n";
- } else {
- std::call_once(flag, []() {
- outs()
- << "warning: Unable to retrieve DWO .debug_info section for "
- "some "
- "object files. (Remove the --quiet flag for full output)\n";
- });
- }
- } else {
+ if (!DWOCU->isDWOUnit())
+ Out.Report(
+ "warning: Unable to retrieve DWO .debug_info section for some "
+ "object files. (Remove the --quiet flag for full output)",
+ [&](raw_ostream &OS) {
+ std::string DWOName = dwarf::toString(
+ DwarfUnit.getUnitDIE().find(
+ {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}),
+ "");
+ OS << "warning: Unable to retrieve DWO .debug_info section for "
+ << DWOName << "\n";
+ });
+ else {
ReturnDie = DWOCU->getUnitDIE(false);
}
}
@@ -570,7 +586,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads, raw_ostream *OS) {
for (const auto &CU : DICtx.compile_units()) {
DWARFDie Die = getDie(*CU);
CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
- handleDie(OS, CUI, Die);
+ handleDie(Out, CUI, Die);
}
} else {
// LLVM Dwarf parser is not thread-safe and we need to parse all DWARF up
@@ -596,29 +612,31 @@ Error DwarfTransformer::convert(uint32_t NumThreads, raw_ostream *OS) {
DWARFDie Die = getDie(*CU);
if (Die) {
CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
- pool.async([this, CUI, &LogMutex, OS, Die]() mutable {
- std::string ThreadLogStorage;
- raw_string_ostream ThreadOS(ThreadLogStorage);
- handleDie(OS ? &ThreadOS: nullptr, CUI, Die);
- ThreadOS.flush();
- if (OS && !ThreadLogStorage.empty()) {
- // Print ThreadLogStorage lines into an actual stream under a lock
- std::lock_guard<std::mutex> guard(LogMutex);
- *OS << ThreadLogStorage;
+ pool.async([this, CUI, &LogMutex, Out, Die]() mutable {
+ std::string storage;
+ raw_string_ostream StrStream(storage);
+ OutputAggregator ThreadOut(Out.GetOS() ? &StrStream : nullptr);
+ handleDie(ThreadOut, CUI, Die);
+ // Print ThreadLogStorage lines into an actual stream under a lock
+ std::lock_guard<std::mutex> guard(LogMutex);
+ if (Out.GetOS()) {
+ StrStream.flush();
+ Out << storage;
}
+ Out.Merge(ThreadOut);
});
}
}
pool.wait();
}
size_t FunctionsAddedCount = Gsym.getNumFunctionInfos() - NumBefore;
- if (OS)
- *OS << "Loaded " << FunctionsAddedCount << " functions from DWARF.\n";
+ Out << "Loaded " << FunctionsAddedCount << " functions from DWARF.\n";
return Error::success();
}
-llvm::Error DwarfTransformer::verify(StringRef GsymPath, raw_ostream &Log) {
- Log << "Verifying GSYM file \"" << GsymPath << "\":\n";
+llvm::Error DwarfTransformer::verify(StringRef GsymPath,
+ OutputAggregator &Out) {
+ Out << "Verifying GSYM file \"" << GsymPath << "\":\n";
auto Gsym = GsymReader::openFile(GsymPath);
if (!Gsym)
@@ -637,9 +655,9 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath, raw_ostream &Log) {
auto FI = Gsym->getFunctionInfo(*FuncAddr);
if (!FI)
- return createStringError(std::errc::invalid_argument,
- "failed to extract function info for address 0x%"
- PRIu64, *FuncAddr);
+ return createStringError(
+ std::errc::invalid_argument,
+ "failed to extract function info for address 0x%" PRIu64, *FuncAddr);
for (auto Addr = *FuncAddr; Addr < *FuncAddr + FI->size(); ++Addr) {
const object::SectionedAddress SectAddr{
@@ -664,24 +682,27 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath, raw_ostream &Log) {
}
if (NumDwarfInlineInfos > 0 &&
NumDwarfInlineInfos != LR->Locations.size()) {
- Log << "error: address " << HEX64(Addr) << " has "
- << NumDwarfInlineInfos << " DWARF inline frames and GSYM has "
- << LR->Locations.size() << "\n";
- Log << " " << NumDwarfInlineInfos << " DWARF frames:\n";
- for (size_t Idx = 0; Idx < NumDwarfInlineInfos; ++Idx) {
- const auto &dii = DwarfInlineInfos.getFrame(Idx);
- Log << " [" << Idx << "]: " << dii.FunctionName << " @ "
- << dii.FileName << ':' << dii.Line << '\n';
- }
- Log << " " << LR->Locations.size() << " GSYM frames:\n";
- for (size_t Idx = 0, count = LR->Locations.size();
- Idx < count; ++Idx) {
- const auto &gii = LR->Locations[Idx];
- Log << " [" << Idx << "]: " << gii.Name << " @ " << gii.Dir
- << '/' << gii.Base << ':' << gii.Line << '\n';
+ if (Out.GetOS()) {
+ raw_ostream &Log = *Out.GetOS();
+ Log << "error: address " << HEX64(Addr) << " has "
+ << NumDwarfInlineInfos << " DWARF inline frames and GSYM has "
+ << LR->Locations.size() << "\n";
+ Log << " " << NumDwarfInlineInfos << " DWARF frames:\n";
+ for (size_t Idx = 0; Idx < NumDwarfInlineInfos; ++Idx) {
+ const auto &dii = DwarfInlineInfos.getFrame(Idx);
+ Log << " [" << Idx << "]: " << dii.FunctionName << " @ "
+ << dii.FileName << ':' << dii.Line << '\n';
+ }
+ Log << " " << LR->Locations.size() << " GSYM frames:\n";
+ for (size_t Idx = 0, count = LR->Locations.size(); Idx < count;
+ ++Idx) {
+ const auto &gii = LR->Locations[Idx];
+ Log << " [" << Idx << "]: " << gii.Name << " @ " << gii.Dir
+ << '/' << gii.Base << ':' << gii.Line << '\n';
+ }
+ DwarfInlineInfos = DICtx.getInliningInfoForAddress(SectAddr, DLIS);
+ Gsym->dump(Log, *FI);
}
- DwarfInlineInfos = DICtx.getInliningInfoForAddress(SectAddr, DLIS);
- Gsym->dump(Log, *FI);
continue;
}
@@ -693,17 +714,18 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath, raw_ostream &Log) {
gsymFilename = LR->getSourceFile(Idx);
// Verify function name
if (dii.FunctionName.find(gii.Name.str()) != 0)
- Log << "error: address " << HEX64(Addr) << " DWARF function \""
+ Out << "error: address " << HEX64(Addr) << " DWARF function \""
<< dii.FunctionName.c_str()
<< "\" doesn't match GSYM function \"" << gii.Name << "\"\n";
+
// Verify source file path
if (dii.FileName != gsymFilename)
- Log << "error: address " << HEX64(Addr) << " DWARF path \""
+ Out << "error: address " << HEX64(Addr) << " DWARF path \""
<< dii.FileName.c_str() << "\" doesn't match GSYM path \""
<< gsymFilename.c_str() << "\"\n";
// Verify source file line
if (dii.Line != gii.Line)
- Log << "error: address " << HEX64(Addr) << " DWARF line "
+ Out << "error: address " << HEX64(Addr) << " DWARF line "
<< dii.Line << " != GSYM line " << gii.Line << "\n";
}
}
diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
index 74138755090a..deedaeea2fe2 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
@@ -9,6 +9,7 @@
#include "llvm/DebugInfo/GSYM/FileWriter.h"
#include "llvm/DebugInfo/GSYM/Header.h"
#include "llvm/DebugInfo/GSYM/LineTable.h"
+#include "llvm/DebugInfo/GSYM/OutputAggregator.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/raw_ostream.h"
@@ -188,7 +189,7 @@ llvm::Error GsymCreator::encode(FileWriter &O) const {
return ErrorSuccess();
}
-llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
+llvm::Error GsymCreator::finalize(OutputAggregator &Out) {
std::lock_guard<std::mutex> Guard(Mutex);
if (Finalized)
return createStringError(std::errc::invalid_argument, "already finalized");
@@ -247,26 +248,29 @@ llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
// address ranges that have debug info are last in
// the sort.
if (!(Prev == Curr)) {
- if (Prev.hasRichInfo() && Curr.hasRichInfo()) {
- if (!Quiet) {
- OS << "warning: same address range contains "
- "different debug "
- << "info. Removing:\n"
- << Prev << "\nIn favor of this one:\n"
- << Curr << "\n";
- }
- }
+ if (Prev.hasRichInfo() && Curr.hasRichInfo())
+ Out.Report(
+ "Duplicate address ranges with different debug info.",
+ [&](raw_ostream &OS) {
+ OS << "warning: same address range contains "
+ "different debug "
+ << "info. Removing:\n"
+ << Prev << "\nIn favor of this one:\n"
+ << Curr << "\n";
+ });
+
// We want to swap the current entry with the previous since
// later entries with the same range always have more debug info
// or different debug info.
std::swap(Prev, Curr);
}
} else {
- if (!Quiet) { // print warnings about overlaps
+ Out.Report("Overlapping function ranges", [&](raw_ostream &OS) {
+ // print warnings about overlaps
OS << "warning: function ranges overlap:\n"
<< Prev << "\n"
<< Curr << "\n";
- }
+ });
FinalizedFuncs.emplace_back(std::move(Curr));
}
} else {
@@ -293,8 +297,8 @@ llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
Funcs.back().Range = {Funcs.back().Range.start(), Range->end()};
}
}
- OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
- << Funcs.size() << " total\n";
+ Out << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
+ << Funcs.size() << " total\n";
}
return Error::success();
}
@@ -494,8 +498,9 @@ llvm::Error GsymCreator::saveSegments(StringRef Path,
GsymCreator *GC = ExpectedGC->get();
if (GC == NULL)
break; // We had not more functions to encode.
- raw_null_ostream ErrorStrm;
- llvm::Error Err = GC->finalize(ErrorStrm);
+ // Don't collect any messages at all
+ OutputAggregator Out(nullptr);
+ llvm::Error Err = GC->finalize(Out);
if (Err)
return Err;
std::string SegmentedGsymPath;
diff --git a/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp b/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp
index a60b2d386076..2500e172f52d 100644
--- a/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp
@@ -14,8 +14,9 @@
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DebugInfo/GSYM/ObjectFileTransformer.h"
#include "llvm/DebugInfo/GSYM/GsymCreator.h"
+#include "llvm/DebugInfo/GSYM/ObjectFileTransformer.h"
+#include "llvm/DebugInfo/GSYM/OutputAggregator.h"
using namespace llvm;
using namespace gsym;
@@ -68,7 +69,7 @@ static std::vector<uint8_t> getUUID(const object::ObjectFile &Obj) {
}
llvm::Error ObjectFileTransformer::convert(const object::ObjectFile &Obj,
- raw_ostream *Log,
+ OutputAggregator &Out,
GsymCreator &Gsym) {
using namespace llvm::object;
@@ -99,8 +100,8 @@ llvm::Error ObjectFileTransformer::convert(const object::ObjectFile &Obj,
const uint64_t size = IsELF ? ELFSymbolRef(Sym).getSize() : 0;
Expected<StringRef> Name = Sym.getName();
if (!Name) {
- if (Log)
- logAllUnhandledErrors(Name.takeError(), *Log,
+ if (Out.GetOS())
+ logAllUnhandledErrors(Name.takeError(), *Out.GetOS(),
"ObjectFileTransformer: ");
else
consumeError(Name.takeError());
@@ -114,8 +115,8 @@ llvm::Error ObjectFileTransformer::convert(const object::ObjectFile &Obj,
FunctionInfo(*AddrOrErr, size, Gsym.insertString(*Name, NoCopy)));
}
size_t FunctionsAddedCount = Gsym.getNumFunctionInfos() - NumBefore;
- if (Log)
- *Log << "Loaded " << FunctionsAddedCount
- << " functions from symbol table.\n";
+ if (Out.GetOS())
+ *Out.GetOS() << "Loaded " << FunctionsAddedCount
+ << " functions from symbol table.\n";
return Error::success();
}
diff --git a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
index 2b318bf1c648..cb8afabec0db 100644
--- a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -96,6 +96,7 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, ClrCall , "clrcall", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Inline , "inlinecall", OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, NearVector , "vectorcall", OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_CallingConv, Swift, "swiftcall", OS)
}
return OS;
}
@@ -234,6 +235,9 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_Lang &Lang) {
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Rust, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, ObjC, OS)
CASE_OUTPUT_ENUM_CLASS_STR(PDB_Lang, ObjCpp, "ObjC++", OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, AliasObj, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Go, OS)
+ CASE_OUTPUT_ENUM_CLASS_STR(PDB_Lang, OldSwift, "Swift", OS)
}
return OS;
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
index 2fcdfcf9c066..0cf548ede938 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
@@ -133,38 +133,6 @@ const uint8_t
namespace llvm {
namespace jitlink {
-static Expected<const Edge &> getRISCVPCRelHi20(const Edge &E) {
- using namespace riscv;
- assert((E.getKind() == R_RISCV_PCREL_LO12_I ||
- E.getKind() == R_RISCV_PCREL_LO12_S) &&
- "Can only have high relocation for R_RISCV_PCREL_LO12_I or "
- "R_RISCV_PCREL_LO12_S");
-
- const Symbol &Sym = E.getTarget();
- const Block &B = Sym.getBlock();
- orc::ExecutorAddrDiff Offset = Sym.getOffset();
-
- struct Comp {
- bool operator()(const Edge &Lhs, orc::ExecutorAddrDiff Offset) {
- return Lhs.getOffset() < Offset;
- }
- bool operator()(orc::ExecutorAddrDiff Offset, const Edge &Rhs) {
- return Offset < Rhs.getOffset();
- }
- };
-
- auto Bound =
- std::equal_range(B.edges().begin(), B.edges().end(), Offset, Comp{});
-
- for (auto It = Bound.first; It != Bound.second; ++It) {
- if (It->getKind() == R_RISCV_PCREL_HI20)
- return *It;
- }
-
- return make_error<JITLinkError>(
- "No HI20 PCREL relocation type be found for LO12 PCREL relocation type");
-}
-
static uint32_t extractBits(uint32_t Num, unsigned Low, unsigned Size) {
return (Num & (((1ULL << Size) - 1) << Low)) >> Low;
}
@@ -184,9 +152,43 @@ class ELFJITLinker_riscv : public JITLinker<ELFJITLinker_riscv> {
public:
ELFJITLinker_riscv(std::unique_ptr<JITLinkContext> Ctx,
std::unique_ptr<LinkGraph> G, PassConfiguration PassConfig)
- : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {}
+ : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {
+ JITLinkerBase::getPassConfig().PostAllocationPasses.push_back(
+ [this](LinkGraph &G) { return gatherRISCVPCRelHi20(G); });
+ }
private:
+ DenseMap<std::pair<const Block *, orc::ExecutorAddrDiff>, const Edge *>
+ RelHi20;
+
+ Error gatherRISCVPCRelHi20(LinkGraph &G) {
+ for (Block *B : G.blocks())
+ for (Edge &E : B->edges())
+ if (E.getKind() == R_RISCV_PCREL_HI20)
+ RelHi20[{B, E.getOffset()}] = &E;
+
+ return Error::success();
+ }
+
+ Expected<const Edge &> getRISCVPCRelHi20(const Edge &E) const {
+ using namespace riscv;
+ assert((E.getKind() == R_RISCV_PCREL_LO12_I ||
+ E.getKind() == R_RISCV_PCREL_LO12_S) &&
+ "Can only have high relocation for R_RISCV_PCREL_LO12_I or "
+ "R_RISCV_PCREL_LO12_S");
+
+ const Symbol &Sym = E.getTarget();
+ const Block &B = Sym.getBlock();
+ orc::ExecutorAddrDiff Offset = Sym.getOffset();
+
+ auto It = RelHi20.find({&B, Offset});
+ if (It != RelHi20.end())
+ return *It->second;
+
+ return make_error<JITLinkError>("No HI20 PCREL relocation type be found "
+ "for LO12 PCREL relocation type");
+ }
+
Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const {
using namespace riscv;
using namespace llvm::support;
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 809b2d51f059..556031baaaed 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -312,10 +312,10 @@ private:
Addend = SignExtend64(RI.r_symbolnum, 24);
+ ++RelItr;
if (RelItr == RelEnd)
return make_error<JITLinkError>("Unpaired Addend reloc at " +
formatv("{0:x16}", FixupAddress));
- ++RelItr;
RI = getRelocationInfo(RelItr);
MachORelocKind = getRelocationKind(RI);
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp
index 460f4e1c448e..88cc3b04fb64 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h"
+#include "llvm/Support/Error.h"
namespace llvm {
namespace orc {
@@ -39,32 +40,38 @@ Error EPCDynamicLibrarySearchGenerator::tryToGenerate(
LookupSymbols.add(KV.first, SymbolLookupFlags::WeaklyReferencedSymbol);
}
- SymbolMap NewSymbols;
-
ExecutorProcessControl::LookupRequest Request(H, LookupSymbols);
- auto Result = EPC.lookupSymbols(Request);
- if (!Result)
- return Result.takeError();
-
- assert(Result->size() == 1 && "Results for more than one library returned");
- assert(Result->front().size() == LookupSymbols.size() &&
- "Result has incorrect number of elements");
-
- auto ResultI = Result->front().begin();
- for (auto &KV : LookupSymbols) {
- if (ResultI->getAddress())
- NewSymbols[KV.first] = *ResultI;
- ++ResultI;
- }
+ // Copy-capture LookupSymbols, since LookupRequest keeps a reference.
+ EPC.lookupSymbolsAsync(Request, [this, &JD, LS = std::move(LS),
+ LookupSymbols](auto Result) mutable {
+ if (!Result)
+ return LS.continueLookup(Result.takeError());
- // If there were no resolved symbols bail out.
- if (NewSymbols.empty())
- return Error::success();
+ assert(Result->size() == 1 && "Results for more than one library returned");
+ assert(Result->front().size() == LookupSymbols.size() &&
+ "Result has incorrect number of elements");
+
+ SymbolMap NewSymbols;
+ auto ResultI = Result->front().begin();
+ for (auto &KV : LookupSymbols) {
+ if (ResultI->getAddress())
+ NewSymbols[KV.first] = *ResultI;
+ ++ResultI;
+ }
+
+ // If there were no resolved symbols bail out.
+ if (NewSymbols.empty())
+ return LS.continueLookup(Error::success());
+
+ // Define resolved symbols.
+ Error Err = AddAbsoluteSymbols
+ ? AddAbsoluteSymbols(JD, std::move(NewSymbols))
+ : JD.define(absoluteSymbols(std::move(NewSymbols)));
+
+ LS.continueLookup(std::move(Err));
+ });
- // Define resolved symbols.
- if (AddAbsoluteSymbols)
- return AddAbsoluteSymbols(JD, std::move(NewSymbols));
- return JD.define(absoluteSymbols(std::move(NewSymbols)));
+ return Error::success();
}
} // end namespace orc
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
index da185c80c6c7..6a7cab4a5510 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
@@ -81,28 +81,38 @@ Expected<tpctypes::DylibHandle> EPCGenericDylibManager::open(StringRef Path,
return H;
}
-Expected<std::vector<ExecutorSymbolDef>>
-EPCGenericDylibManager::lookup(tpctypes::DylibHandle H,
- const SymbolLookupSet &Lookup) {
- Expected<std::vector<ExecutorSymbolDef>> Result(
- (std::vector<ExecutorSymbolDef>()));
- if (auto Err =
- EPC.callSPSWrapper<rt::SPSSimpleExecutorDylibManagerLookupSignature>(
- SAs.Lookup, Result, SAs.Instance, H, Lookup))
- return std::move(Err);
- return Result;
+void EPCGenericDylibManager::lookupAsync(tpctypes::DylibHandle H,
+ const SymbolLookupSet &Lookup,
+ SymbolLookupCompleteFn Complete) {
+ EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorDylibManagerLookupSignature>(
+ SAs.Lookup,
+ [Complete = std::move(Complete)](
+ Error SerializationErr,
+ Expected<std::vector<ExecutorSymbolDef>> Result) mutable {
+ if (SerializationErr) {
+ cantFail(Result.takeError());
+ Complete(std::move(SerializationErr));
+ }
+ Complete(std::move(Result));
+ },
+ SAs.Instance, H, Lookup);
}
-Expected<std::vector<ExecutorSymbolDef>>
-EPCGenericDylibManager::lookup(tpctypes::DylibHandle H,
- const RemoteSymbolLookupSet &Lookup) {
- Expected<std::vector<ExecutorSymbolDef>> Result(
- (std::vector<ExecutorSymbolDef>()));
- if (auto Err =
- EPC.callSPSWrapper<rt::SPSSimpleExecutorDylibManagerLookupSignature>(
- SAs.Lookup, Result, SAs.Instance, H, Lookup))
- return std::move(Err);
- return Result;
+void EPCGenericDylibManager::lookupAsync(tpctypes::DylibHandle H,
+ const RemoteSymbolLookupSet &Lookup,
+ SymbolLookupCompleteFn Complete) {
+ EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorDylibManagerLookupSignature>(
+ SAs.Lookup,
+ [Complete = std::move(Complete)](
+ Error SerializationErr,
+ Expected<std::vector<ExecutorSymbolDef>> Result) mutable {
+ if (SerializationErr) {
+ cantFail(Result.takeError());
+ Complete(std::move(SerializationErr));
+ }
+ Complete(std::move(Result));
+ },
+ SAs.Instance, H, Lookup);
}
} // end namespace orc
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index f0c551cd7780..efafca949e61 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -89,8 +89,9 @@ SelfExecutorProcessControl::loadDylib(const char *DylibPath) {
return ExecutorAddr::fromPtr(Dylib.getOSSpecificHandle());
}
-Expected<std::vector<tpctypes::LookupResult>>
-SelfExecutorProcessControl::lookupSymbols(ArrayRef<LookupRequest> Request) {
+void SelfExecutorProcessControl::lookupSymbolsAsync(
+ ArrayRef<LookupRequest> Request,
+ ExecutorProcessControl::SymbolLookupCompleteFn Complete) {
std::vector<tpctypes::LookupResult> R;
for (auto &Elem : Request) {
@@ -105,7 +106,8 @@ SelfExecutorProcessControl::lookupSymbols(ArrayRef<LookupRequest> Request) {
// FIXME: Collect all failing symbols before erroring out.
SymbolNameVector MissingSymbols;
MissingSymbols.push_back(Sym);
- return make_error<SymbolsNotFound>(SSP, std::move(MissingSymbols));
+ return Complete(
+ make_error<SymbolsNotFound>(SSP, std::move(MissingSymbols)));
}
// FIXME: determine accurate JITSymbolFlags.
R.back().push_back(
@@ -113,7 +115,7 @@ SelfExecutorProcessControl::lookupSymbols(ArrayRef<LookupRequest> Request) {
}
}
- return R;
+ Complete(std::move(R));
}
Expected<int32_t>
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
index 3d3ca891d881..a81019cb1dab 100644
--- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
@@ -29,20 +29,37 @@ SimpleRemoteEPC::loadDylib(const char *DylibPath) {
return DylibMgr->open(DylibPath, 0);
}
-Expected<std::vector<tpctypes::LookupResult>>
-SimpleRemoteEPC::lookupSymbols(ArrayRef<LookupRequest> Request) {
- std::vector<tpctypes::LookupResult> Result;
-
- for (auto &Element : Request) {
- if (auto R = DylibMgr->lookup(Element.Handle, Element.Symbols)) {
- Result.push_back({});
- Result.back().reserve(R->size());
- for (auto Addr : *R)
- Result.back().push_back(Addr);
- } else
- return R.takeError();
- }
- return std::move(Result);
+/// Async helper to chain together calls to DylibMgr::lookupAsync to fulfill
+/// all the requests.
+/// FIXME: The dylib manager should support multiple LookupRequests natively.
+static void
+lookupSymbolsAsyncHelper(EPCGenericDylibManager &DylibMgr,
+ ArrayRef<SimpleRemoteEPC::LookupRequest> Request,
+ std::vector<tpctypes::LookupResult> Result,
+ SimpleRemoteEPC::SymbolLookupCompleteFn Complete) {
+ if (Request.empty())
+ return Complete(std::move(Result));
+
+ auto &Element = Request.front();
+ DylibMgr.lookupAsync(Element.Handle, Element.Symbols,
+ [&DylibMgr, Request, Complete = std::move(Complete),
+ Result = std::move(Result)](auto R) mutable {
+ if (!R)
+ return Complete(R.takeError());
+ Result.push_back({});
+ Result.back().reserve(R->size());
+ for (auto Addr : *R)
+ Result.back().push_back(Addr);
+
+ lookupSymbolsAsyncHelper(
+ DylibMgr, Request.drop_front(), std::move(Result),
+ std::move(Complete));
+ });
+}
+
+void SimpleRemoteEPC::lookupSymbolsAsync(ArrayRef<LookupRequest> Request,
+ SymbolLookupCompleteFn Complete) {
+ lookupSymbolsAsyncHelper(*DylibMgr, Request, {}, std::move(Complete));
}
Expected<int32_t> SimpleRemoteEPC::runAsMain(ExecutorAddr MainFnAddr,
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index c6ef332403fd..0ae720e8b7ce 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -1406,6 +1406,99 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
}
}
+static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) {
+ if (&APF.getSemantics() == &APFloat::IEEEsingle() ||
+ &APF.getSemantics() == &APFloat::IEEEdouble()) {
+ // We would like to output the FP constant value in exponential notation,
+ // but we cannot do this if doing so will lose precision. Check here to
+ // make sure that we only output it in exponential format if we can parse
+ // the value back and get the same value.
+ //
+ bool ignored;
+ bool isDouble = &APF.getSemantics() == &APFloat::IEEEdouble();
+ bool isInf = APF.isInfinity();
+ bool isNaN = APF.isNaN();
+
+ if (!isInf && !isNaN) {
+ double Val = APF.convertToDouble();
+ SmallString<128> StrVal;
+ APF.toString(StrVal, 6, 0, false);
+ // Check to make sure that the stringized number is not some string like
+ // "Inf" or NaN, that atof will accept, but the lexer will not. Check
+ // that the string matches the "[-+]?[0-9]" regex.
+ //
+ assert((isDigit(StrVal[0]) ||
+ ((StrVal[0] == '-' || StrVal[0] == '+') && isDigit(StrVal[1]))) &&
+ "[-+]?[0-9] regex does not match!");
+ // Reparse stringized version!
+ if (APFloat(APFloat::IEEEdouble(), StrVal).convertToDouble() == Val) {
+ Out << StrVal;
+ return;
+ }
+ }
+
+ // Otherwise we could not reparse it to exactly the same value, so we must
+ // output the string in hexadecimal format! Note that loading and storing
+ // floating point types changes the bits of NaNs on some hosts, notably
+ // x86, so we must not use these types.
+ static_assert(sizeof(double) == sizeof(uint64_t),
+ "assuming that double is 64 bits!");
+ APFloat apf = APF;
+
+ // Floats are represented in ASCII IR as double, convert.
+ // FIXME: We should allow 32-bit hex float and remove this.
+ if (!isDouble) {
+ // A signaling NaN is quieted on conversion, so we need to recreate the
+ // expected value after convert (quiet bit of the payload is clear).
+ bool IsSNAN = apf.isSignaling();
+ apf.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &ignored);
+ if (IsSNAN) {
+ APInt Payload = apf.bitcastToAPInt();
+ apf =
+ APFloat::getSNaN(APFloat::IEEEdouble(), apf.isNegative(), &Payload);
+ }
+ }
+
+ Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true);
+ return;
+ }
+
+ // Either half, bfloat or some form of long double.
+ // These appear as a magic letter identifying the type, then a
+ // fixed number of hex digits.
+ Out << "0x";
+ APInt API = APF.bitcastToAPInt();
+ if (&APF.getSemantics() == &APFloat::x87DoubleExtended()) {
+ Out << 'K';
+ Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
+ /*Upper=*/true);
+ Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+ /*Upper=*/true);
+ } else if (&APF.getSemantics() == &APFloat::IEEEquad()) {
+ Out << 'L';
+ Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+ /*Upper=*/true);
+ Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
+ /*Upper=*/true);
+ } else if (&APF.getSemantics() == &APFloat::PPCDoubleDouble()) {
+ Out << 'M';
+ Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+ /*Upper=*/true);
+ Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
+ /*Upper=*/true);
+ } else if (&APF.getSemantics() == &APFloat::IEEEhalf()) {
+ Out << 'H';
+ Out << format_hex_no_prefix(API.getZExtValue(), 4,
+ /*Upper=*/true);
+ } else if (&APF.getSemantics() == &APFloat::BFloat()) {
+ Out << 'R';
+ Out << format_hex_no_prefix(API.getZExtValue(), 4,
+ /*Upper=*/true);
+ } else
+ llvm_unreachable("Unsupported floating point type");
+}
+
static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
AsmWriterContext &WriterCtx) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
@@ -1418,94 +1511,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
- const APFloat &APF = CFP->getValueAPF();
- if (&APF.getSemantics() == &APFloat::IEEEsingle() ||
- &APF.getSemantics() == &APFloat::IEEEdouble()) {
- // We would like to output the FP constant value in exponential notation,
- // but we cannot do this if doing so will lose precision. Check here to
- // make sure that we only output it in exponential format if we can parse
- // the value back and get the same value.
- //
- bool ignored;
- bool isDouble = &APF.getSemantics() == &APFloat::IEEEdouble();
- bool isInf = APF.isInfinity();
- bool isNaN = APF.isNaN();
- if (!isInf && !isNaN) {
- double Val = APF.convertToDouble();
- SmallString<128> StrVal;
- APF.toString(StrVal, 6, 0, false);
- // Check to make sure that the stringized number is not some string like
- // "Inf" or NaN, that atof will accept, but the lexer will not. Check
- // that the string matches the "[-+]?[0-9]" regex.
- //
- assert((isDigit(StrVal[0]) || ((StrVal[0] == '-' || StrVal[0] == '+') &&
- isDigit(StrVal[1]))) &&
- "[-+]?[0-9] regex does not match!");
- // Reparse stringized version!
- if (APFloat(APFloat::IEEEdouble(), StrVal).convertToDouble() == Val) {
- Out << StrVal;
- return;
- }
- }
- // Otherwise we could not reparse it to exactly the same value, so we must
- // output the string in hexadecimal format! Note that loading and storing
- // floating point types changes the bits of NaNs on some hosts, notably
- // x86, so we must not use these types.
- static_assert(sizeof(double) == sizeof(uint64_t),
- "assuming that double is 64 bits!");
- APFloat apf = APF;
- // Floats are represented in ASCII IR as double, convert.
- // FIXME: We should allow 32-bit hex float and remove this.
- if (!isDouble) {
- // A signaling NaN is quieted on conversion, so we need to recreate the
- // expected value after convert (quiet bit of the payload is clear).
- bool IsSNAN = apf.isSignaling();
- apf.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
- &ignored);
- if (IsSNAN) {
- APInt Payload = apf.bitcastToAPInt();
- apf = APFloat::getSNaN(APFloat::IEEEdouble(), apf.isNegative(),
- &Payload);
- }
- }
- Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true);
- return;
- }
-
- // Either half, bfloat or some form of long double.
- // These appear as a magic letter identifying the type, then a
- // fixed number of hex digits.
- Out << "0x";
- APInt API = APF.bitcastToAPInt();
- if (&APF.getSemantics() == &APFloat::x87DoubleExtended()) {
- Out << 'K';
- Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
- /*Upper=*/true);
- Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
- /*Upper=*/true);
- return;
- } else if (&APF.getSemantics() == &APFloat::IEEEquad()) {
- Out << 'L';
- Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
- /*Upper=*/true);
- Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
- /*Upper=*/true);
- } else if (&APF.getSemantics() == &APFloat::PPCDoubleDouble()) {
- Out << 'M';
- Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
- /*Upper=*/true);
- Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
- /*Upper=*/true);
- } else if (&APF.getSemantics() == &APFloat::IEEEhalf()) {
- Out << 'H';
- Out << format_hex_no_prefix(API.getZExtValue(), 4,
- /*Upper=*/true);
- } else if (&APF.getSemantics() == &APFloat::BFloat()) {
- Out << 'R';
- Out << format_hex_no_prefix(API.getZExtValue(), 4,
- /*Upper=*/true);
- } else
- llvm_unreachable("Unsupported floating point type");
+ WriteAPFloatInternal(Out, CFP->getValueAPF());
return;
}
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index bb55f48df4b3..bf02eba9fb44 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -34,7 +34,7 @@ cl::opt<bool>
UseNewDbgInfoFormat("experimental-debuginfo-iterators",
cl::desc("Enable communicating debuginfo positions "
"through iterators, eliminating intrinsics"),
- cl::init(false));
+ cl::init(true));
DPMarker *BasicBlock::createMarker(Instruction *I) {
assert(IsNewDbgInfoFormat &&
@@ -677,7 +677,9 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) {
BasicBlock::iterator BasicBlock::erase(BasicBlock::iterator FromIt,
BasicBlock::iterator ToIt) {
- return InstList.erase(FromIt, ToIt);
+ for (Instruction &I : make_early_inc_range(make_range(FromIt, ToIt)))
+ I.eraseFromParent();
+ return ToIt;
}
void BasicBlock::replacePhiUsesWith(BasicBlock *Old, BasicBlock *New) {
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index cbb64b299e64..3394a1ec8dc4 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -1467,7 +1467,22 @@ ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const {
if (isSingleElement() && getSingleElement()->isAllOnes())
return Other.binaryNot();
- return fromKnownBits(toKnownBits() ^ Other.toKnownBits(), /*IsSigned*/false);
+ KnownBits LHSKnown = toKnownBits();
+ KnownBits RHSKnown = Other.toKnownBits();
+ KnownBits Known = LHSKnown ^ RHSKnown;
+ ConstantRange CR = fromKnownBits(Known, /*IsSigned*/ false);
+ // Typically the following code doesn't improve the result if BW = 1.
+ if (getBitWidth() == 1)
+ return CR;
+
+ // If LHS is known to be the subset of RHS, treat LHS ^ RHS as RHS -nuw/nsw
+ // LHS. If RHS is known to be the subset of LHS, treat LHS ^ RHS as LHS
+ // -nuw/nsw RHS.
+ if ((~LHSKnown.Zero).isSubsetOf(RHSKnown.One))
+ CR = CR.intersectWith(Other.sub(*this), PreferredRangeType::Unsigned);
+ else if ((~RHSKnown.Zero).isSubsetOf(LHSKnown.One))
+ CR = CR.intersectWith(this->sub(Other), PreferredRangeType::Unsigned);
+ return CR;
}
ConstantRange
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index fb30fbce0ba2..d6d159ab8b9e 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -1805,6 +1805,14 @@ LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB) {
return wrap(BlockAddress::get(unwrap<Function>(F), unwrap(BB)));
}
+LLVMValueRef LLVMGetBlockAddressFunction(LLVMValueRef BlockAddr) {
+ return wrap(unwrap<BlockAddress>(BlockAddr)->getFunction());
+}
+
+LLVMBasicBlockRef LLVMGetBlockAddressBasicBlock(LLVMValueRef BlockAddr) {
+ return wrap(unwrap<BlockAddress>(BlockAddr)->getBasicBlock());
+}
+
/*--.. Operations on global variables, functions, and aliases (globals) ....--*/
LLVMModuleRef LLVMGetGlobalParent(LLVMValueRef Global) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index d8c1b0d534f6..eaa5cb36a49c 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2214,10 +2214,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
if (F.hasFnAttribute(Attribute::OptimizeNone))
return /*Changed*/ false;
- // FIXME: https://github.com/llvm/llvm-project/issues/76545
- if (F.hasFnAttribute(Attribute::SanitizeHWAddress))
- return /*Changed*/ false;
-
bool Changed = false;
auto *DL = &F.getParent()->getDataLayout();
// Collect a map of {backing storage : dbg.declares} (currently "backing
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 22e2455462bf..fceffbc3cea6 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -44,6 +44,7 @@
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/IntrinsicsS390.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
#include "llvm/IR/IntrinsicsVE.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/IntrinsicsX86.h"
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 16a89534b4b3..caf8fe654a36 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -108,6 +108,15 @@ bool GEPOperator::accumulateConstantOffset(
bool GEPOperator::accumulateConstantOffset(
Type *SourceType, ArrayRef<const Value *> Index, const DataLayout &DL,
APInt &Offset, function_ref<bool(Value &, APInt &)> ExternalAnalysis) {
+ // Fast path for canonical getelementptr i8 form.
+ if (SourceType->isIntegerTy(8) && !ExternalAnalysis) {
+ if (auto *CI = dyn_cast<ConstantInt>(Index.front())) {
+ Offset += CI->getValue().sextOrTrunc(Offset.getBitWidth());
+ return true;
+ }
+ return false;
+ }
+
bool UsedExternalAnalysis = false;
auto AccumulateOffset = [&](APInt Index, uint64_t Size) -> bool {
Index = Index.sextOrTrunc(Offset.getBitWidth());
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index ccc4276e36da..7b3a7590dfa7 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -243,19 +243,23 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
if (!Conf.SampleProfile.empty())
PGOOpt = PGOOptions(Conf.SampleProfile, "", Conf.ProfileRemapping,
/*MemoryProfile=*/"", FS, PGOOptions::SampleUse,
- PGOOptions::NoCSAction, true);
+ PGOOptions::NoCSAction,
+ PGOOptions::ColdFuncOpt::Default, true);
else if (Conf.RunCSIRInstr) {
PGOOpt = PGOOptions("", Conf.CSIRProfile, Conf.ProfileRemapping,
/*MemoryProfile=*/"", FS, PGOOptions::IRUse,
- PGOOptions::CSIRInstr, Conf.AddFSDiscriminator);
+ PGOOptions::CSIRInstr, PGOOptions::ColdFuncOpt::Default,
+ Conf.AddFSDiscriminator);
} else if (!Conf.CSIRProfile.empty()) {
PGOOpt = PGOOptions(Conf.CSIRProfile, "", Conf.ProfileRemapping,
/*MemoryProfile=*/"", FS, PGOOptions::IRUse,
- PGOOptions::CSIRUse, Conf.AddFSDiscriminator);
+ PGOOptions::CSIRUse, PGOOptions::ColdFuncOpt::Default,
+ Conf.AddFSDiscriminator);
NoPGOWarnMismatch = !Conf.PGOWarnMismatch;
} else if (Conf.AddFSDiscriminator) {
PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr,
- PGOOptions::NoAction, PGOOptions::NoCSAction, true);
+ PGOOptions::NoAction, PGOOptions::NoCSAction,
+ PGOOptions::ColdFuncOpt::Default, true);
}
TM->setPGOOption(PGOOpt);
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 36f799446a04..1b3a58298ec0 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -182,7 +182,9 @@ static std::unique_ptr<Writer> createWriter(const CommonConfig &Config,
case FileFormat::Binary:
return std::make_unique<BinaryWriter>(Obj, Out, Config);
case FileFormat::IHex:
- return std::make_unique<IHexWriter>(Obj, Out);
+ return std::make_unique<IHexWriter>(Obj, Out, Config.OutputFilename);
+ case FileFormat::SREC:
+ return std::make_unique<SRECWriter>(Obj, Out, Config.OutputFilename);
default:
return createELFWriter(Config, Obj, Out, OutputElfType);
}
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index c8b66d6fcb5e..b9b91670b998 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -2707,10 +2707,52 @@ Error BinaryWriter::finalize() {
return Error::success();
}
-bool IHexWriter::SectionCompare::operator()(const SectionBase *Lhs,
- const SectionBase *Rhs) const {
- return (sectionPhysicalAddr(Lhs) & 0xFFFFFFFFU) <
- (sectionPhysicalAddr(Rhs) & 0xFFFFFFFFU);
+Error ASCIIHexWriter::checkSection(const SectionBase &S) const {
+ if (addressOverflows32bit(S.Addr) ||
+ addressOverflows32bit(S.Addr + S.Size - 1))
+ return createStringError(
+ errc::invalid_argument,
+ "section '%s' address range [0x%llx, 0x%llx] is not 32 bit",
+ S.Name.c_str(), S.Addr, S.Addr + S.Size - 1);
+ return Error::success();
+}
+
+Error ASCIIHexWriter::finalize() {
+ // We can't write 64-bit addresses.
+ if (addressOverflows32bit(Obj.Entry))
+ return createStringError(errc::invalid_argument,
+ "entry point address 0x%llx overflows 32 bits",
+ Obj.Entry);
+
+ for (const SectionBase &S : Obj.sections()) {
+ if ((S.Flags & ELF::SHF_ALLOC) && S.Type != ELF::SHT_NOBITS && S.Size > 0) {
+ if (Error E = checkSection(S))
+ return E;
+ Sections.push_back(&S);
+ }
+ }
+
+ llvm::sort(Sections, [](const SectionBase *A, const SectionBase *B) {
+ return sectionPhysicalAddr(A) < sectionPhysicalAddr(B);
+ });
+
+ std::unique_ptr<WritableMemoryBuffer> EmptyBuffer =
+ WritableMemoryBuffer::getNewMemBuffer(0);
+ if (!EmptyBuffer)
+ return createStringError(errc::not_enough_memory,
+ "failed to allocate memory buffer of 0 bytes");
+
+ Expected<size_t> ExpTotalSize = getTotalSize(*EmptyBuffer);
+ if (!ExpTotalSize)
+ return ExpTotalSize.takeError();
+ TotalSize = *ExpTotalSize;
+
+ Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
+ if (!Buf)
+ return createStringError(errc::not_enough_memory,
+ "failed to allocate memory buffer of 0x" +
+ Twine::utohexstr(TotalSize) + " bytes");
+ return Error::success();
}
uint64_t IHexWriter::writeEntryPointRecord(uint8_t *Buf) {
@@ -2740,6 +2782,20 @@ uint64_t IHexWriter::writeEndOfFileRecord(uint8_t *Buf) {
return HexData.size();
}
+Expected<size_t>
+IHexWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const {
+ IHexSectionWriterBase LengthCalc(EmptyBuffer);
+ for (const SectionBase *Sec : Sections)
+ if (Error Err = Sec->accept(LengthCalc))
+ return std::move(Err);
+
+ // We need space to write section records + StartAddress record
+ // (if start address is not zero) + EndOfFile record.
+ return LengthCalc.getBufferOffset() +
+ (Obj.Entry ? IHexRecord::getLineLength(4) : 0) +
+ IHexRecord::getLineLength(0);
+}
+
Error IHexWriter::write() {
IHexSectionWriter Writer(*Buf);
// Write sections.
@@ -2762,54 +2818,196 @@ Error IHexWriter::write() {
return Error::success();
}
-Error IHexWriter::checkSection(const SectionBase &Sec) {
- uint64_t Addr = sectionPhysicalAddr(&Sec);
- if (addressOverflows32bit(Addr) || addressOverflows32bit(Addr + Sec.Size - 1))
- return createStringError(
- errc::invalid_argument,
- "Section '%s' address range [0x%llx, 0x%llx] is not 32 bit",
- Sec.Name.c_str(), Addr, Addr + Sec.Size - 1);
+Error SRECSectionWriterBase::visit(const StringTableSection &Sec) {
+ // Check that the sizer has already done its work.
+ assert(Sec.Size == Sec.StrTabBuilder.getSize() &&
+ "Expected section size to have been finalized");
+ // We don't need to write anything here because the real writer has already
+ // done it.
return Error::success();
}
-Error IHexWriter::finalize() {
- // We can't write 64-bit addresses.
- if (addressOverflows32bit(Obj.Entry))
- return createStringError(errc::invalid_argument,
- "Entry point address 0x%llx overflows 32 bits",
- Obj.Entry);
+Error SRECSectionWriterBase::visit(const Section &Sec) {
+ writeSection(Sec, Sec.Contents);
+ return Error::success();
+}
- for (const SectionBase &Sec : Obj.sections())
- if ((Sec.Flags & ELF::SHF_ALLOC) && Sec.Type != ELF::SHT_NOBITS &&
- Sec.Size > 0) {
- if (Error E = checkSection(Sec))
- return E;
- Sections.insert(&Sec);
- }
+Error SRECSectionWriterBase::visit(const OwnedDataSection &Sec) {
+ writeSection(Sec, Sec.Data);
+ return Error::success();
+}
- std::unique_ptr<WritableMemoryBuffer> EmptyBuffer =
- WritableMemoryBuffer::getNewMemBuffer(0);
- if (!EmptyBuffer)
- return createStringError(errc::not_enough_memory,
- "failed to allocate memory buffer of 0 bytes");
+Error SRECSectionWriterBase::visit(const DynamicRelocationSection &Sec) {
+ writeSection(Sec, Sec.Contents);
+ return Error::success();
+}
+
+void SRECSectionWriter::writeRecord(SRecord &Record, uint64_t Off) {
+ SRecLineData Data = Record.toString();
+ memcpy(Out.getBufferStart() + Off, Data.data(), Data.size());
+}
+
+void SRECSectionWriterBase::writeRecords(uint32_t Entry) {
+ // The ELF header could contain an entry point outside of the sections we have
+ // seen that does not fit the current record Type.
+ Type = std::max(Type, SRecord::getType(Entry));
+ uint64_t Off = HeaderSize;
+ for (SRecord &Record : Records) {
+ Record.Type = Type;
+ writeRecord(Record, Off);
+ Off += Record.getSize();
+ }
+ Offset = Off;
+}
+
+void SRECSectionWriterBase::writeSection(const SectionBase &S,
+ ArrayRef<uint8_t> Data) {
+ const uint32_t ChunkSize = 16;
+ uint32_t Address = sectionPhysicalAddr(&S);
+ uint32_t EndAddr = Address + S.Size - 1;
+ Type = std::max(SRecord::getType(EndAddr), Type);
+ while (!Data.empty()) {
+ uint64_t DataSize = std::min<uint64_t>(Data.size(), ChunkSize);
+ SRecord Record{Type, Address, Data.take_front(DataSize)};
+ Records.push_back(Record);
+ Data = Data.drop_front(DataSize);
+ Address += DataSize;
+ }
+}
+
+Error SRECSectionWriter::visit(const StringTableSection &Sec) {
+ assert(Sec.Size == Sec.StrTabBuilder.getSize() &&
+ "Section size does not match the section's string table builder size");
+ std::vector<uint8_t> Data(Sec.Size);
+ Sec.StrTabBuilder.write(Data.data());
+ writeSection(Sec, Data);
+ return Error::success();
+}
+
+SRecLineData SRecord::toString() const {
+ SRecLineData Line(getSize());
+ auto *Iter = Line.begin();
+ *Iter++ = 'S';
+ *Iter++ = '0' + Type;
+ // Write 1 byte (2 hex characters) record count.
+ Iter = toHexStr(getCount(), Iter, 2);
+ // Write the address field with length depending on record type.
+ Iter = toHexStr(Address, Iter, getAddressSize());
+ // Write data byte by byte.
+ for (uint8_t X : Data)
+ Iter = toHexStr(X, Iter, 2);
+ // Write the 1 byte checksum.
+ Iter = toHexStr(getChecksum(), Iter, 2);
+ *Iter++ = '\r';
+ *Iter++ = '\n';
+ assert(Iter == Line.end());
+ return Line;
+}
+
+uint8_t SRecord::getChecksum() const {
+ uint32_t Sum = getCount();
+ Sum += (Address >> 24) & 0xFF;
+ Sum += (Address >> 16) & 0xFF;
+ Sum += (Address >> 8) & 0xFF;
+ Sum += Address & 0xFF;
+ for (uint8_t Byte : Data)
+ Sum += Byte;
+ return 0xFF - (Sum & 0xFF);
+}
+
+size_t SRecord::getSize() const {
+ // Type, Count, Checksum, and CRLF are two characters each.
+ return 2 + 2 + getAddressSize() + Data.size() * 2 + 2 + 2;
+}
+
+uint8_t SRecord::getAddressSize() const {
+ switch (Type) {
+ case Type::S2:
+ return 6;
+ case Type::S3:
+ return 8;
+ case Type::S7:
+ return 8;
+ case Type::S8:
+ return 6;
+ default:
+ return 4;
+ }
+}
+
+uint8_t SRecord::getCount() const {
+ uint8_t DataSize = Data.size();
+ uint8_t ChecksumSize = 1;
+ return getAddressSize() / 2 + DataSize + ChecksumSize;
+}
+
+uint8_t SRecord::getType(uint32_t Address) {
+ if (isUInt<16>(Address))
+ return SRecord::S1;
+ if (isUInt<24>(Address))
+ return SRecord::S2;
+ return SRecord::S3;
+}
- IHexSectionWriterBase LengthCalc(*EmptyBuffer);
+SRecord SRecord::getHeader(StringRef FileName) {
+ // Header is a record with Type S0, Address 0, and Data that is a
+ // vendor-specific text comment. For the comment we will use the output file
+ // name truncated to 40 characters to match the behavior of GNU objcopy.
+ StringRef HeaderContents = FileName.slice(0, 40);
+ ArrayRef<uint8_t> Data(
+ reinterpret_cast<const uint8_t *>(HeaderContents.data()),
+ HeaderContents.size());
+ return {SRecord::S0, 0, Data};
+}
+
+size_t SRECWriter::writeHeader(uint8_t *Buf) {
+ SRecLineData Record = SRecord::getHeader(OutputFileName).toString();
+ memcpy(Buf, Record.data(), Record.size());
+ return Record.size();
+}
+
+size_t SRECWriter::writeTerminator(uint8_t *Buf, uint8_t Type) {
+ assert(Type >= SRecord::S7 && Type <= SRecord::S9 &&
+ "Invalid record type for terminator");
+ uint32_t Entry = Obj.Entry;
+ SRecLineData Data = SRecord{Type, Entry, {}}.toString();
+ memcpy(Buf, Data.data(), Data.size());
+ return Data.size();
+}
+
+Expected<size_t>
+SRECWriter::getTotalSize(WritableMemoryBuffer &EmptyBuffer) const {
+ SRECSizeCalculator SizeCalc(EmptyBuffer, 0);
for (const SectionBase *Sec : Sections)
- if (Error Err = Sec->accept(LengthCalc))
- return Err;
+ if (Error Err = Sec->accept(SizeCalc))
+ return std::move(Err);
- // We need space to write section records + StartAddress record
- // (if start adress is not zero) + EndOfFile record.
- TotalSize = LengthCalc.getBufferOffset() +
- (Obj.Entry ? IHexRecord::getLineLength(4) : 0) +
- IHexRecord::getLineLength(0);
+ SizeCalc.writeRecords(Obj.Entry);
+ // We need to add the size of the Header and Terminator records.
+ SRecord Header = SRecord::getHeader(OutputFileName);
+ uint8_t TerminatorType = 10 - SizeCalc.getType();
+ SRecord Terminator = {TerminatorType, static_cast<uint32_t>(Obj.Entry), {}};
+ return Header.getSize() + SizeCalc.getBufferOffset() + Terminator.getSize();
+}
- Buf = WritableMemoryBuffer::getNewMemBuffer(TotalSize);
- if (!Buf)
- return createStringError(errc::not_enough_memory,
- "failed to allocate memory buffer of " +
- Twine::utohexstr(TotalSize) + " bytes");
+Error SRECWriter::write() {
+ uint32_t HeaderSize =
+ writeHeader(reinterpret_cast<uint8_t *>(Buf->getBufferStart()));
+ SRECSectionWriter Writer(*Buf, HeaderSize);
+ for (const SectionBase *S : Sections) {
+ if (Error E = S->accept(Writer))
+ return E;
+ }
+ Writer.writeRecords(Obj.Entry);
+ uint64_t Offset = Writer.getBufferOffset();
+ // An S1 record terminates with an S9 record, S2 with S8, and S3 with S7.
+ uint8_t TerminatorType = 10 - Writer.getType();
+ Offset += writeTerminator(
+ reinterpret_cast<uint8_t *>(Buf->getBufferStart() + Offset),
+ TerminatorType);
+ assert(Offset == TotalSize);
+ Out.write(Buf->getBufferStart(), Buf->getBufferSize());
return Error::success();
}
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
index 95bea0964eae..7a2e20d82d11 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -172,6 +172,9 @@ public:
friend class SectionWriter; \
friend class IHexSectionWriterBase; \
friend class IHexSectionWriter; \
+ friend class SRECSectionWriter; \
+ friend class SRECSectionWriterBase; \
+ friend class SRECSizeCalculator; \
template <class ELFT> friend class ELFSectionWriter; \
template <class ELFT> friend class ELFSectionSizer;
@@ -371,23 +374,136 @@ public:
: Writer(Obj, Out), GapFill(Config.GapFill), PadTo(Config.PadTo) {}
};
-class IHexWriter : public Writer {
- struct SectionCompare {
- bool operator()(const SectionBase *Lhs, const SectionBase *Rhs) const;
- };
+// A base class for writing ascii hex formats such as srec and ihex.
+class ASCIIHexWriter : public Writer {
+public:
+ ASCIIHexWriter(Object &Obj, raw_ostream &OS, StringRef OutputFile)
+ : Writer(Obj, OS), OutputFileName(OutputFile) {}
+ Error finalize() override;
- std::set<const SectionBase *, SectionCompare> Sections;
+protected:
+ StringRef OutputFileName;
size_t TotalSize = 0;
+ std::vector<const SectionBase *> Sections;
+
+ Error checkSection(const SectionBase &S) const;
+ virtual Expected<size_t>
+ getTotalSize(WritableMemoryBuffer &EmptyBuffer) const = 0;
+};
+
+class IHexWriter : public ASCIIHexWriter {
+public:
+ Error write() override;
+ IHexWriter(Object &Obj, raw_ostream &Out, StringRef OutputFile)
+ : ASCIIHexWriter(Obj, Out, OutputFile) {}
- Error checkSection(const SectionBase &Sec);
+private:
uint64_t writeEntryPointRecord(uint8_t *Buf);
uint64_t writeEndOfFileRecord(uint8_t *Buf);
+ Expected<size_t>
+ getTotalSize(WritableMemoryBuffer &EmptyBuffer) const override;
+};
+class SRECWriter : public ASCIIHexWriter {
public:
- ~IHexWriter() {}
- Error finalize() override;
+ SRECWriter(Object &Obj, raw_ostream &OS, StringRef OutputFile)
+ : ASCIIHexWriter(Obj, OS, OutputFile) {}
Error write() override;
- IHexWriter(Object &Obj, raw_ostream &Out) : Writer(Obj, Out) {}
+
+private:
+ size_t writeHeader(uint8_t *Buf);
+ size_t writeTerminator(uint8_t *Buf, uint8_t Type);
+ Expected<size_t>
+ getTotalSize(WritableMemoryBuffer &EmptyBuffer) const override;
+};
+
+using SRecLineData = SmallVector<char, 64>;
+struct SRecord {
+ uint8_t Type;
+ uint32_t Address;
+ ArrayRef<uint8_t> Data;
+ SRecLineData toString() const;
+ uint8_t getCount() const;
+ // Get address size in characters.
+ uint8_t getAddressSize() const;
+ uint8_t getChecksum() const;
+ size_t getSize() const;
+ static SRecord getHeader(StringRef FileName);
+ static uint8_t getType(uint32_t Address);
+
+ enum Type : uint8_t {
+ // Vendor specific text comment.
+ S0 = 0,
+ // Data that starts at a 16 bit address.
+ S1 = 1,
+ // Data that starts at a 24 bit address.
+ S2 = 2,
+ // Data that starts at a 32 bit address.
+ S3 = 3,
+ // Reserved.
+ S4 = 4,
+ // 16 bit count of S1/S2/S3 records (optional).
+ S5 = 5,
+ // 32 bit count of S1/S2/S3 records (optional).
+ S6 = 6,
+ // Terminates a series of S3 records.
+ S7 = 7,
+ // Terminates a series of S2 records.
+ S8 = 8,
+ // Terminates a series of S1 records.
+ S9 = 9
+ };
+};
+
+class SRECSectionWriterBase : public BinarySectionWriter {
+public:
+ explicit SRECSectionWriterBase(WritableMemoryBuffer &Buf,
+ uint64_t StartOffset)
+ : BinarySectionWriter(Buf), Offset(StartOffset), HeaderSize(StartOffset) {
+ }
+
+ using BinarySectionWriter::visit;
+
+ void writeRecords(uint32_t Entry);
+ uint64_t getBufferOffset() const { return Offset; }
+ Error visit(const Section &S) override;
+ Error visit(const OwnedDataSection &S) override;
+ Error visit(const StringTableSection &S) override;
+ Error visit(const DynamicRelocationSection &S) override;
+ uint8_t getType() const { return Type; };
+
+protected:
+ // Offset in the output buffer.
+ uint64_t Offset;
+ // Sections start after the header.
+ uint64_t HeaderSize;
+ // Type of records to write.
+ uint8_t Type = SRecord::S1;
+ std::vector<SRecord> Records;
+
+ void writeSection(const SectionBase &S, ArrayRef<uint8_t> Data);
+ virtual void writeRecord(SRecord &Record, uint64_t Off) = 0;
+};
+
+// An SRECSectionWriterBase that visits sections but does not write anything.
+// This class is only used to calculate the size of the output file.
+class SRECSizeCalculator : public SRECSectionWriterBase {
+public:
+ SRECSizeCalculator(WritableMemoryBuffer &EmptyBuffer, uint64_t Offset)
+ : SRECSectionWriterBase(EmptyBuffer, Offset) {}
+
+protected:
+ void writeRecord(SRecord &Record, uint64_t Off) override {}
+};
+
+class SRECSectionWriter : public SRECSectionWriterBase {
+public:
+ SRECSectionWriter(WritableMemoryBuffer &Buf, uint64_t Offset)
+ : SRECSectionWriterBase(Buf, Offset) {}
+ Error visit(const StringTableSection &Sec) override;
+
+protected:
+ void writeRecord(SRecord &Record, uint64_t Off) override;
};
class SectionBase {
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index d7d26f4f4180..f6f6cf2a1602 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -71,6 +71,12 @@ StringRef COFFImportFile::getExportName() const {
name = ltrim1(name, "?@_");
name = name.substr(0, name.find('@'));
break;
+ case IMPORT_NAME_EXPORTAS: {
+ // Skip DLL name
+ name = Data.getBuffer().substr(sizeof(*hdr) + name.size() + 1);
+ name = name.split('\0').second.split('\0').first;
+ break;
+ }
default:
break;
}
@@ -209,6 +215,7 @@ public:
// Library Format.
NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal,
ImportType Type, ImportNameType NameType,
+ StringRef ExportName,
MachineTypes Machine);
// Create a weak external file which is described in PE/COFF Aux Format 3.
@@ -500,12 +507,13 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) {
return {MemoryBufferRef{F, ImportName}};
}
-NewArchiveMember ObjectFactory::createShortImport(StringRef Sym,
- uint16_t Ordinal,
- ImportType ImportType,
- ImportNameType NameType,
- MachineTypes Machine) {
+NewArchiveMember
+ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal,
+ ImportType ImportType, ImportNameType NameType,
+ StringRef ExportName, MachineTypes Machine) {
size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs
+ if (!ExportName.empty())
+ ImpSize += ExportName.size() + 1;
size_t Size = sizeof(coff_import_header) + ImpSize;
char *Buf = Alloc.Allocate<char>(Size);
memset(Buf, 0, Size);
@@ -525,6 +533,10 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym,
memcpy(P, Sym.data(), Sym.size());
P += Sym.size() + 1;
memcpy(P, ImportName.data(), ImportName.size());
+ if (!ExportName.empty()) {
+ P += ImportName.size() + 1;
+ memcpy(P, ExportName.data(), ExportName.size());
+ }
return {MemoryBufferRef(StringRef(Buf, Size), ImportName)};
}
@@ -613,7 +625,8 @@ NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
Error writeImportLibrary(StringRef ImportName, StringRef Path,
ArrayRef<COFFShortExport> Exports,
- MachineTypes Machine, bool MinGW) {
+ MachineTypes Machine, bool MinGW,
+ ArrayRef<COFFShortExport> NativeExports) {
MachineTypes NativeMachine =
isArm64EC(Machine) ? IMAGE_FILE_MACHINE_ARM64 : Machine;
@@ -630,39 +643,73 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
std::vector<uint8_t> NullThunk;
Members.push_back(OF.createNullThunk(NullThunk));
- for (const COFFShortExport &E : Exports) {
- if (E.Private)
- continue;
-
- ImportType ImportType = IMPORT_CODE;
- if (E.Data)
- ImportType = IMPORT_DATA;
- if (E.Constant)
- ImportType = IMPORT_CONST;
-
- StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName;
- ImportNameType NameType = E.Noname
- ? IMPORT_ORDINAL
- : getNameType(SymbolName, E.Name,
- Machine, MinGW);
- Expected<std::string> Name = E.ExtName.empty()
- ? std::string(SymbolName)
- : replace(SymbolName, E.Name, E.ExtName);
-
- if (!Name)
- return Name.takeError();
-
- if (!E.AliasTarget.empty() && *Name != E.AliasTarget) {
- Members.push_back(
- OF.createWeakExternal(E.AliasTarget, *Name, false, Machine));
- Members.push_back(
- OF.createWeakExternal(E.AliasTarget, *Name, true, Machine));
- continue;
+ auto addExports = [&](ArrayRef<COFFShortExport> Exp,
+ MachineTypes M) -> Error {
+ for (const COFFShortExport &E : Exp) {
+ if (E.Private)
+ continue;
+
+ ImportType ImportType = IMPORT_CODE;
+ if (E.Data)
+ ImportType = IMPORT_DATA;
+ if (E.Constant)
+ ImportType = IMPORT_CONST;
+
+ StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName;
+ std::string Name;
+
+ if (E.ExtName.empty()) {
+ Name = std::string(SymbolName);
+ } else {
+ Expected<std::string> ReplacedName =
+ replace(SymbolName, E.Name, E.ExtName);
+ if (!ReplacedName)
+ return ReplacedName.takeError();
+ Name.swap(*ReplacedName);
+ }
+
+ if (!E.AliasTarget.empty() && Name != E.AliasTarget) {
+ Members.push_back(OF.createWeakExternal(E.AliasTarget, Name, false, M));
+ Members.push_back(OF.createWeakExternal(E.AliasTarget, Name, true, M));
+ continue;
+ }
+
+ ImportNameType NameType;
+ std::string ExportName;
+ if (E.Noname) {
+ NameType = IMPORT_ORDINAL;
+ } else if (!E.ExportAs.empty()) {
+ NameType = IMPORT_NAME_EXPORTAS;
+ ExportName = E.ExportAs;
+ } else {
+ NameType = getNameType(SymbolName, E.Name, M, MinGW);
+ }
+
+ // On ARM64EC, use EXPORTAS to import demangled name for mangled symbols.
+ if (ImportType == IMPORT_CODE && isArm64EC(M)) {
+ if (std::optional<std::string> MangledName =
+ getArm64ECMangledFunctionName(Name)) {
+ if (ExportName.empty()) {
+ NameType = IMPORT_NAME_EXPORTAS;
+ ExportName.swap(Name);
+ }
+ Name = std::move(*MangledName);
+ } else if (ExportName.empty()) {
+ NameType = IMPORT_NAME_EXPORTAS;
+ ExportName = std::move(*getArm64ECDemangledFunctionName(Name));
+ }
+ }
+
+ Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType,
+ NameType, ExportName, M));
}
+ return Error::success();
+ };
- Members.push_back(
- OF.createShortImport(*Name, E.Ordinal, ImportType, NameType, Machine));
- }
+ if (Error e = addExports(Exports, Machine))
+ return e;
+ if (Error e = addExports(NativeExports, NativeMachine))
+ return e;
return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab,
MinGW ? object::Archive::K_GNU : object::Archive::K_COFF,
diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp
index 35e6ab837b2a..549348ac9226 100644
--- a/llvm/lib/Object/COFFModuleDefinition.cpp
+++ b/llvm/lib/Object/COFFModuleDefinition.cpp
@@ -39,6 +39,7 @@ enum Kind {
KwConstant,
KwData,
KwExports,
+ KwExportAs,
KwHeapsize,
KwLibrary,
KwName,
@@ -116,6 +117,7 @@ public:
.Case("CONSTANT", KwConstant)
.Case("DATA", KwData)
.Case("EXPORTS", KwExports)
+ .Case("EXPORTAS", KwExportAs)
.Case("HEAPSIZE", KwHeapsize)
.Case("LIBRARY", KwLibrary)
.Case("NAME", KwName)
@@ -284,7 +286,16 @@ private:
E.AliasTarget = std::string("_").append(E.AliasTarget);
continue;
}
- unget();
+ // EXPORTAS must be at the end of export definition
+ if (Tok.K == KwExportAs) {
+ read();
+ if (Tok.K == Eof)
+ return createError(
+ "unexpected end of file, EXPORTAS identifier expected");
+ E.ExportAs = std::string(Tok.Value);
+ } else {
+ unget();
+ }
Info.Exports.push_back(E);
return Error::success();
}
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 38a9e0e99b6b..01949c6dad81 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -514,6 +514,16 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
return "gfx1200";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201:
return "gfx1201";
+
+ // Generic AMDGCN targets
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:
+ return "gfx9-generic";
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC:
+ return "gfx10.1-generic";
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC:
+ return "gfx10.3-generic";
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC:
+ return "gfx11-generic";
default:
llvm_unreachable("Unknown EF_AMDGPU_MACH value");
}
diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp
index 22d604b125c5..4ab6536dc90b 100644
--- a/llvm/lib/Object/OffloadBinary.cpp
+++ b/llvm/lib/Object/OffloadBinary.cpp
@@ -355,6 +355,10 @@ bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS,
if (LHS.first != RHS.first)
return false;
+ // If the architecture is "all" we assume it is always compatible.
+ if (LHS.second.equals("generic") || RHS.second.equals("generic"))
+ return true;
+
// Only The AMDGPU target requires additional checks.
llvm::Triple T(LHS.first);
if (!T.isAMDGPU())
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index 8c1bbe9c2645..04e2b809c014 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -508,10 +508,17 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
llvm::DenseSet<uint64_t> SeenGlobals;
llvm::DenseSet<uint64_t> SeenSegments;
+ // If there is symbol info from the export section, this info will supersede
+ // it, but not info from a linking section
+ if (!HasLinkingSection) {
+ Symbols.clear();
+ }
+
while (Ctx.Ptr < Ctx.End) {
uint8_t Type = readUint8(Ctx);
uint32_t Size = readVaruint32(Ctx);
const uint8_t *SubSectionEnd = Ctx.Ptr + Size;
+
switch (Type) {
case wasm::WASM_NAMES_FUNCTION:
case wasm::WASM_NAMES_GLOBAL:
@@ -521,6 +528,16 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
uint32_t Index = readVaruint32(Ctx);
StringRef Name = readString(Ctx);
wasm::NameType nameType = wasm::NameType::FUNCTION;
+ wasm::WasmSymbolInfo Info{Name,
+ /*Kind */ wasm::WASM_SYMBOL_TYPE_FUNCTION,
+ /* Flags */ 0,
+ /* ImportModule */ std::nullopt,
+ /* ImportName */ std::nullopt,
+ /* ExportName */ std::nullopt,
+ {/* ElementIndex */ Index}};
+ const wasm::WasmSignature *Signature = nullptr;
+ const wasm::WasmGlobalType *GlobalType = nullptr;
+ const wasm::WasmTableType *TableType = nullptr;
if (Type == wasm::WASM_NAMES_FUNCTION) {
if (!SeenFunctions.insert(Index).second)
return make_error<GenericBinaryError>(
@@ -529,26 +546,50 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
return make_error<GenericBinaryError>("invalid function name entry",
object_error::parse_failed);
- if (isDefinedFunctionIndex(Index))
- getDefinedFunction(Index).DebugName = Name;
+ if (isDefinedFunctionIndex(Index)) {
+ wasm::WasmFunction &F = getDefinedFunction(Index);
+ F.DebugName = Name;
+ Signature = &Signatures[F.SigIndex];
+ if (F.ExportName) {
+ Info.ExportName = F.ExportName;
+ Info.Flags |= wasm::WASM_SYMBOL_BINDING_GLOBAL;
+ } else {
+ Info.Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL;
+ }
+ } else {
+ Info.Flags |= wasm::WASM_SYMBOL_UNDEFINED;
+ }
} else if (Type == wasm::WASM_NAMES_GLOBAL) {
- nameType = wasm::NameType::GLOBAL;
if (!SeenGlobals.insert(Index).second)
return make_error<GenericBinaryError>("global named more than once",
object_error::parse_failed);
if (!isValidGlobalIndex(Index) || Name.empty())
return make_error<GenericBinaryError>("invalid global name entry",
object_error::parse_failed);
+ nameType = wasm::NameType::GLOBAL;
+ Info.Kind = wasm::WASM_SYMBOL_TYPE_GLOBAL;
+ if (isDefinedGlobalIndex(Index)) {
+ GlobalType = &getDefinedGlobal(Index).Type;
+ } else {
+ Info.Flags |= wasm::WASM_SYMBOL_UNDEFINED;
+ }
} else {
- nameType = wasm::NameType::DATA_SEGMENT;
if (!SeenSegments.insert(Index).second)
return make_error<GenericBinaryError>(
"segment named more than once", object_error::parse_failed);
if (Index > DataSegments.size())
return make_error<GenericBinaryError>("invalid data segment name entry",
object_error::parse_failed);
+ nameType = wasm::NameType::DATA_SEGMENT;
+ Info.Kind = wasm::WASM_SYMBOL_TYPE_DATA;
+ Info.Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL;
+ assert(Index < DataSegments.size());
+ Info.DataRef = wasm::WasmDataReference{
+ Index, 0, DataSegments[Index].Data.Content.size()};
}
DebugNames.push_back(wasm::WasmDebugName{nameType, Index, Name});
+ if (!HasLinkingSection)
+ Symbols.emplace_back(Info, GlobalType, TableType, Signature);
}
break;
}
@@ -993,6 +1034,13 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
if (Reloc.Offset < PreviousOffset)
return make_error<GenericBinaryError>("relocations not in offset order",
object_error::parse_failed);
+
+ auto badReloc = [&](StringRef msg) {
+ return make_error<GenericBinaryError>(
+ msg + ": " + Twine(Symbols[Reloc.Index].Info.Name),
+ object_error::parse_failed);
+ };
+
PreviousOffset = Reloc.Offset;
Reloc.Index = readVaruint32(Ctx);
switch (type) {
@@ -1005,18 +1053,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
case wasm::R_WASM_TABLE_INDEX_REL_SLEB:
case wasm::R_WASM_TABLE_INDEX_REL_SLEB64:
if (!isValidFunctionSymbol(Reloc.Index))
- return make_error<GenericBinaryError>(
- "invalid relocation function index", object_error::parse_failed);
+ return badReloc("invalid function relocation");
break;
case wasm::R_WASM_TABLE_NUMBER_LEB:
if (!isValidTableSymbol(Reloc.Index))
- return make_error<GenericBinaryError>("invalid relocation table index",
- object_error::parse_failed);
+ return badReloc("invalid table relocation");
break;
case wasm::R_WASM_TYPE_INDEX_LEB:
if (Reloc.Index >= Signatures.size())
- return make_error<GenericBinaryError>("invalid relocation type index",
- object_error::parse_failed);
+ return badReloc("invalid relocation type index");
break;
case wasm::R_WASM_GLOBAL_INDEX_LEB:
// R_WASM_GLOBAL_INDEX_LEB are can be used against function and data
@@ -1024,18 +1069,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
if (!isValidGlobalSymbol(Reloc.Index) &&
!isValidDataSymbol(Reloc.Index) &&
!isValidFunctionSymbol(Reloc.Index))
- return make_error<GenericBinaryError>("invalid relocation global index",
- object_error::parse_failed);
+ return badReloc("invalid global relocation");
break;
case wasm::R_WASM_GLOBAL_INDEX_I32:
if (!isValidGlobalSymbol(Reloc.Index))
- return make_error<GenericBinaryError>("invalid relocation global index",
- object_error::parse_failed);
+ return badReloc("invalid global relocation");
break;
case wasm::R_WASM_TAG_INDEX_LEB:
if (!isValidTagSymbol(Reloc.Index))
- return make_error<GenericBinaryError>("invalid relocation tag index",
- object_error::parse_failed);
+ return badReloc("invalid tag relocation");
break;
case wasm::R_WASM_MEMORY_ADDR_LEB:
case wasm::R_WASM_MEMORY_ADDR_SLEB:
@@ -1044,8 +1086,7 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
case wasm::R_WASM_MEMORY_ADDR_TLS_SLEB:
case wasm::R_WASM_MEMORY_ADDR_LOCREL_I32:
if (!isValidDataSymbol(Reloc.Index))
- return make_error<GenericBinaryError>("invalid relocation data index",
- object_error::parse_failed);
+ return badReloc("invalid data relocation");
Reloc.Addend = readVarint32(Ctx);
break;
case wasm::R_WASM_MEMORY_ADDR_LEB64:
@@ -1054,26 +1095,22 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64:
case wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64:
if (!isValidDataSymbol(Reloc.Index))
- return make_error<GenericBinaryError>("invalid relocation data index",
- object_error::parse_failed);
+ return badReloc("invalid data relocation");
Reloc.Addend = readVarint64(Ctx);
break;
case wasm::R_WASM_FUNCTION_OFFSET_I32:
if (!isValidFunctionSymbol(Reloc.Index))
- return make_error<GenericBinaryError>(
- "invalid relocation function index", object_error::parse_failed);
+ return badReloc("invalid function relocation");
Reloc.Addend = readVarint32(Ctx);
break;
case wasm::R_WASM_FUNCTION_OFFSET_I64:
if (!isValidFunctionSymbol(Reloc.Index))
- return make_error<GenericBinaryError>(
- "invalid relocation function index", object_error::parse_failed);
+ return badReloc("invalid function relocation");
Reloc.Addend = readVarint64(Ctx);
break;
case wasm::R_WASM_SECTION_OFFSET_I32:
if (!isValidSectionSymbol(Reloc.Index))
- return make_error<GenericBinaryError>(
- "invalid relocation section index", object_error::parse_failed);
+ return badReloc("invalid section relocation");
Reloc.Addend = readVarint32(Ctx);
break;
default:
@@ -1895,6 +1932,20 @@ uint32_t WasmObjectFile::getSymbolSectionIdImpl(const WasmSymbol &Sym) const {
}
}
+uint32_t WasmObjectFile::getSymbolSize(SymbolRef Symb) const {
+ const WasmSymbol &Sym = getWasmSymbol(Symb);
+ if (!Sym.isDefined())
+ return 0;
+ if (Sym.isTypeData())
+ return Sym.Info.DataRef.Size;
+ if (Sym.isTypeFunction())
+ return functions()[Sym.Info.ElementIndex - getNumImportedFunctions()].Size;
+ // Currently symbol size is only tracked for data segments and functions. In
+ // principle we could also track size (e.g. binary size) for tables, globals
+ // and element segments etc too.
+ return 0;
+}
+
void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; }
Expected<StringRef> WasmObjectFile::getSectionName(DataRefImpl Sec) const {
diff --git a/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
index 99689786a13c..f4ca1b22eafa 100644
--- a/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
+++ b/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
@@ -259,6 +259,7 @@ void ScalarEnumerationTraits<CallingConvention>::enumeration(
IO.enumCase(Value, "ClrCall", CallingConvention::ClrCall);
IO.enumCase(Value, "Inline", CallingConvention::Inline);
IO.enumCase(Value, "NearVector", CallingConvention::NearVector);
+ IO.enumCase(Value, "Swift", CallingConvention::Swift);
}
void ScalarEnumerationTraits<PointerKind>::enumeration(IO &IO,
diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
index a26e93f65ed7..39195bf63f2f 100644
--- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
@@ -691,6 +691,194 @@ Error DWARFYAML::emitDebugStrOffsets(raw_ostream &OS, const Data &DI) {
return Error::success();
}
+namespace {
+/// Emits the header for a DebugNames section.
+void emitDebugNamesHeader(raw_ostream &OS, bool IsLittleEndian,
+ uint32_t NameCount, uint32_t AbbrevSize,
+ uint32_t CombinedSizeOtherParts) {
+ // Use the same AugmentationString as AsmPrinter.
+ StringRef AugmentationString = "LLVM0700";
+ size_t TotalSize = CombinedSizeOtherParts + 5 * sizeof(uint32_t) +
+ 2 * sizeof(uint16_t) + sizeof(NameCount) +
+ sizeof(AbbrevSize) + AugmentationString.size();
+ writeInteger(uint32_t(TotalSize), OS, IsLittleEndian); // Unit length
+
+ // Everything below is included in total size.
+ writeInteger(uint16_t(5), OS, IsLittleEndian); // Version
+ writeInteger(uint16_t(0), OS, IsLittleEndian); // Padding
+ writeInteger(uint32_t(1), OS, IsLittleEndian); // Compilation Unit count
+ writeInteger(uint32_t(0), OS, IsLittleEndian); // Local Type Unit count
+ writeInteger(uint32_t(0), OS, IsLittleEndian); // Foreign Type Unit count
+ writeInteger(uint32_t(0), OS, IsLittleEndian); // Bucket count
+ writeInteger(NameCount, OS, IsLittleEndian);
+ writeInteger(AbbrevSize, OS, IsLittleEndian);
+ writeInteger(uint32_t(AugmentationString.size()), OS, IsLittleEndian);
+ OS.write(AugmentationString.data(), AugmentationString.size());
+ return;
+}
+
+/// Emits the abbreviations for a DebugNames section.
+std::string
+emitDebugNamesAbbrev(ArrayRef<DWARFYAML::DebugNameAbbreviation> Abbrevs) {
+ std::string Data;
+ raw_string_ostream OS(Data);
+ for (const DWARFYAML::DebugNameAbbreviation &Abbrev : Abbrevs) {
+ encodeULEB128(Abbrev.Code, OS);
+ encodeULEB128(Abbrev.Tag, OS);
+ for (auto [Idx, Form] : Abbrev.Indices) {
+ encodeULEB128(Idx, OS);
+ encodeULEB128(Form, OS);
+ }
+ encodeULEB128(0, OS);
+ encodeULEB128(0, OS);
+ }
+ encodeULEB128(0, OS);
+ return Data;
+}
+
+/// Emits a simple CU offsets list for a DebugNames section containing a single
+/// CU at offset 0.
+std::string emitDebugNamesCUOffsets(bool IsLittleEndian) {
+ std::string Data;
+ raw_string_ostream OS(Data);
+ writeInteger(uint32_t(0), OS, IsLittleEndian);
+ return Data;
+}
+
+/// Emits the "NameTable" for a DebugNames section; according to the spec, it
+/// consists of two arrays: an array of string offsets, followed immediately by
+/// an array of entry offsets. The string offsets are emitted in the order
+/// provided in `Entries`.
+std::string emitDebugNamesNameTable(
+ bool IsLittleEndian,
+ const DenseMap<uint32_t, std::vector<DWARFYAML::DebugNameEntry>> &Entries,
+ ArrayRef<uint32_t> EntryPoolOffsets) {
+ assert(Entries.size() == EntryPoolOffsets.size());
+
+ std::string Data;
+ raw_string_ostream OS(Data);
+
+ for (uint32_t Strp : make_first_range(Entries))
+ writeInteger(Strp, OS, IsLittleEndian);
+ for (uint32_t PoolOffset : EntryPoolOffsets)
+ writeInteger(PoolOffset, OS, IsLittleEndian);
+ return Data;
+}
+
+/// Groups entries based on their name (strp) code and returns a map.
+DenseMap<uint32_t, std::vector<DWARFYAML::DebugNameEntry>>
+groupEntries(ArrayRef<DWARFYAML::DebugNameEntry> Entries) {
+ DenseMap<uint32_t, std::vector<DWARFYAML::DebugNameEntry>> StrpToEntries;
+ for (const DWARFYAML::DebugNameEntry &Entry : Entries)
+ StrpToEntries[Entry.NameStrp].push_back(Entry);
+ return StrpToEntries;
+}
+
+/// Finds the abbreviation whose code is AbbrevCode and returns a list
+/// containing the expected size of all non-zero-length forms.
+Expected<SmallVector<uint8_t>>
+getNonZeroDataSizesFor(uint32_t AbbrevCode,
+ ArrayRef<DWARFYAML::DebugNameAbbreviation> Abbrevs) {
+ const auto *AbbrevIt = find_if(Abbrevs, [&](const auto &Abbrev) {
+ return Abbrev.Code.value == AbbrevCode;
+ });
+ if (AbbrevIt == Abbrevs.end())
+ return createStringError(inconvertibleErrorCode(),
+ "did not find an Abbreviation for this code");
+
+ SmallVector<uint8_t> DataSizes;
+ dwarf::FormParams Params{/*Version=*/5, /*AddrSize=*/4, dwarf::DWARF32};
+ for (auto [Idx, Form] : AbbrevIt->Indices) {
+ std::optional<uint8_t> FormSize = dwarf::getFixedFormByteSize(Form, Params);
+ if (!FormSize)
+ return createStringError(inconvertibleErrorCode(),
+ "unsupported Form for YAML debug_names emitter");
+ if (FormSize == 0)
+ continue;
+ DataSizes.push_back(*FormSize);
+ }
+ return DataSizes;
+}
+
+struct PoolOffsetsAndData {
+ std::string PoolData;
+ std::vector<uint32_t> PoolOffsets;
+};
+
+/// Emits the entry pool and returns an array of offsets containing the start
+/// offset for the entries of each unique name.
+/// Verifies that the provided number of data values match those expected by
+/// the abbreviation table.
+Expected<PoolOffsetsAndData> emitDebugNamesEntryPool(
+ bool IsLittleEndian,
+ const DenseMap<uint32_t, std::vector<DWARFYAML::DebugNameEntry>>
+ &StrpToEntries,
+ ArrayRef<DWARFYAML::DebugNameAbbreviation> Abbrevs) {
+ PoolOffsetsAndData Result;
+ raw_string_ostream OS(Result.PoolData);
+
+ for (ArrayRef<DWARFYAML::DebugNameEntry> EntriesWithSameName :
+ make_second_range(StrpToEntries)) {
+ Result.PoolOffsets.push_back(Result.PoolData.size());
+
+ for (const DWARFYAML::DebugNameEntry &Entry : EntriesWithSameName) {
+ encodeULEB128(Entry.Code, OS);
+
+ Expected<SmallVector<uint8_t>> DataSizes =
+ getNonZeroDataSizesFor(Entry.Code, Abbrevs);
+ if (!DataSizes)
+ return DataSizes.takeError();
+ if (DataSizes->size() != Entry.Values.size())
+ return createStringError(
+ inconvertibleErrorCode(),
+ "mismatch between provided and required number of values");
+
+ for (auto [Value, ValueSize] : zip_equal(Entry.Values, *DataSizes))
+ if (Error E =
+ writeVariableSizedInteger(Value, ValueSize, OS, IsLittleEndian))
+ return std::move(E);
+ }
+ encodeULEB128(0, OS);
+ }
+
+ return Result;
+}
+} // namespace
+
+Error DWARFYAML::emitDebugNames(raw_ostream &OS, const Data &DI) {
+ assert(DI.DebugNames && "unexpected emitDebugNames() call");
+ const DebugNamesSection DebugNames = DI.DebugNames.value();
+
+ DenseMap<uint32_t, std::vector<DebugNameEntry>> StrpToEntries =
+ groupEntries(DebugNames.Entries);
+
+ // Emit all sub-sections into individual strings so that we may compute
+ // relative offsets and sizes.
+ Expected<PoolOffsetsAndData> PoolInfo = emitDebugNamesEntryPool(
+ DI.IsLittleEndian, StrpToEntries, DebugNames.Abbrevs);
+ if (!PoolInfo)
+ return PoolInfo.takeError();
+ std::string NamesTableData = emitDebugNamesNameTable(
+ DI.IsLittleEndian, StrpToEntries, PoolInfo->PoolOffsets);
+
+ std::string AbbrevData = emitDebugNamesAbbrev(DebugNames.Abbrevs);
+ std::string CUOffsetsData = emitDebugNamesCUOffsets(DI.IsLittleEndian);
+
+ size_t TotalSize = PoolInfo->PoolData.size() + NamesTableData.size() +
+ AbbrevData.size() + CUOffsetsData.size();
+
+ // Start real emission by combining all individual strings.
+ emitDebugNamesHeader(OS, DI.IsLittleEndian, StrpToEntries.size(),
+ AbbrevData.size(), TotalSize);
+ OS.write(CUOffsetsData.data(), CUOffsetsData.size());
+ // No local TUs, no foreign TUs, no hash lookups table.
+ OS.write(NamesTableData.data(), NamesTableData.size());
+ OS.write(AbbrevData.data(), AbbrevData.size());
+ OS.write(PoolInfo->PoolData.data(), PoolInfo->PoolData.size());
+
+ return Error::success();
+}
+
static Error checkOperandCount(StringRef EncodingString,
ArrayRef<yaml::Hex64> Values,
uint64_t ExpectedOperands) {
@@ -1024,6 +1212,7 @@ DWARFYAML::getDWARFEmitterByName(StringRef SecName) {
.Case("debug_rnglists", DWARFYAML::emitDebugRnglists)
.Case("debug_str", DWARFYAML::emitDebugStr)
.Case("debug_str_offsets", DWARFYAML::emitDebugStrOffsets)
+ .Case("debug_names", DWARFYAML::emitDebugNames)
.Default([&](raw_ostream &, const DWARFYAML::Data &) {
return createStringError(errc::not_supported,
SecName + " is not supported");
diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp
index 2bddeed46413..5207671f57d9 100644
--- a/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -52,6 +52,8 @@ SetVector<StringRef> DWARFYAML::Data::getNonEmptySectionNames() const {
SecNames.insert("debug_rnglists");
if (DebugLoclists)
SecNames.insert("debug_loclists");
+ if (DebugNames)
+ SecNames.insert("debug_names");
return SecNames;
}
@@ -105,6 +107,7 @@ void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) {
IO.mapOptional("debug_str_offsets", DWARF.DebugStrOffsets);
IO.mapOptional("debug_rnglists", DWARF.DebugRnglists);
IO.mapOptional("debug_loclists", DWARF.DebugLoclists);
+ IO.mapOptional("debug_names", DWARF.DebugNames);
IO.setContext(OldContext);
}
@@ -122,6 +125,32 @@ void MappingTraits<DWARFYAML::Abbrev>::mapping(IO &IO,
IO.mapOptional("Attributes", Abbrev.Attributes);
}
+void MappingTraits<DWARFYAML::IdxForm>::mapping(IO &IO,
+ DWARFYAML::IdxForm &IdxForm) {
+ IO.mapRequired("Idx", IdxForm.Idx);
+ IO.mapRequired("Form", IdxForm.Form);
+}
+
+void MappingTraits<DWARFYAML::DebugNameAbbreviation>::mapping(
+ IO &IO, DWARFYAML::DebugNameAbbreviation &DebugNameAbbreviation) {
+ IO.mapRequired("Code", DebugNameAbbreviation.Code);
+ IO.mapRequired("Tag", DebugNameAbbreviation.Tag);
+ IO.mapRequired("Indices", DebugNameAbbreviation.Indices);
+}
+
+void MappingTraits<DWARFYAML::DebugNameEntry>::mapping(
+ IO &IO, DWARFYAML::DebugNameEntry &DebugNameEntry) {
+ IO.mapRequired("Name", DebugNameEntry.NameStrp);
+ IO.mapRequired("Code", DebugNameEntry.Code);
+ IO.mapOptional("Values", DebugNameEntry.Values);
+}
+
+void MappingTraits<DWARFYAML::DebugNamesSection>::mapping(
+ IO &IO, DWARFYAML::DebugNamesSection &DebugNames) {
+ IO.mapRequired("Abbreviations", DebugNames.Abbrevs);
+ IO.mapRequired("Entries", DebugNames.Entries);
+}
+
void MappingTraits<DWARFYAML::AttributeAbbrev>::mapping(
IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev) {
IO.mapRequired("Attribute", AttAbbrev.Attribute);
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 1436e920c011..de1ef2458152 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -612,6 +612,10 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1151, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, EF_AMDGPU_MACH);
switch (Object->Header.ABIVersion) {
default:
// ELFOSABI_AMDGPU_PAL, ELFOSABI_AMDGPU_MESA3D support *_V3 flags.
diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
index ccf768c06aeb..f68c571da9fa 100644
--- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::object;
namespace {
@@ -40,30 +41,32 @@ public:
bool writeXCOFF();
private:
+ void reportOverwrite(uint64_t currentOffset, uint64_t specifiedOffset,
+ const Twine &fieldName);
bool nameShouldBeInStringTable(StringRef SymbolName);
bool initFileHeader(uint64_t CurrentOffset);
void initAuxFileHeader();
- bool initSectionHeader(uint64_t &CurrentOffset);
+ bool initSectionHeaders(uint64_t &CurrentOffset);
bool initRelocations(uint64_t &CurrentOffset);
bool initStringTable();
bool assignAddressesAndIndices();
void writeFileHeader();
void writeAuxFileHeader();
- void writeSectionHeader();
+ void writeSectionHeaders();
bool writeSectionData();
bool writeRelocations();
bool writeSymbols();
void writeStringTable();
- void writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym);
- void writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym);
- void writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym);
- void writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym);
- void writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym);
- void writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym);
- void writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym);
- void writeAuxSymbol(const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym);
+ bool writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym);
+ bool writeAuxSymbol(const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym);
XCOFFYAML::Object &Obj;
bool Is64Bit = false;
@@ -90,6 +93,14 @@ static void writeName(StringRef StrName, support::endian::Writer W) {
W.write(NameRef);
}
+void XCOFFWriter::reportOverwrite(uint64_t CurrentOffset,
+ uint64_t specifiedOffset,
+ const Twine &fieldName) {
+ ErrHandler("current file offset (" + Twine(CurrentOffset) +
+ ") is bigger than the specified " + fieldName + " (" +
+ Twine(specifiedOffset) + ") ");
+}
+
bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) {
// For XCOFF64: The symbol name is always in the string table.
return (SymbolName.size() > XCOFF::NameSize) || Is64Bit;
@@ -98,14 +109,31 @@ bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) {
bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) {
for (XCOFFYAML::Section &InitSection : InitSections) {
if (!InitSection.Relocations.empty()) {
- InitSection.NumberOfRelocations = InitSection.Relocations.size();
- InitSection.FileOffsetToRelocations = CurrentOffset;
uint64_t RelSize = Is64Bit ? XCOFF::RelocationSerializationSize64
: XCOFF::RelocationSerializationSize32;
- CurrentOffset += InitSection.NumberOfRelocations * RelSize;
+ uint64_t UsedSize = RelSize * InitSection.Relocations.size();
+
+ // If NumberOfRelocations was specified, we use it, even if it's
+ // not consistent with the number of provided relocations.
+ if (!InitSection.NumberOfRelocations)
+ InitSection.NumberOfRelocations = InitSection.Relocations.size();
+
+ // If the YAML file specified an offset to relocations, we use it.
+ if (InitSection.FileOffsetToRelocations) {
+ if (CurrentOffset > InitSection.FileOffsetToRelocations) {
+ reportOverwrite(CurrentOffset, InitSection.FileOffsetToRelocations,
+ "FileOffsetToRelocations for the " +
+ InitSection.SectionName + " section");
+ return false;
+ }
+ CurrentOffset = InitSection.FileOffsetToRelocations;
+ } else
+ InitSection.FileOffsetToRelocations = CurrentOffset;
+ CurrentOffset += UsedSize;
if (CurrentOffset > MaxRawDataSize) {
- ErrHandler("maximum object size of" + Twine(MaxRawDataSize) +
- "exceeded when writing relocation data");
+ ErrHandler("maximum object size (" + Twine(MaxRawDataSize) +
+ ") exceeded when writing relocation data for section " +
+ Twine(InitSection.SectionName));
return false;
}
}
@@ -113,15 +141,10 @@ bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) {
return true;
}
-bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) {
- uint64_t CurrentSecAddr = 0;
+bool XCOFFWriter::initSectionHeaders(uint64_t &CurrentOffset) {
+ uint64_t CurrentEndDataAddr = 0;
+ uint64_t CurrentEndTDataAddr = 0;
for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) {
- if (CurrentOffset > MaxRawDataSize) {
- ErrHandler("maximum object size of" + Twine(MaxRawDataSize) +
- "exceeded when writing section data");
- return false;
- }
-
// Assign indices for sections.
if (InitSections[I].SectionName.size() &&
!SectionIndexMap[InitSections[I].SectionName]) {
@@ -134,23 +157,58 @@ bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) {
}
}
- // Calculate the physical/virtual address. This field should contain 0 for
- // all sections except the text, data and bss sections.
- if (InitSections[I].Flags != XCOFF::STYP_TEXT &&
- InitSections[I].Flags != XCOFF::STYP_DATA &&
- InitSections[I].Flags != XCOFF::STYP_BSS)
- InitSections[I].Address = 0;
- else
- InitSections[I].Address = CurrentSecAddr;
+ if (!InitSections[I].Size)
+ InitSections[I].Size = InitSections[I].SectionData.binary_size();
+
+ // Section data addresses (physical/virtual) are related to symbol
+ // addresses and alignments. Furthermore, it is possible to specify the
+ // same starting addresses for the .text, .data, and .tdata sections.
+ // Without examining all the symbols and their addresses and alignments,
+ // it is not possible to compute valid section addresses. The only
+ // condition required by XCOFF is that the .bss section immediately
+ // follows the .data section, and the .tbss section immediately follows
+ // the .tdata section. Therefore, we only assign addresses to the .bss
+ // and .tbss sections if they do not already have non-zero addresses.
+ // (If the YAML file is being used to generate a valid object file, we
+ // expect all section addresses to be specified explicitly.)
+ switch (InitSections[I].Flags) {
+ case XCOFF::STYP_DATA:
+ CurrentEndDataAddr = InitSections[I].Address + InitSections[I].Size;
+ break;
+ case XCOFF::STYP_BSS:
+ if (!InitSections[I].Address)
+ InitSections[I].Address = CurrentEndDataAddr;
+ break;
+ case XCOFF::STYP_TDATA:
+ CurrentEndTDataAddr = InitSections[I].Address + InitSections[I].Size;
+ break;
+ case XCOFF::STYP_TBSS:
+ if (!InitSections[I].Address)
+ InitSections[I].Address = CurrentEndTDataAddr;
+ break;
+ }
- // Calculate the FileOffsetToData and data size for sections.
if (InitSections[I].SectionData.binary_size()) {
- InitSections[I].FileOffsetToData = CurrentOffset;
+ if (InitSections[I].FileOffsetToData) {
+ // Use the provided FileOffsetToData.
+ if (CurrentOffset > InitSections[I].FileOffsetToData) {
+ reportOverwrite(CurrentOffset, InitSections[I].FileOffsetToData,
+ "FileOffsetToData for the " +
+ InitSections[I].SectionName + " section");
+ return false;
+ }
+ CurrentOffset = InitSections[I].FileOffsetToData;
+ } else {
+ CurrentOffset = alignTo(CurrentOffset, DefaultSectionAlign);
+ InitSections[I].FileOffsetToData = CurrentOffset;
+ }
CurrentOffset += InitSections[I].SectionData.binary_size();
- // Ensure the offset is aligned to DefaultSectionAlign.
- CurrentOffset = alignTo(CurrentOffset, DefaultSectionAlign);
- InitSections[I].Size = CurrentOffset - InitSections[I].FileOffsetToData;
- CurrentSecAddr += InitSections[I].Size;
+ if (CurrentOffset > MaxRawDataSize) {
+ ErrHandler("maximum object size (" + Twine(MaxRawDataSize) +
+ ") exceeded when writing data for section " + Twine(I + 1) +
+ " (" + Twine(InitSections[I].SectionName) + ")");
+ return false;
+ }
}
}
return initRelocations(CurrentOffset);
@@ -181,7 +239,7 @@ bool XCOFFWriter::initStringTable() {
StrTblBuilder.clear();
if (Obj.StrTbl.Strings) {
- // All specified strings should be added to the string table.
+ // Add all specified strings to the string table.
for (StringRef StringEnt : *Obj.StrTbl.Strings)
StrTblBuilder.add(StringEnt);
@@ -254,12 +312,20 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) {
// Calculate SymbolTableOffset for the file header.
if (InitFileHdr.NumberOfSymTableEntries) {
+ if (Obj.Header.SymbolTableOffset) {
+ if (CurrentOffset > Obj.Header.SymbolTableOffset) {
+ reportOverwrite(CurrentOffset, Obj.Header.SymbolTableOffset,
+ "SymbolTableOffset");
+ return false;
+ }
+ CurrentOffset = Obj.Header.SymbolTableOffset;
+ }
InitFileHdr.SymbolTableOffset = CurrentOffset;
CurrentOffset +=
InitFileHdr.NumberOfSymTableEntries * XCOFF::SymbolTableEntrySize;
if (CurrentOffset > MaxRawDataSize) {
- ErrHandler("maximum object size of" + Twine(MaxRawDataSize) +
- "exceeded when writing symbols");
+ ErrHandler("maximum object size of " + Twine(MaxRawDataSize) +
+ " exceeded when writing symbols");
return false;
}
}
@@ -268,7 +334,8 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) {
}
void XCOFFWriter::initAuxFileHeader() {
- InitAuxFileHdr = *Obj.AuxHeader;
+ if (Obj.AuxHeader)
+ InitAuxFileHdr = *Obj.AuxHeader;
// In general, an object file might contain multiple sections of a given type,
// but in a loadable module, there must be exactly one .text, .data, .bss, and
// .loader section. A loadable object might also have one .tdata section and
@@ -322,28 +389,32 @@ void XCOFFWriter::initAuxFileHeader() {
bool XCOFFWriter::assignAddressesAndIndices() {
uint64_t FileHdrSize =
Is64Bit ? XCOFF::FileHeaderSize64 : XCOFF::FileHeaderSize32;
+
+ // If AuxHeaderSize is specified in the YAML file, we construct
+ // an auxiliary header.
uint64_t AuxFileHdrSize = 0;
- if (Obj.AuxHeader)
- AuxFileHdrSize = Obj.Header.AuxHeaderSize
- ? Obj.Header.AuxHeaderSize
- : (Is64Bit ? XCOFF::AuxFileHeaderSize64
- : XCOFF::AuxFileHeaderSize32);
+
+ if (Obj.Header.AuxHeaderSize)
+ AuxFileHdrSize = Obj.Header.AuxHeaderSize;
+ else if (Obj.AuxHeader)
+ AuxFileHdrSize =
+ (Is64Bit ? XCOFF::AuxFileHeaderSize64 : XCOFF::AuxFileHeaderSize32);
uint64_t SecHdrSize =
Is64Bit ? XCOFF::SectionHeaderSize64 : XCOFF::SectionHeaderSize32;
uint64_t CurrentOffset =
FileHdrSize + AuxFileHdrSize + InitSections.size() * SecHdrSize;
// Calculate section header info.
- if (!initSectionHeader(CurrentOffset))
+ if (!initSectionHeaders(CurrentOffset))
return false;
- InitFileHdr.AuxHeaderSize = AuxFileHdrSize;
// Calculate file header info.
if (!initFileHeader(CurrentOffset))
return false;
+ InitFileHdr.AuxHeaderSize = AuxFileHdrSize;
// Initialize the auxiliary file header.
- if (Obj.AuxHeader)
+ if (AuxFileHdrSize)
initAuxFileHeader();
// Initialize the string table.
@@ -356,18 +427,14 @@ void XCOFFWriter::writeFileHeader() {
: InitFileHdr.NumberOfSections);
W.write<int32_t>(Obj.Header.TimeStamp);
if (Is64Bit) {
- W.write<uint64_t>(Obj.Header.SymbolTableOffset
- ? Obj.Header.SymbolTableOffset
- : InitFileHdr.SymbolTableOffset);
+ W.write<uint64_t>(InitFileHdr.SymbolTableOffset);
W.write<uint16_t>(InitFileHdr.AuxHeaderSize);
W.write<uint16_t>(Obj.Header.Flags);
W.write<int32_t>(Obj.Header.NumberOfSymTableEntries
? Obj.Header.NumberOfSymTableEntries
: InitFileHdr.NumberOfSymTableEntries);
} else {
- W.write<uint32_t>(Obj.Header.SymbolTableOffset
- ? Obj.Header.SymbolTableOffset
- : InitFileHdr.SymbolTableOffset);
+ W.write<uint32_t>(InitFileHdr.SymbolTableOffset);
W.write<int32_t>(Obj.Header.NumberOfSymTableEntries
? Obj.Header.NumberOfSymTableEntries
: InitFileHdr.NumberOfSymTableEntries);
@@ -391,6 +458,9 @@ void XCOFFWriter::writeAuxFileHeader() {
W.write<uint32_t>(InitAuxFileHdr.EntryPointAddr.value_or(yaml::Hex64(0)));
W.write<uint32_t>(InitAuxFileHdr.TextStartAddr.value_or(yaml::Hex64(0)));
W.write<uint32_t>(InitAuxFileHdr.DataStartAddr.value_or(yaml::Hex64(0)));
+ // A short 32-bit auxiliary header ends here.
+ if (InitFileHdr.AuxHeaderSize == XCOFF::AuxFileHeaderSizeShort)
+ return;
W.write<uint32_t>(InitAuxFileHdr.TOCAnchorAddr.value_or(yaml::Hex64(0)));
}
W.write<uint16_t>(InitAuxFileHdr.SecNumOfEntryPoint.value_or(0));
@@ -433,50 +503,39 @@ void XCOFFWriter::writeAuxFileHeader() {
InitAuxFileHdr.Flag.value_or(yaml::Hex16(XCOFF::SHR_SYMTAB)));
if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize64)
W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize64);
- } else if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) {
- W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32);
+ } else {
+ if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32)
+ W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32);
}
}
-void XCOFFWriter::writeSectionHeader() {
+void XCOFFWriter::writeSectionHeaders() {
for (uint16_t I = 0, E = Obj.Sections.size(); I < E; ++I) {
- XCOFFYAML::Section YamlSec = Obj.Sections[I];
XCOFFYAML::Section DerivedSec = InitSections[I];
- writeName(YamlSec.SectionName, W);
- // Virtual address is the same as physical address.
- uint64_t SectionAddress =
- YamlSec.Address ? YamlSec.Address : DerivedSec.Address;
+ writeName(DerivedSec.SectionName, W);
if (Is64Bit) {
- W.write<uint64_t>(SectionAddress); // Physical address
- W.write<uint64_t>(SectionAddress); // Virtual address
- W.write<uint64_t>(YamlSec.Size ? YamlSec.Size : DerivedSec.Size);
- W.write<uint64_t>(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData
- : DerivedSec.FileOffsetToData);
- W.write<uint64_t>(YamlSec.FileOffsetToRelocations
- ? YamlSec.FileOffsetToRelocations
- : DerivedSec.FileOffsetToRelocations);
- W.write<uint64_t>(YamlSec.FileOffsetToLineNumbers);
- W.write<uint32_t>(YamlSec.NumberOfRelocations
- ? YamlSec.NumberOfRelocations
- : DerivedSec.NumberOfRelocations);
- W.write<uint32_t>(YamlSec.NumberOfLineNumbers);
- W.write<int32_t>(YamlSec.Flags);
+ // Virtual address is the same as physical address.
+ W.write<uint64_t>(DerivedSec.Address); // Physical address
+ W.write<uint64_t>(DerivedSec.Address); // Virtual address
+ W.write<uint64_t>(DerivedSec.Size);
+ W.write<uint64_t>(DerivedSec.FileOffsetToData);
+ W.write<uint64_t>(DerivedSec.FileOffsetToRelocations);
+ W.write<uint64_t>(DerivedSec.FileOffsetToLineNumbers);
+ W.write<uint32_t>(DerivedSec.NumberOfRelocations);
+ W.write<uint32_t>(DerivedSec.NumberOfLineNumbers);
+ W.write<int32_t>(DerivedSec.Flags);
W.OS.write_zeros(4);
} else {
- W.write<uint32_t>(SectionAddress); // Physical address
- W.write<uint32_t>(SectionAddress); // Virtual address
- W.write<uint32_t>(YamlSec.Size ? YamlSec.Size : DerivedSec.Size);
- W.write<uint32_t>(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData
- : DerivedSec.FileOffsetToData);
- W.write<uint32_t>(YamlSec.FileOffsetToRelocations
- ? YamlSec.FileOffsetToRelocations
- : DerivedSec.FileOffsetToRelocations);
- W.write<uint32_t>(YamlSec.FileOffsetToLineNumbers);
- W.write<uint16_t>(YamlSec.NumberOfRelocations
- ? YamlSec.NumberOfRelocations
- : DerivedSec.NumberOfRelocations);
- W.write<uint16_t>(YamlSec.NumberOfLineNumbers);
- W.write<int32_t>(YamlSec.Flags);
+ // Virtual address is the same as physical address.
+ W.write<uint32_t>(DerivedSec.Address); // Physical address
+ W.write<uint32_t>(DerivedSec.Address); // Virtual address
+ W.write<uint32_t>(DerivedSec.Size);
+ W.write<uint32_t>(DerivedSec.FileOffsetToData);
+ W.write<uint32_t>(DerivedSec.FileOffsetToRelocations);
+ W.write<uint32_t>(DerivedSec.FileOffsetToLineNumbers);
+ W.write<uint16_t>(DerivedSec.NumberOfRelocations);
+ W.write<uint16_t>(DerivedSec.NumberOfLineNumbers);
+ W.write<int32_t>(DerivedSec.Flags);
}
}
}
@@ -486,8 +545,8 @@ bool XCOFFWriter::writeSectionData() {
XCOFFYAML::Section YamlSec = Obj.Sections[I];
if (YamlSec.SectionData.binary_size()) {
// Fill the padding size with zeros.
- int64_t PaddingSize =
- InitSections[I].FileOffsetToData - (W.OS.tell() - StartOffset);
+ int64_t PaddingSize = (uint64_t)InitSections[I].FileOffsetToData -
+ (W.OS.tell() - StartOffset);
if (PaddingSize < 0) {
ErrHandler("redundant data was written before section data");
return false;
@@ -524,12 +583,44 @@ bool XCOFFWriter::writeRelocations() {
return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) {
+ uint8_t SymAlignAndType = 0;
+ if (AuxSym.SymbolAlignmentAndType) {
+ if (AuxSym.SymbolType || AuxSym.SymbolAlignment) {
+ ErrHandler("cannot specify SymbolType or SymbolAlignment if "
+ "SymbolAlignmentAndType is specified");
+ return false;
+ }
+ SymAlignAndType = *AuxSym.SymbolAlignmentAndType;
+ } else {
+ if (AuxSym.SymbolType) {
+ uint8_t SymbolType = *AuxSym.SymbolType;
+ if (SymbolType & ~XCOFFCsectAuxRef::SymbolTypeMask) {
+ ErrHandler("symbol type must be less than " +
+ Twine(1 + XCOFFCsectAuxRef::SymbolTypeMask));
+ return false;
+ }
+ SymAlignAndType = SymbolType;
+ }
+ if (AuxSym.SymbolAlignment) {
+ const uint8_t ShiftedSymbolAlignmentMask =
+ XCOFFCsectAuxRef::SymbolAlignmentMask >>
+ XCOFFCsectAuxRef::SymbolAlignmentBitOffset;
+
+ if (*AuxSym.SymbolAlignment & ~ShiftedSymbolAlignmentMask) {
+ ErrHandler("symbol alignment must be less than " +
+ Twine(1 + ShiftedSymbolAlignmentMask));
+ return false;
+ }
+ SymAlignAndType |= (*AuxSym.SymbolAlignment
+ << XCOFFCsectAuxRef::SymbolAlignmentBitOffset);
+ }
+ }
if (Is64Bit) {
W.write<uint32_t>(AuxSym.SectionOrLengthLo.value_or(0));
W.write<uint32_t>(AuxSym.ParameterHashIndex.value_or(0));
W.write<uint16_t>(AuxSym.TypeChkSectNum.value_or(0));
- W.write<uint8_t>(AuxSym.SymbolAlignmentAndType.value_or(0));
+ W.write<uint8_t>(SymAlignAndType);
W.write<uint8_t>(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR));
W.write<uint32_t>(AuxSym.SectionOrLengthHi.value_or(0));
W.write<uint8_t>(0);
@@ -538,23 +629,25 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) {
W.write<uint32_t>(AuxSym.SectionOrLength.value_or(0));
W.write<uint32_t>(AuxSym.ParameterHashIndex.value_or(0));
W.write<uint16_t>(AuxSym.TypeChkSectNum.value_or(0));
- W.write<uint8_t>(AuxSym.SymbolAlignmentAndType.value_or(0));
+ W.write<uint8_t>(SymAlignAndType);
W.write<uint8_t>(AuxSym.StorageMappingClass.value_or(XCOFF::XMC_PR));
W.write<uint32_t>(AuxSym.StabInfoIndex.value_or(0));
W.write<uint16_t>(AuxSym.StabSectNum.value_or(0));
}
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) {
assert(Is64Bit && "can't write the exception auxiliary symbol for XCOFF32");
W.write<uint64_t>(AuxSym.OffsetToExceptionTbl.value_or(0));
W.write<uint32_t>(AuxSym.SizeOfFunction.value_or(0));
W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.value_or(0));
W.write<uint8_t>(0);
W.write<uint8_t>(XCOFF::AUX_EXCEPT);
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) {
if (Is64Bit) {
W.write<uint64_t>(AuxSym.PtrToLineNum.value_or(0));
W.write<uint32_t>(AuxSym.SizeOfFunction.value_or(0));
@@ -568,9 +661,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) {
W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.value_or(0));
W.OS.write_zeros(2);
}
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) {
StringRef FileName = AuxSym.FileNameOrString.value_or("");
if (nameShouldBeInStringTable(FileName)) {
W.write<int32_t>(0);
@@ -586,9 +680,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) {
} else {
W.OS.write_zeros(3);
}
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) {
if (Is64Bit) {
W.write<uint32_t>(AuxSym.LineNum.value_or(0));
W.OS.write_zeros(13);
@@ -599,9 +694,10 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) {
W.write<uint16_t>(AuxSym.LineNumLo.value_or(0));
W.OS.write_zeros(12);
}
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) {
if (Is64Bit) {
W.write<uint64_t>(AuxSym.LengthOfSectionPortion.value_or(0));
W.write<uint64_t>(AuxSym.NumberOfRelocEnt.value_or(0));
@@ -613,39 +709,41 @@ void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) {
W.write<uint32_t>(AuxSym.NumberOfRelocEnt.value_or(0));
W.OS.write_zeros(6);
}
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) {
+bool XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) {
assert(!Is64Bit && "can't write the stat auxiliary symbol for XCOFF64");
W.write<uint32_t>(AuxSym.SectionLength.value_or(0));
W.write<uint16_t>(AuxSym.NumberOfRelocEnt.value_or(0));
W.write<uint16_t>(AuxSym.NumberOfLineNum.value_or(0));
W.OS.write_zeros(10);
+ return true;
}
-void XCOFFWriter::writeAuxSymbol(
+bool XCOFFWriter::writeAuxSymbol(
const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym) {
if (auto AS = dyn_cast<XCOFFYAML::CsectAuxEnt>(AuxSym.get()))
- writeAuxSymbol(*AS);
+ return writeAuxSymbol(*AS);
else if (auto AS = dyn_cast<XCOFFYAML::FunctionAuxEnt>(AuxSym.get()))
- writeAuxSymbol(*AS);
+ return writeAuxSymbol(*AS);
else if (auto AS = dyn_cast<XCOFFYAML::ExcpetionAuxEnt>(AuxSym.get()))
- writeAuxSymbol(*AS);
+ return writeAuxSymbol(*AS);
else if (auto AS = dyn_cast<XCOFFYAML::FileAuxEnt>(AuxSym.get()))
- writeAuxSymbol(*AS);
+ return writeAuxSymbol(*AS);
else if (auto AS = dyn_cast<XCOFFYAML::BlockAuxEnt>(AuxSym.get()))
- writeAuxSymbol(*AS);
+ return writeAuxSymbol(*AS);
else if (auto AS = dyn_cast<XCOFFYAML::SectAuxEntForDWARF>(AuxSym.get()))
- writeAuxSymbol(*AS);
+ return writeAuxSymbol(*AS);
else if (auto AS = dyn_cast<XCOFFYAML::SectAuxEntForStat>(AuxSym.get()))
- writeAuxSymbol(*AS);
- else
- llvm_unreachable("unknown auxiliary symbol type");
+ return writeAuxSymbol(*AS);
+ llvm_unreachable("unknown auxiliary symbol type");
+ return false;
}
bool XCOFFWriter::writeSymbols() {
int64_t PaddingSize =
- (uint64_t)InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset);
+ InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset);
if (PaddingSize < 0) {
ErrHandler("redundant data was written before symbols");
return false;
@@ -698,7 +796,8 @@ bool XCOFFWriter::writeSymbols() {
} else {
for (const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym :
YamlSym.AuxEntries) {
- writeAuxSymbol(AuxSym);
+ if (!writeAuxSymbol(AuxSym))
+ return false;
}
// Pad with zeros.
if (NumOfAuxSym > YamlSym.AuxEntries.size())
@@ -756,10 +855,10 @@ bool XCOFFWriter::writeXCOFF() {
return false;
StartOffset = W.OS.tell();
writeFileHeader();
- if (Obj.AuxHeader)
+ if (InitFileHdr.AuxHeaderSize)
writeAuxFileHeader();
if (!Obj.Sections.empty()) {
- writeSectionHeader();
+ writeSectionHeaders();
if (!writeSectionData())
return false;
if (!writeRelocations())
diff --git a/llvm/lib/ObjectYAML/XCOFFYAML.cpp b/llvm/lib/ObjectYAML/XCOFFYAML.cpp
index 398b09c72170..83bf61301387 100644
--- a/llvm/lib/ObjectYAML/XCOFFYAML.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFYAML.cpp
@@ -127,6 +127,17 @@ void ScalarEnumerationTraits<XCOFF::StorageMappingClass>::enumeration(
#undef ECase
}
+void ScalarEnumerationTraits<XCOFF::SymbolType>::enumeration(
+ IO &IO, XCOFF::SymbolType &Value) {
+#define ECase(X) IO.enumCase(Value, #X, XCOFF::X)
+ ECase(XTY_ER);
+ ECase(XTY_SD);
+ ECase(XTY_LD);
+ ECase(XTY_CM);
+#undef ECase
+ IO.enumFallback<Hex8>(Value);
+}
+
void ScalarEnumerationTraits<XCOFFYAML::AuxSymbolType>::enumeration(
IO &IO, XCOFFYAML::AuxSymbolType &Type) {
#define ECase(X) IO.enumCase(Type, #X, XCOFFYAML::X)
@@ -229,6 +240,8 @@ static void auxSymMapping(IO &IO, XCOFFYAML::CsectAuxEnt &AuxSym, bool Is64) {
IO.mapOptional("ParameterHashIndex", AuxSym.ParameterHashIndex);
IO.mapOptional("TypeChkSectNum", AuxSym.TypeChkSectNum);
IO.mapOptional("SymbolAlignmentAndType", AuxSym.SymbolAlignmentAndType);
+ IO.mapOptional("SymbolType", AuxSym.SymbolType);
+ IO.mapOptional("SymbolAlignment", AuxSym.SymbolAlignment);
IO.mapOptional("StorageMappingClass", AuxSym.StorageMappingClass);
if (Is64) {
IO.mapOptional("SectionOrLengthLo", AuxSym.SectionOrLengthLo);
@@ -350,7 +363,8 @@ void MappingTraits<XCOFFYAML::Symbol>::mapping(IO &IO, XCOFFYAML::Symbol &S) {
IO.mapOptional("AuxEntries", S.AuxEntries);
}
-void MappingTraits<XCOFFYAML::StringTable>::mapping(IO &IO, XCOFFYAML::StringTable &Str) {
+void MappingTraits<XCOFFYAML::StringTable>::mapping(
+ IO &IO, XCOFFYAML::StringTable &Str) {
IO.mapOptional("ContentSize", Str.ContentSize);
IO.mapOptional("Length", Str.Length);
IO.mapOptional("Strings", Str.Strings);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 7c306c4a21da..f26d95ab1e47 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -172,6 +172,7 @@
#include "llvm/Transforms/Instrumentation/KCFI.h"
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
#include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
@@ -201,6 +202,7 @@
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/InferAlignment.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
@@ -216,7 +218,6 @@
#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopPredication.h"
-#include "llvm/Transforms/Scalar/LoopReroll.h"
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/Transforms/Scalar/LoopSink.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ede86382912..142bd50b3798 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -74,6 +74,7 @@
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
+#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
@@ -91,6 +92,7 @@
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/Transforms/Scalar/InferAlignment.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDeletion.h"
@@ -212,6 +214,12 @@ static cl::opt<bool>
cl::desc("Enable DFA jump threading"),
cl::init(false), cl::Hidden);
+// TODO: turn on and remove flag
+static cl::opt<bool> EnablePGOForceFunctionAttrs(
+ "enable-pgo-force-function-attrs",
+ cl::desc("Enable pass to set function attributes based on PGO profiles"),
+ cl::init(false));
+
static cl::opt<bool>
EnableHotColdSplit("hot-cold-split",
cl::desc("Enable hot-cold splitting pass"));
@@ -237,6 +245,10 @@ static cl::opt<bool>
EnableGVNSink("enable-gvn-sink",
cl::desc("Enable the GVN sinking pass (default = off)"));
+static cl::opt<bool> EnableJumpTableToSwitch(
+ "enable-jump-table-to-switch",
+ cl::desc("Enable JumpTableToSwitch pass (default = off)"));
+
// This option is used in simplifying testing SampleFDO optimizations for
// profile loading.
static cl::opt<bool>
@@ -559,6 +571,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(JumpThreadingPass());
FPM.addPass(CorrelatedValuePropagationPass());
+ // Jump table to switch conversion.
+ if (EnableJumpTableToSwitch)
+ FPM.addPass(JumpTableToSwitchPass());
+
FPM.addPass(
SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
FPM.addPass(InstCombinePass());
@@ -1137,6 +1153,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
if (EnableSyntheticCounts && !PGOOpt)
MPM.addPass(SyntheticCountsPropagation());
+ if (EnablePGOForceFunctionAttrs)
+ MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
+
MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
if (EnableModuleInliner)
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 44511800ccff..093c1f8aaad4 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -85,6 +85,7 @@ MODULE_PASS("lower-emutls", LowerEmuTLSPass())
MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass())
MODULE_PASS("lower-ifunc", LowerIFuncPass())
MODULE_PASS("lowertypetests", LowerTypeTestsPass())
+MODULE_PASS("pgo-force-function-attrs", PGOForceFunctionAttrsPass(PGOOpt ? PGOOpt->ColdOptType : PGOOptions::ColdFuncOpt::Default))
MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("mergefunc", MergeFunctionsPass())
@@ -348,6 +349,7 @@ FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(TM))
FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
FUNCTION_PASS("irce", IRCEPass())
FUNCTION_PASS("jump-threading", JumpThreadingPass())
+FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass());
FUNCTION_PASS("kcfi", KCFIPass())
FUNCTION_PASS("lcssa", LCSSAPass())
FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
@@ -599,7 +601,6 @@ LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
LOOP_PASS("loop-predication", LoopPredicationPass())
LOOP_PASS("loop-reduce", LoopStrengthReducePass())
-LOOP_PASS("loop-reroll", LoopRerollPass())
LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass())
LOOP_PASS("loop-unroll-full", LoopFullUnrollPass())
LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass())
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 6b189c314632..80b80f7a26f4 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -234,6 +234,7 @@ class MCDCRecordProcessor {
/// Decision Region to which the ExecutedTestVectorBitmap applies.
const CounterMappingRegion &Region;
+ const mcdc::DecisionParameters &DecisionParams;
/// Array of branch regions corresponding each conditions in the boolean
/// expression.
@@ -244,8 +245,8 @@ class MCDCRecordProcessor {
unsigned BitmapIdx;
- /// Mapping of a condition ID to its corresponding branch region.
- llvm::DenseMap<unsigned, const CounterMappingRegion *> Map;
+ /// Mapping of a condition ID to its corresponding branch params.
+ llvm::DenseMap<unsigned, const mcdc::BranchParameters *> BranchParamsMap;
/// Vector used to track whether a condition is constant folded.
MCDCRecord::BoolVector Folded;
@@ -253,9 +254,6 @@ class MCDCRecordProcessor {
/// Mapping of calculated MC/DC Independence Pairs for each condition.
MCDCRecord::TVPairMap IndependencePairs;
- /// Total number of possible Test Vectors for the boolean expression.
- MCDCRecord::TestVectors TestVectors;
-
/// Actual executed Test Vectors for the boolean expression, based on
/// ExecutedTestVectorBitmap.
MCDCRecord::TestVectors ExecVectors;
@@ -264,21 +262,24 @@ public:
MCDCRecordProcessor(const BitVector &Bitmap,
const CounterMappingRegion &Region,
ArrayRef<const CounterMappingRegion *> Branches)
- : Bitmap(Bitmap), Region(Region), Branches(Branches),
- NumConditions(Region.MCDCParams.NumConditions),
- BitmapIdx(Region.MCDCParams.BitmapIdx * CHAR_BIT),
- Folded(NumConditions, false), IndependencePairs(NumConditions),
- TestVectors((size_t)1 << NumConditions) {}
+ : Bitmap(Bitmap), Region(Region),
+ DecisionParams(Region.getDecisionParams()), Branches(Branches),
+ NumConditions(DecisionParams.NumConditions),
+ BitmapIdx(DecisionParams.BitmapIdx * CHAR_BIT),
+ Folded(NumConditions, false), IndependencePairs(NumConditions) {}
private:
void recordTestVector(MCDCRecord::TestVector &TV, unsigned Index,
MCDCRecord::CondState Result) {
+ if (!Bitmap[BitmapIdx + Index])
+ return;
+
// Copy the completed test vector to the vector of testvectors.
- TestVectors[Index] = TV;
+ ExecVectors.push_back(TV);
// The final value (T,F) is equal to the last non-dontcare state on the
// path (in a short-circuiting system).
- TestVectors[Index].push_back(Result);
+ ExecVectors.back().push_back(Result);
}
// Walk the binary decision diagram and try assigning both false and true to
@@ -286,18 +287,18 @@ private:
// the truth table.
void buildTestVector(MCDCRecord::TestVector &TV, unsigned ID,
unsigned Index) {
- const CounterMappingRegion *Branch = Map[ID];
+ auto [UnusedID, TrueID, FalseID] = *BranchParamsMap[ID];
TV[ID - 1] = MCDCRecord::MCDC_False;
- if (Branch->MCDCParams.FalseID > 0)
- buildTestVector(TV, Branch->MCDCParams.FalseID, Index);
+ if (FalseID > 0)
+ buildTestVector(TV, FalseID, Index);
else
recordTestVector(TV, Index, MCDCRecord::MCDC_False);
Index |= 1 << (ID - 1);
TV[ID - 1] = MCDCRecord::MCDC_True;
- if (Branch->MCDCParams.TrueID > 0)
- buildTestVector(TV, Branch->MCDCParams.TrueID, Index);
+ if (TrueID > 0)
+ buildTestVector(TV, TrueID, Index);
else
recordTestVector(TV, Index, MCDCRecord::MCDC_True);
@@ -308,13 +309,11 @@ private:
/// Walk the bits in the bitmap. A bit set to '1' indicates that the test
/// vector at the corresponding index was executed during a test run.
void findExecutedTestVectors() {
- for (unsigned Idx = 0; Idx < (1u << NumConditions); ++Idx) {
- assert(BitmapIdx + Idx < Bitmap.size() && "Bitmap overrun");
- if (Bitmap[BitmapIdx + Idx] == 0)
- continue;
- assert(!TestVectors[Idx].empty() && "Test Vector doesn't exist.");
- ExecVectors.push_back(TestVectors[Idx]);
- }
+ // Walk the binary decision diagram to enumerate all possible test vectors.
+ // We start at the root node (ID == 1) with all values being DontCare.
+ // `Index` encodes the bitmask of true values and is initially 0.
+ MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare);
+ buildTestVector(TV, 1, 0);
}
// Find an independence pair for each condition:
@@ -374,18 +373,13 @@ public:
// - Record whether the condition is constant folded so that we exclude it
// from being measured.
for (const auto *B : Branches) {
- Map[B->MCDCParams.ID] = B;
- PosToID[I] = B->MCDCParams.ID - 1;
+ const auto &BranchParams = B->getBranchParams();
+ BranchParamsMap[BranchParams.ID] = &BranchParams;
+ PosToID[I] = BranchParams.ID - 1;
CondLoc[I] = B->startLoc();
Folded[I++] = (B->Count.isZero() && B->FalseCount.isZero());
}
- // Walk the binary decision diagram to enumerate all possible test vectors.
- // We start at the root node (ID == 1) with all values being DontCare.
- // `Index` encodes the bitmask of true values and is initially 0.
- MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare);
- buildTestVector(TV, 1, 0);
-
// Using Profile Bitmap from runtime, mark the executed test vectors.
findExecutedTestVectors();
@@ -394,9 +388,9 @@ public:
findIndependencePairs();
// Record Test vectors, executed vectors, and independence pairs.
- MCDCRecord Res(Region, ExecVectors, IndependencePairs, Folded, PosToID,
- CondLoc);
- return Res;
+ return MCDCRecord(Region, std::move(ExecVectors),
+ std::move(IndependencePairs), std::move(Folded),
+ std::move(PosToID), std::move(CondLoc));
}
};
@@ -501,10 +495,12 @@ static unsigned getMaxBitmapSize(const CounterMappingContext &Ctx,
// Note that `<=` is used insted of `<`, because `BitmapIdx == 0` is valid
// and `MaxBitmapIdx is `unsigned`. `BitmapIdx` is unique in the record.
for (const auto &Region : reverse(Record.MappingRegions)) {
- if (Region.Kind == CounterMappingRegion::MCDCDecisionRegion &&
- MaxBitmapIdx <= Region.MCDCParams.BitmapIdx) {
- MaxBitmapIdx = Region.MCDCParams.BitmapIdx;
- NumConditions = Region.MCDCParams.NumConditions;
+ if (Region.Kind != CounterMappingRegion::MCDCDecisionRegion)
+ continue;
+ const auto &DecisionParams = Region.getDecisionParams();
+ if (MaxBitmapIdx <= DecisionParams.BitmapIdx) {
+ MaxBitmapIdx = DecisionParams.BitmapIdx;
+ NumConditions = DecisionParams.NumConditions;
}
}
unsigned SizeInBits = llvm::alignTo(uint64_t(1) << NumConditions, CHAR_BIT);
@@ -524,6 +520,7 @@ private:
const CounterMappingRegion *DecisionRegion;
/// They are reflected from DecisionRegion for convenience.
+ mcdc::DecisionParameters DecisionParams;
LineColPair DecisionStartLoc;
LineColPair DecisionEndLoc;
@@ -533,7 +530,7 @@ private:
/// IDs that are stored in MCDCBranches
/// Complete when all IDs (1 to NumConditions) are met.
- DenseSet<CounterMappingRegion::MCDCConditionID> ConditionIDs;
+ DenseSet<mcdc::ConditionID> ConditionIDs;
/// Set of IDs of Expansion(s) that are relevant to DecisionRegion
/// and its children (via expansions).
@@ -542,7 +539,9 @@ private:
DenseSet<unsigned> ExpandedFileIDs;
DecisionRecord(const CounterMappingRegion &Decision)
- : DecisionRegion(&Decision), DecisionStartLoc(Decision.startLoc()),
+ : DecisionRegion(&Decision),
+ DecisionParams(Decision.getDecisionParams()),
+ DecisionStartLoc(Decision.startLoc()),
DecisionEndLoc(Decision.endLoc()) {
assert(Decision.Kind == CounterMappingRegion::MCDCDecisionRegion);
}
@@ -570,17 +569,17 @@ private:
Result addBranch(const CounterMappingRegion &Branch) {
assert(Branch.Kind == CounterMappingRegion::MCDCBranchRegion);
- auto ConditionID = Branch.MCDCParams.ID;
+ auto ConditionID = Branch.getBranchParams().ID;
assert(ConditionID > 0 && "ConditionID should begin with 1");
if (ConditionIDs.contains(ConditionID) ||
- ConditionID > DecisionRegion->MCDCParams.NumConditions)
+ ConditionID > DecisionParams.NumConditions)
return NotProcessed;
if (!this->dominates(Branch))
return NotProcessed;
- assert(MCDCBranches.size() < DecisionRegion->MCDCParams.NumConditions);
+ assert(MCDCBranches.size() < DecisionParams.NumConditions);
// Put `ID=1` in front of `MCDCBranches` for convenience
// even if `MCDCBranches` is not topological.
@@ -593,9 +592,8 @@ private:
ConditionIDs.insert(ConditionID);
// `Completed` when `MCDCBranches` is full
- return (MCDCBranches.size() == DecisionRegion->MCDCParams.NumConditions
- ? Completed
- : Processed);
+ return (MCDCBranches.size() == DecisionParams.NumConditions ? Completed
+ : Processed);
}
/// Record Expansion if it is relevant to this Decision.
@@ -770,7 +768,7 @@ Error CoverageMapping::loadFunctionRecord(
}
// Save the MC/DC Record so that it can be visualized later.
- Function.pushMCDCRecord(*Record);
+ Function.pushMCDCRecord(std::move(*Record));
}
// Don't create records for (filenames, function) pairs we've already seen.
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index ac8e6b56379f..d528d9aa9564 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -244,7 +244,8 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
unsigned LineStart = 0;
for (size_t I = 0; I < NumRegions; ++I) {
Counter C, C2;
- uint64_t BIDX = 0, NC = 0, ID = 0, TID = 0, FID = 0;
+ uint64_t BIDX, NC, ID, TID, FID;
+ mcdc::Parameters Params;
CounterMappingRegion::RegionKind Kind = CounterMappingRegion::CodeRegion;
// Read the combined counter + region kind.
@@ -308,6 +309,13 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
return Err;
if (auto Err = readIntMax(FID, std::numeric_limits<unsigned>::max()))
return Err;
+ if (ID == 0)
+ return make_error<CoverageMapError>(
+ coveragemap_error::malformed,
+ "MCDCConditionID shouldn't be zero");
+ Params = mcdc::BranchParameters{static_cast<unsigned>(ID),
+ static_cast<unsigned>(TID),
+ static_cast<unsigned>(FID)};
break;
case CounterMappingRegion::MCDCDecisionRegion:
Kind = CounterMappingRegion::MCDCDecisionRegion;
@@ -315,6 +323,8 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
return Err;
if (auto Err = readIntMax(NC, std::numeric_limits<unsigned>::max()))
return Err;
+ Params = mcdc::DecisionParameters{static_cast<unsigned>(BIDX),
+ static_cast<unsigned>(NC)};
break;
default:
return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -370,13 +380,8 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
});
auto CMR = CounterMappingRegion(
- C, C2,
- CounterMappingRegion::MCDCParameters{
- static_cast<unsigned>(BIDX), static_cast<unsigned>(NC),
- static_cast<unsigned>(ID), static_cast<unsigned>(TID),
- static_cast<unsigned>(FID)},
- InferredFileID, ExpandedFileID, LineStart, ColumnStart,
- LineStart + NumLines, ColumnEnd, Kind);
+ C, C2, InferredFileID, ExpandedFileID, LineStart, ColumnStart,
+ LineStart + NumLines, ColumnEnd, Kind, Params);
if (CMR.startLoc() > CMR.endLoc())
return make_error<CoverageMapError>(
coveragemap_error::malformed,
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 27727f216b05..3267afdbe15c 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -213,6 +213,7 @@ void CoverageMappingWriter::write(raw_ostream &OS) {
}
Counter Count = Minimizer.adjust(I->Count);
Counter FalseCount = Minimizer.adjust(I->FalseCount);
+ bool ParamsShouldBeNull = true;
switch (I->Kind) {
case CounterMappingRegion::CodeRegion:
case CounterMappingRegion::GapRegion:
@@ -251,16 +252,25 @@ void CoverageMappingWriter::write(raw_ostream &OS) {
OS);
writeCounter(MinExpressions, Count, OS);
writeCounter(MinExpressions, FalseCount, OS);
- encodeULEB128(unsigned(I->MCDCParams.ID), OS);
- encodeULEB128(unsigned(I->MCDCParams.TrueID), OS);
- encodeULEB128(unsigned(I->MCDCParams.FalseID), OS);
+ {
+ const auto &BranchParams = I->getBranchParams();
+ ParamsShouldBeNull = false;
+ assert(BranchParams.ID > 0);
+ encodeULEB128(static_cast<unsigned>(BranchParams.ID), OS);
+ encodeULEB128(static_cast<unsigned>(BranchParams.TrueID), OS);
+ encodeULEB128(static_cast<unsigned>(BranchParams.FalseID), OS);
+ }
break;
case CounterMappingRegion::MCDCDecisionRegion:
encodeULEB128(unsigned(I->Kind)
<< Counter::EncodingCounterTagAndExpansionRegionTagBits,
OS);
- encodeULEB128(unsigned(I->MCDCParams.BitmapIdx), OS);
- encodeULEB128(unsigned(I->MCDCParams.NumConditions), OS);
+ {
+ const auto &DecisionParams = I->getDecisionParams();
+ ParamsShouldBeNull = false;
+ encodeULEB128(static_cast<unsigned>(DecisionParams.BitmapIdx), OS);
+ encodeULEB128(static_cast<unsigned>(DecisionParams.NumConditions), OS);
+ }
break;
}
assert(I->LineStart >= PrevLineStart);
@@ -270,6 +280,9 @@ void CoverageMappingWriter::write(raw_ostream &OS) {
encodeULEB128(I->LineEnd - I->LineStart, OS);
encodeULEB128(I->ColumnEnd, OS);
PrevLineStart = I->LineStart;
+ assert((!ParamsShouldBeNull || std::get_if<0>(&I->MCDCParams)) &&
+ "MCDCParams should be empty");
+ (void)ParamsShouldBeNull;
}
// Ensure that all file ids have at least one mapping region.
assert(CurrentFileID == (VirtualFileMapping.size() - 1));
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 9ebcba10c860..4c89834654d4 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -560,35 +560,46 @@ Error InstrProfSymtab::initVTableNamesFromCompressedStrings(
std::bind(&InstrProfSymtab::addVTableName, this, std::placeholders::_1));
}
-Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName) {
- if (Error E = addFuncName(PGOFuncName))
- return E;
- MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+StringRef InstrProfSymtab::getCanonicalName(StringRef PGOName) {
// In ThinLTO, local function may have been promoted to global and have
// suffix ".llvm." added to the function name. We need to add the
// stripped function name to the symbol table so that we can find a match
// from profile.
//
- // We may have other suffixes similar as ".llvm." which are needed to
- // be stripped before the matching, but ".__uniq." suffix which is used
- // to differentiate internal linkage functions in different modules
- // should be kept. Now this is the only suffix with the pattern ".xxx"
- // which is kept before matching.
+ // ".__uniq." suffix is used to differentiate internal linkage functions in
+ // different modules and should be kept. This is the only suffix with the
+ // pattern ".xxx" which is kept before matching, other suffixes similar as
+ // ".llvm." will be stripped.
const std::string UniqSuffix = ".__uniq.";
- auto pos = PGOFuncName.find(UniqSuffix);
- // Search '.' after ".__uniq." if ".__uniq." exists, otherwise
- // search '.' from the beginning.
- if (pos != std::string::npos)
+ size_t pos = PGOName.find(UniqSuffix);
+ if (pos != StringRef::npos)
pos += UniqSuffix.length();
else
pos = 0;
- pos = PGOFuncName.find('.', pos);
- if (pos != std::string::npos && pos != 0) {
- StringRef OtherFuncName = PGOFuncName.substr(0, pos);
- if (Error E = addFuncName(OtherFuncName))
+
+ // Search '.' after ".__uniq." if ".__uniq." exists, otherwise search '.' from
+ // the beginning.
+ pos = PGOName.find('.', pos);
+ if (pos != StringRef::npos && pos != 0)
+ return PGOName.substr(0, pos);
+
+ return PGOName;
+}
+
+Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName) {
+ auto mapName = [&](StringRef Name) -> Error {
+ if (Error E = addFuncName(Name))
return E;
- MD5FuncMap.emplace_back(Function::getGUID(OtherFuncName), &F);
- }
+ MD5FuncMap.emplace_back(Function::getGUID(Name), &F);
+ return Error::success();
+ };
+ if (Error E = mapName(PGOFuncName))
+ return E;
+
+ StringRef CanonicalFuncName = getCanonicalName(PGOFuncName);
+ if (CanonicalFuncName != PGOFuncName)
+ return mapName(CanonicalFuncName);
+
return Error::success();
}
diff --git a/llvm/lib/Support/DeltaAlgorithm.cpp b/llvm/lib/Support/DeltaAlgorithm.cpp
index 341de244547c..d763cded6e7e 100644
--- a/llvm/lib/Support/DeltaAlgorithm.cpp
+++ b/llvm/lib/Support/DeltaAlgorithm.cpp
@@ -83,9 +83,9 @@ bool DeltaAlgorithm::Search(const changeset_ty &Changes,
if (Sets.size() > 2) {
// FIXME: This is really slow.
changeset_ty Complement;
- std::set_difference(
- Changes.begin(), Changes.end(), it->begin(), it->end(),
- std::insert_iterator<changeset_ty>(Complement, Complement.begin()));
+ std::set_difference(Changes.begin(), Changes.end(), it->begin(),
+ it->end(),
+ std::inserter(Complement, Complement.begin()));
if (GetTestResult(Complement)) {
changesetlist_ty ComplementSets;
ComplementSets.insert(ComplementSets.end(), Sets.begin(), it);
diff --git a/llvm/lib/Support/PGOOptions.cpp b/llvm/lib/Support/PGOOptions.cpp
index 7e57b52e4ba2..5981dff9e094 100644
--- a/llvm/lib/Support/PGOOptions.cpp
+++ b/llvm/lib/Support/PGOOptions.cpp
@@ -15,11 +15,12 @@ PGOOptions::PGOOptions(std::string ProfileFile, std::string CSProfileGenFile,
std::string ProfileRemappingFile,
std::string MemoryProfile,
IntrusiveRefCntPtr<vfs::FileSystem> FS, PGOAction Action,
- CSPGOAction CSAction, bool DebugInfoForProfiling,
- bool PseudoProbeForProfiling, bool AtomicCounterUpdate)
+ CSPGOAction CSAction, ColdFuncOpt ColdType,
+ bool DebugInfoForProfiling, bool PseudoProbeForProfiling,
+ bool AtomicCounterUpdate)
: ProfileFile(ProfileFile), CSProfileGenFile(CSProfileGenFile),
ProfileRemappingFile(ProfileRemappingFile), MemoryProfile(MemoryProfile),
- Action(Action), CSAction(CSAction),
+ Action(Action), CSAction(CSAction), ColdOptType(ColdType),
DebugInfoForProfiling(DebugInfoForProfiling ||
(Action == SampleUse && !PseudoProbeForProfiling)),
PseudoProbeForProfiling(PseudoProbeForProfiling),
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 545643234f69..99d866a23a68 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -849,7 +849,8 @@ bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
do {
// Skip all symbols to the line end.
- prepSkipToLineEnd();
+ while (*CurPtr != '\n')
+ ++CurPtr;
// Find the first non-whitespace symbol in the next line(s).
if (!prepSkipLineBegin())
@@ -1032,11 +1033,6 @@ bool TGLexer::prepSkipDirectiveEnd() {
return true;
}
-void TGLexer::prepSkipToLineEnd() {
- while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
- ++CurPtr;
-}
-
bool TGLexer::prepIsProcessingEnabled() {
for (const PreprocessorControlDesc &I :
llvm::reverse(*PrepIncludeStack.back()))
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index 25dcd9fbadef..9adc03ccc72b 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -467,11 +467,6 @@ private:
// directive.
bool prepSkipDirectiveEnd();
- // Skip all symbols to the end of the line/file.
- // The method adjusts CurPtr, so that it points to either new line
- // symbol in the current line or the buffer end.
- void prepSkipToLineEnd();
-
// Return true, if the current preprocessor control stack is such that
// we should allow lexer to process the next token, false - otherwise.
//
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 02fb01caf7e8..3377fcf14472 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -1120,7 +1120,6 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
FeatureFuseCryptoEOR,
- FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
@@ -1149,6 +1148,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureFuseAddress,
+ FeatureFuseAdrpAdd,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
@@ -1165,6 +1165,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureFuseAddress,
+ FeatureFuseAdrpAdd,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
@@ -1349,6 +1350,7 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
"Ampere Computing Ampere-1 processors", [
FeaturePostRAScheduler,
FeatureFuseAES,
+ FeatureFuseAdrpAdd,
FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
@@ -1364,6 +1366,7 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
"Ampere Computing Ampere-1A processors", [
FeaturePostRAScheduler,
FeatureFuseAES,
+ FeatureFuseAdrpAdd,
FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
@@ -1371,8 +1374,26 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeatureCmpBccFusion,
FeatureFuseAddress,
FeatureFuseLiterals,
+ FeatureFuseAddSub2RegAndConstOne,
+ FeatureStorePairSuppress,
+ FeatureLdpAlignedOnly,
+ FeatureStpAlignedOnly]>;
+
+def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
+ "Ampere Computing Ampere-1B processors", [
+ FeaturePostRAScheduler,
+ FeatureFuseAES,
+ FeatureFuseAdrpAdd,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
+ FeatureAggressiveFMA,
+ FeatureArithmeticBccFusion,
+ FeatureCmpBccFusion,
+ FeatureFuseAddress,
FeatureFuseLiterals,
FeatureStorePairSuppress,
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive,
FeatureLdpAlignedOnly,
FeatureStpAlignedOnly]>;
@@ -1530,6 +1551,11 @@ def ProcessorFeatures {
FeatureMTE, FeatureSSBS, FeatureRandGen,
FeatureSB, FeatureSM4, FeatureSHA2,
FeatureSHA3, FeatureAES];
+ list<SubtargetFeature> Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon,
+ FeatureMTE, FeatureSSBS, FeatureRandGen,
+ FeatureSB, FeatureSM4, FeatureSHA2,
+ FeatureSHA3, FeatureAES, FeatureCSSC,
+ FeatureWFxT];
// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64. The extensions do not
@@ -1697,6 +1723,9 @@ def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1,
def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A,
[TuneAmpere1A]>;
+def : ProcessorModel<"ampere1b", Ampere1Model, ProcessorFeatures.Ampere1B,
+ [TuneAmpere1B]>;
+
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 91b4f18c73c9..c62582ac01a4 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -24,11 +24,13 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Object/COFF.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/Triple.h"
using namespace llvm;
+using namespace llvm::object;
using OperandBundleDef = OperandBundleDefT<Value *>;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index de247253eb18..5b5ffd7b2feb 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1602,7 +1602,9 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
// attributes (isCall, isReturn, etc.). We lower them to the real
// instruction here.
case AArch64::TCRETURNri:
- case AArch64::TCRETURNriBTI:
+ case AArch64::TCRETURNrix16x17:
+ case AArch64::TCRETURNrix17:
+ case AArch64::TCRETURNrinotx16:
case AArch64::TCRETURNriALL: {
MCInst TmpInst;
TmpInst.setOpcode(AArch64::BR);
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1daa7d5fe6a7..fdea974d4540 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -288,5 +288,6 @@ def AArch64PostLegalizerCombiner
constant_fold_binops, identity_combines,
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
- select_to_minmax, or_to_bsp]> {
+ select_to_minmax, or_to_bsp,
+ commute_constant_to_rhs]> {
}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1af064b6de3c..b2c52b443753 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -774,6 +774,39 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
return true;
}
+// Create a call to CallTarget, copying over all the operands from *MBBI,
+// starting at the regmask.
+static MachineInstr *createCall(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const AArch64InstrInfo *TII,
+ MachineOperand &CallTarget,
+ unsigned RegMaskStartIdx) {
+ unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
+ MachineInstr *Call =
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opc)).getInstr();
+
+ assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
+ "invalid operand for regular call");
+ Call->addOperand(CallTarget);
+
+ // Register arguments are added during ISel, but cannot be added as explicit
+ // operands of the branch as it expects to be B <target> which is only one
+ // operand. Instead they are implicit operands used by the branch.
+ while (!MBBI->getOperand(RegMaskStartIdx).isRegMask()) {
+ auto MOP = MBBI->getOperand(RegMaskStartIdx);
+ assert(MOP.isReg() && "can only add register operands");
+ Call->addOperand(MachineOperand::CreateReg(
+ MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
+ /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
+ RegMaskStartIdx++;
+ }
+ for (const MachineOperand &MO :
+ llvm::drop_begin(MBBI->operands(), RegMaskStartIdx))
+ Call->addOperand(MO);
+
+ return Call;
+}
+
bool AArch64ExpandPseudo::expandCALL_RVMARKER(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
// Expand CALL_RVMARKER pseudo to:
@@ -782,31 +815,12 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER(
// - another branch, to the runtime function
// Mark the sequence as bundle, to avoid passes moving other code in between.
MachineInstr &MI = *MBBI;
-
- MachineInstr *OriginalCall;
MachineOperand &RVTarget = MI.getOperand(0);
- MachineOperand &CallTarget = MI.getOperand(1);
- assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
- "invalid operand for regular call");
assert(RVTarget.isGlobal() && "invalid operand for attached call");
- unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
- OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
- OriginalCall->addOperand(CallTarget);
-
- unsigned RegMaskStartIdx = 2;
- // Skip register arguments. Those are added during ISel, but are not
- // needed for the concrete branch.
- while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
- auto MOP = MI.getOperand(RegMaskStartIdx);
- assert(MOP.isReg() && "can only add register operands");
- OriginalCall->addOperand(MachineOperand::CreateReg(
- MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
- /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
- RegMaskStartIdx++;
- }
- for (const MachineOperand &MO :
- llvm::drop_begin(MI.operands(), RegMaskStartIdx))
- OriginalCall->addOperand(MO);
+ MachineInstr *OriginalCall =
+ createCall(MBB, MBBI, TII, MI.getOperand(1),
+ // Regmask starts after the RV and call targets.
+ /*RegMaskStartIdx=*/2);
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
.addReg(AArch64::FP, RegState::Define)
@@ -834,31 +848,10 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB,
// - a BTI instruction
// Mark the sequence as a bundle, to avoid passes moving other code in
// between.
-
MachineInstr &MI = *MBBI;
- MachineOperand &CallTarget = MI.getOperand(0);
- assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
- "invalid operand for regular call");
- unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
- MachineInstr *Call =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
- Call->addOperand(CallTarget);
-
- // 1 because we already added the branch target above.
- unsigned RegMaskStartIdx = 1;
- // The branch is BL <target>, so we cannot attach the arguments of the called
- // function to it. Those must be added as implicitly used by the branch.
- while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
- auto MOP = MI.getOperand(RegMaskStartIdx);
- assert(MOP.isReg() && "can only add register operands");
- Call->addOperand(MachineOperand::CreateReg(
- MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false,
- /*isDead=*/false, /*isUndef=*/MOP.isUndef()));
- RegMaskStartIdx++;
- }
- for (const MachineOperand &MO :
- llvm::drop_begin(MI.operands(), RegMaskStartIdx))
- Call->addOperand(MO);
+ MachineInstr *Call = createCall(MBB, MBBI, TII, MI.getOperand(0),
+ // Regmask starts after the call target.
+ /*RegMaskStartIdx=*/1);
Call->setCFIType(*MBB.getParent(), MI.getCFIType());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8573939b0438..a3b7e3128ac1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20246,9 +20246,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_sve_uqsub_x:
return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
- case Intrinsic::aarch64_sve_urshr:
- return DAG.getNode(AArch64ISD::URSHR_I_PRED, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_asrd:
return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
@@ -24615,7 +24612,8 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
return;
}
- if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1)
+ if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ !VT.isVector())
return replaceBoolVectorBitcast(N, Results, DAG);
if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
@@ -25700,7 +25698,9 @@ AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
case AArch64::BLR:
case AArch64::BLRNoIP:
case AArch64::TCRETURNri:
- case AArch64::TCRETURNriBTI:
+ case AArch64::TCRETURNrix16x17:
+ case AArch64::TCRETURNrix17:
+ case AArch64::TCRETURNrinotx16:
break;
default:
llvm_unreachable("Unexpected CFI call opcode");
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9add7d87017a..39c96092f103 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2503,7 +2503,9 @@ bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
return false;
case AArch64::TCRETURNdi:
case AArch64::TCRETURNri:
- case AArch64::TCRETURNriBTI:
+ case AArch64::TCRETURNrix16x17:
+ case AArch64::TCRETURNrix17:
+ case AArch64::TCRETURNrinotx16:
case AArch64::TCRETURNriALL:
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 77fdb688d042..9c3a6927d043 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -928,8 +928,25 @@ let RecomputePerFunction = 1 in {
// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">;
- def UseBTI : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>;
- def NotUseBTI : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>;
+ // Register restrictions for indirect tail-calls:
+ // - If branch target enforcement is enabled, indirect calls must use x16 or
+ // x17, because these are the only registers which can target the BTI C
+ // instruction.
+ // - If PAuthLR is enabled, x16 is used in the epilogue to hold the address
+ // of the signing instruction. This can't be changed because it is used by a
+ // HINT instruction which only accepts x16. We can't load anything from the
+ // stack after this because the authentication instruction checks that SP is
+ // the same as it was at function entry, so we can't have anything on the
+ // stack.
+
+ // BTI on, PAuthLR off: x16 or x17
+ def TailCallX16X17 : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() && !MF->getInfo<AArch64FunctionInfo>()->branchProtectionPAuthLR() }]>;
+ // BTI on, PAuthLR on: x17 only
+ def TailCallX17 : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() && MF->getInfo<AArch64FunctionInfo>()->branchProtectionPAuthLR() }]>;
+ // BTI off, PAuthLR on: Any non-callee-saved register except x16
+ def TailCallNotX16 : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() && MF->getInfo<AArch64FunctionInfo>()->branchProtectionPAuthLR() }]>;
+ // BTI off, PAuthLR off: Any non-callee-saved register
+ def TailCallAny : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() && !MF->getInfo<AArch64FunctionInfo>()->branchProtectionPAuthLR() }]>;
def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
@@ -9121,18 +9138,30 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
// some verifier checks for outlined functions.
def TCRETURNriALL : Pseudo<(outs), (ins GPR64:$dst, i32imm:$FPDiff), []>,
Sched<[WriteBrReg]>;
- // Indirect tail-call limited to only use registers (x16 and x17) which are
- // allowed to tail-call a "BTI c" instruction.
- def TCRETURNriBTI : Pseudo<(outs), (ins rtcGPR64:$dst, i32imm:$FPDiff), []>,
+
+ // Indirect tail-calls with reduced register classes, needed for BTI and
+ // PAuthLR.
+ def TCRETURNrix16x17 : Pseudo<(outs), (ins tcGPRx16x17:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
+ def TCRETURNrix17 : Pseudo<(outs), (ins tcGPRx17:$dst, i32imm:$FPDiff), []>,
+ Sched<[WriteBrReg]>;
+ def TCRETURNrinotx16 : Pseudo<(outs), (ins tcGPRnotx16:$dst, i32imm:$FPDiff), []>,
Sched<[WriteBrReg]>;
}
def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
(TCRETURNri tcGPR64:$dst, imm:$FPDiff)>,
- Requires<[NotUseBTI]>;
-def : Pat<(AArch64tcret rtcGPR64:$dst, (i32 timm:$FPDiff)),
- (TCRETURNriBTI rtcGPR64:$dst, imm:$FPDiff)>,
- Requires<[UseBTI]>;
+ Requires<[TailCallAny]>;
+def : Pat<(AArch64tcret tcGPRx16x17:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNrix16x17 tcGPRx16x17:$dst, imm:$FPDiff)>,
+ Requires<[TailCallX16X17]>;
+def : Pat<(AArch64tcret tcGPRx17:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNrix17 tcGPRx17:$dst, imm:$FPDiff)>,
+ Requires<[TailCallX17]>;
+def : Pat<(AArch64tcret tcGPRnotx16:$dst, (i32 timm:$FPDiff)),
+ (TCRETURNrinotx16 tcGPRnotx16:$dst, imm:$FPDiff)>,
+ Requires<[TailCallNotX16]>;
+
def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 1e12cf545fa7..37d621cd2f65 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -23,11 +23,13 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/COFF.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+using namespace llvm::object;
extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index b70ab8568884..569944e0e660 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -217,11 +217,16 @@ def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X2
X22, X23, X24, X25, X26,
X27, X28, FP, LR)>;
-// Restricted set of tail call registers, for use when branch target
-// enforcement is enabled. These are the only registers which can be used to
-// indirectly branch (not call) to the "BTI c" instruction at the start of a
-// BTI-protected function.
-def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>;
+// Restricted sets of tail call registers, for use when branch target
+// enforcement or PAuthLR are enabled.
+// For BTI, x16 and x17 are the only registers which can be used to indirectly
+// branch (not call) to the "BTI c" instruction at the start of a BTI-protected
+// function.
+// For PAuthLR, x16 must be used in the function epilogue for other purposes,
+// so cannot hold the function pointer.
+def tcGPRx17 : RegisterClass<"AArch64", [i64], 64, (add X17)>;
+def tcGPRx16x17 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>;
+def tcGPRnotx16 : RegisterClass<"AArch64", [i64], 64, (sub tcGPR64, X16)>;
// Register set that excludes registers that are reserved for procedure calls.
// This is used for pseudo-instructions that are actually implemented using a
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e83d8e5bde79..2b0524754b2e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -232,7 +232,11 @@ def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [
]>;
def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>;
-def AArch64urshri_p : SDNode<"AArch64ISD::URSHR_I_PRED", SDT_AArch64Arith_Imm>;
+def AArch64urshri_p_node : SDNode<"AArch64ISD::URSHR_I_PRED", SDT_AArch64Arith_Imm>;
+
+def AArch64urshri_p : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(int_aarch64_sve_urshr node:$op1, node:$op2, node:$op3),
+ (AArch64urshri_p_node node:$op1, node:$op2, node:$op3)]>;
def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 6550c1272216..2b01deb0cca1 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -296,6 +296,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
break;
case Ampere1:
case Ampere1A:
+ case Ampere1B:
CacheLineSize = 64;
PrefFunctionAlignment = Align(64);
PrefLoopAlignment = Align(64);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 0292c018f1db..01cc4713849f 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -42,6 +42,7 @@ public:
A64FX,
Ampere1,
Ampere1A,
+ Ampere1B,
AppleA7,
AppleA10,
AppleA11,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2..af0b6873d170 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3809,6 +3809,10 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
}
Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+ // Treat extractsubvector as single op permutation.
+ bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
+ if (IsExtractSubvector && LT.second.isFixedLengthVector())
+ Kind = TTI::SK_PermuteSingleSrc;
// Check for broadcast loads, which are supported by the LD1R instruction.
// In terms of code-size, the shuffle vector is free when a load + dup get
@@ -3971,6 +3975,9 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
}
}
+ // Restore optimal kind.
+ if (IsExtractSubvector)
+ Kind = TTI::SK_ExtractSubvector;
return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index e9d96f3b838d..4e7c8f66d9d5 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -4809,20 +4809,30 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
return parseCondCode(Operands, invertCondCode);
// If it's a register name, parse it.
- if (!parseRegister(Operands))
+ if (!parseRegister(Operands)) {
+ // Parse an optional shift/extend modifier.
+ AsmToken SavedTok = getTok();
+ if (parseOptionalToken(AsmToken::Comma)) {
+ // The operand after the register may be a label (e.g. ADR/ADRP). Check
+ // such cases and don't report an error when <label> happens to match a
+ // shift/extend modifier.
+ ParseStatus Res = MatchOperandParserImpl(Operands, Mnemonic,
+ /*ParseForAllFeatures=*/true);
+ if (!Res.isNoMatch())
+ return Res.isFailure();
+ Res = tryParseOptionalShiftExtend(Operands);
+ if (!Res.isNoMatch())
+ return Res.isFailure();
+ getLexer().UnLex(SavedTok);
+ }
return false;
+ }
// See if this is a "mul vl" decoration or "mul #<int>" operand used
// by SVE instructions.
if (!parseOptionalMulOperand(Operands))
return false;
- // This could be an optional "shift" or "extend" operand.
- ParseStatus GotShift = tryParseOptionalShiftExtend(Operands);
- // We can only continue if no tokens were eaten.
- if (!GotShift.isNoMatch())
- return GotShift.isFailure();
-
// If this is a two-word mnemonic, parse its special keyword
// operand as an identifier.
if (Mnemonic == "brb" || Mnemonic == "smstart" || Mnemonic == "smstop" ||
@@ -4883,6 +4893,15 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
+
+ // Parse an optional shift/extend modifier.
+ AsmToken SavedTok = Tok;
+ if (parseOptionalToken(AsmToken::Comma)) {
+ ParseStatus Res = tryParseOptionalShiftExtend(Operands);
+ if (!Res.isNoMatch())
+ return Res.isFailure();
+ getLexer().UnLex(SavedTok);
+ }
return false;
}
case AsmToken::Equal: {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 55cad848393a..3dc3d31a34e8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -1012,16 +1012,23 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization(
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
+ const AArch64FunctionInfo *FuncInfo = CallerF.getInfo<AArch64FunctionInfo>();
+
if (!IsTailCall)
return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL;
if (!IsIndirect)
return AArch64::TCRETURNdi;
- // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
- // x16 or x17.
- if (CallerF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
- return AArch64::TCRETURNriBTI;
+ // When BTI or PAuthLR are enabled, there are restrictions on using x16 and
+ // x17 to hold the function pointer.
+ if (FuncInfo->branchTargetEnforcement()) {
+ if (FuncInfo->branchProtectionPAuthLR())
+ return AArch64::TCRETURNrix17;
+ else
+ return AArch64::TCRETURNrix16x17;
+ } else if (FuncInfo->branchProtectionPAuthLR())
+ return AArch64::TCRETURNrinotx16;
return AArch64::TCRETURNri;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 2515991fbea1..ac804857be01 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2091,7 +2091,7 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
case AArch64::G_DUP: {
// Convert the type from p0 to s64 to help selection.
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
- if (!DstTy.getElementType().isPointer())
+ if (!DstTy.isPointerVector())
return false;
auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
MRI.setType(I.getOperand(0).getReg(),
@@ -3020,34 +3020,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_INDEXED_STORE:
return selectIndexedStore(cast<GIndexedStore>(I), MRI);
- case TargetOpcode::G_SMULH:
- case TargetOpcode::G_UMULH: {
- // Reject the various things we don't support yet.
- if (unsupportedBinOp(I, RBI, MRI, TRI))
- return false;
-
- const Register DefReg = I.getOperand(0).getReg();
- const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
-
- if (RB.getID() != AArch64::GPRRegBankID) {
- LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
- return false;
- }
-
- if (Ty != LLT::scalar(64)) {
- LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
- << ", expected: " << LLT::scalar(64) << '\n');
- return false;
- }
-
- unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
- : AArch64::UMULHrr;
- I.setDesc(TII.get(NewOpc));
-
- // Now that we selected an opcode, we need to constrain the register
- // operands to use appropriate classes.
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
- }
case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
if (MRI.getType(I.getOperand(0).getReg()).isVector())
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index cbf5655706e6..933f13dd5a19 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -343,10 +343,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
auto IsPtrVecPred = [=](const LegalityQuery &Query) {
const LLT &ValTy = Query.Types[0];
- if (!ValTy.isVector())
- return false;
- const LLT EltTy = ValTy.getElementType();
- return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
+ return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
};
getActionDefinitionsBuilder(G_LOAD)
@@ -521,7 +518,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[0];
const LLT &SrcTy = Query.Types[1];
- return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
+ return Ty.isVector() && !SrcTy.isPointerVector() &&
Ty.getElementType() != SrcTy.getElementType();
},
0, 1)
@@ -555,7 +552,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[0];
const LLT &SrcTy = Query.Types[1];
- return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
+ return Ty.isVector() && !SrcTy.isPointerVector() &&
Ty.getElementType() != SrcTy.getElementType();
},
0, 1)
@@ -615,9 +612,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.lowerIf([=](const LegalityQuery &Query) {
LLT DstTy = Query.Types[0];
LLT SrcTy = Query.Types[1];
- return DstTy.isVector() && (SrcTy.getSizeInBits() > 128 ||
- (DstTy.getScalarSizeInBits() * 2 <
- SrcTy.getScalarSizeInBits()));
+ return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
+ DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
})
.alwaysLegal();
@@ -1649,7 +1645,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
return true;
}
- if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
+ if (!ValTy.isPointerVector() ||
ValTy.getElementType().getAddressSpace() != 0) {
LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
return false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index b8e5e7bbdaba..0fc4d7f19910 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -273,7 +273,9 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
case AArch64::GPR64common_and_GPR64noipRegClassID:
case AArch64::GPR64noip_and_tcGPR64RegClassID:
case AArch64::tcGPR64RegClassID:
- case AArch64::rtcGPR64RegClassID:
+ case AArch64::tcGPRx16x17RegClassID:
+ case AArch64::tcGPRx17RegClassID:
+ case AArch64::tcGPRnotx16RegClassID:
case AArch64::WSeqPairsClassRegClassID:
case AArch64::XSeqPairsClassRegClassID:
case AArch64::MatrixIndexGPR32_8_11RegClassID:
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e3f1d25d68ff..ed8336a2e8ad 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -248,34 +248,6 @@ static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) {
return false;
}
-static inline std::optional<std::string>
-getArm64ECMangledFunctionName(std::string Name) {
- bool IsCppFn = Name[0] == '?';
- if (IsCppFn && Name.find("$$h") != std::string::npos)
- return std::nullopt;
- if (!IsCppFn && Name[0] == '#')
- return std::nullopt;
-
- StringRef Prefix = "$$h";
- size_t InsertIdx = 0;
- if (IsCppFn) {
- InsertIdx = Name.find("@@");
- size_t ThreeAtSignsIdx = Name.find("@@@");
- if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) {
- InsertIdx += 2;
- } else {
- InsertIdx = Name.find("@");
- if (InsertIdx != std::string::npos)
- InsertIdx++;
- }
- } else {
- Prefix = "#";
- }
-
- Name.insert(Name.begin() + InsertIdx, Prefix.begin(), Prefix.end());
- return std::optional<std::string>(Name);
-}
-
namespace AArch64CC {
// The CondCodes constants map directly to the 4-bit encoding of the condition
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 55dbc1a803e1..4ab2b124ef53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1002,6 +1002,12 @@ def FeatureGWS : SubtargetFeature<"gws",
"Has Global Wave Sync"
>;
+def FeatureRequiresCOV6 : SubtargetFeature<"requires-cov6",
+ "RequiresCOV6",
+ "true",
+ "Target Requires Code Object V6"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -1212,6 +1218,17 @@ def FeatureISAVersion9_0_Common : FeatureSet<
FeatureImageInsts,
FeatureMadMacF32Insts]>;
+def FeatureISAVersion9_0_Consumer_Common : FeatureSet<
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureImageGather4D16Bug,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureGDS])>;
+
+def FeatureISAVersion9_Generic : FeatureSet<
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [FeatureRequiresCOV6])>;
+
def FeatureISAVersion9_0_MI_Common : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
[FeatureFmaMixInsts,
@@ -1230,43 +1247,27 @@ def FeatureISAVersion9_0_MI_Common : FeatureSet<
FeatureSupportsSRAMECC])>;
def FeatureISAVersion9_0_0 : FeatureSet<
- !listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureGDS,
- FeatureMadMixInsts,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageGather4D16Bug])>;
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [FeatureMadMixInsts])>;
def FeatureISAVersion9_0_2 : FeatureSet<
- !listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureGDS,
- FeatureMadMixInsts,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageGather4D16Bug])>;
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [FeatureMadMixInsts])>;
def FeatureISAVersion9_0_4 : FeatureSet<
- !listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureGDS,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureFmaMixInsts,
- FeatureImageGather4D16Bug])>;
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [FeatureFmaMixInsts])>;
def FeatureISAVersion9_0_6 : FeatureSet<
- !listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureGDS,
- HalfRate64Ops,
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [HalfRate64Ops,
FeatureFmaMixInsts,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
FeatureDot7Insts,
FeatureDot10Insts,
- FeatureSupportsSRAMECC,
- FeatureImageGather4D16Bug])>;
+ FeatureSupportsSRAMECC])>;
def FeatureISAVersion9_0_8 : FeatureSet<
!listconcat(FeatureISAVersion9_0_MI_Common.Features,
@@ -1279,13 +1280,9 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_9 : FeatureSet<
- !listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureGDS,
- FeatureMadMixInsts,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureImageGather4D16Bug])>;
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [FeatureMadMixInsts,
+ FeatureImageInsts])>;
def FeatureISAVersion9_0_A : FeatureSet<
!listconcat(FeatureISAVersion9_0_MI_Common.Features,
@@ -1301,12 +1298,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeatureKernargPreload])>;
def FeatureISAVersion9_0_C : FeatureSet<
- !listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureGDS,
- FeatureMadMixInsts,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageGather4D16Bug])>;
+ !listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
+ [FeatureMadMixInsts])>;
def FeatureISAVersion9_4_Common : FeatureSet<
[FeatureGFX9,
@@ -1387,6 +1380,10 @@ def FeatureISAVersion10_1_Common : FeatureSet<
FeatureFlatSegmentOffsetBug,
FeatureNegativeUnalignedScratchOffsetBug])>;
+def FeatureISAVersion10_1_Generic : FeatureSet<
+ !listconcat(FeatureISAVersion10_1_Common.Features,
+ [FeatureRequiresCOV6])>;
+
def FeatureISAVersion10_1_0 : FeatureSet<
!listconcat(FeatureISAVersion10_1_Common.Features,
[])>;
@@ -1426,6 +1423,10 @@ def FeatureISAVersion10_3_0 : FeatureSet<
FeatureDot10Insts,
FeatureShaderCyclesRegister])>;
+def FeatureISAVersion10_3_Generic: FeatureSet<
+ !listconcat(FeatureISAVersion10_3_0.Features,
+ [FeatureRequiresCOV6])>;
+
def FeatureISAVersion11_Common : FeatureSet<
[FeatureGFX11,
FeatureLDSBankCount32,
@@ -1447,6 +1448,16 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard]>;
+// There are few workarounds that need to be
+// added to all targets. This pessimizes codegen
+// a bit on the generic GFX11 target.
+def FeatureISAVersion11_Generic: FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureMSAALoadDstSelBug,
+ FeatureVALUTransUseHazard,
+ FeatureUserSGPRInit16Bug,
+ FeatureRequiresCOV6])>;
+
def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureMSAALoadDstSelBug,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index db81e1ee9e38..37a36b26b947 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -156,6 +156,13 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();
+ // TODO: We're checking this late, would be nice to check it earlier.
+ if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
+ report_fatal_error(
+ STM.getCPU() + " is only available on code object version 6 or better",
+ /*gen_crash_diag*/ false);
+ }
+
// TODO: Which one is called first, emitStartOfAsmFile or
// emitFunctionBodyStart?
if (!getTargetStreamer()->getTargetID())
@@ -197,7 +204,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
if (MFI.getNumKernargPreloadedSGPRs() > 0) {
assert(AMDGPU::hasKernargPreload(STM));
- getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
+ getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(),
+ STM.isAmdHsaOS());
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 10569d97248b..528257ead585 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -308,8 +308,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
+
setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 97952de3e6a3..0d3b158a0df7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -239,6 +239,7 @@ static bool isRegisterVectorType(LLT Ty) {
EltSize == 128 || EltSize == 256;
}
+// TODO: replace all uses of isRegisterType with isRegisterClassType
static bool isRegisterType(LLT Ty) {
if (!isRegisterSize(Ty.getSizeInBits()))
return false;
@@ -258,6 +259,8 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
}
// RegisterType that doesn't have a corresponding RegClass.
+// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
+// should be removed.
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
LLT Ty = Query.Types[TypeIdx];
@@ -276,6 +279,82 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
};
}
+static const LLT S1 = LLT::scalar(1);
+static const LLT S8 = LLT::scalar(8);
+static const LLT S16 = LLT::scalar(16);
+static const LLT S32 = LLT::scalar(32);
+static const LLT S64 = LLT::scalar(64);
+static const LLT S96 = LLT::scalar(96);
+static const LLT S128 = LLT::scalar(128);
+static const LLT S160 = LLT::scalar(160);
+static const LLT S224 = LLT::scalar(224);
+static const LLT S256 = LLT::scalar(256);
+static const LLT S512 = LLT::scalar(512);
+static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
+
+static const LLT V2S8 = LLT::fixed_vector(2, 8);
+static const LLT V2S16 = LLT::fixed_vector(2, 16);
+static const LLT V4S16 = LLT::fixed_vector(4, 16);
+static const LLT V6S16 = LLT::fixed_vector(6, 16);
+static const LLT V8S16 = LLT::fixed_vector(8, 16);
+static const LLT V10S16 = LLT::fixed_vector(10, 16);
+static const LLT V12S16 = LLT::fixed_vector(12, 16);
+static const LLT V16S16 = LLT::fixed_vector(16, 16);
+
+static const LLT V2S32 = LLT::fixed_vector(2, 32);
+static const LLT V3S32 = LLT::fixed_vector(3, 32);
+static const LLT V4S32 = LLT::fixed_vector(4, 32);
+static const LLT V5S32 = LLT::fixed_vector(5, 32);
+static const LLT V6S32 = LLT::fixed_vector(6, 32);
+static const LLT V7S32 = LLT::fixed_vector(7, 32);
+static const LLT V8S32 = LLT::fixed_vector(8, 32);
+static const LLT V9S32 = LLT::fixed_vector(9, 32);
+static const LLT V10S32 = LLT::fixed_vector(10, 32);
+static const LLT V11S32 = LLT::fixed_vector(11, 32);
+static const LLT V12S32 = LLT::fixed_vector(12, 32);
+static const LLT V16S32 = LLT::fixed_vector(16, 32);
+static const LLT V32S32 = LLT::fixed_vector(32, 32);
+
+static const LLT V2S64 = LLT::fixed_vector(2, 64);
+static const LLT V3S64 = LLT::fixed_vector(3, 64);
+static const LLT V4S64 = LLT::fixed_vector(4, 64);
+static const LLT V5S64 = LLT::fixed_vector(5, 64);
+static const LLT V6S64 = LLT::fixed_vector(6, 64);
+static const LLT V7S64 = LLT::fixed_vector(7, 64);
+static const LLT V8S64 = LLT::fixed_vector(8, 64);
+static const LLT V16S64 = LLT::fixed_vector(16, 64);
+
+static const LLT V2S128 = LLT::fixed_vector(2, 128);
+static const LLT V4S128 = LLT::fixed_vector(4, 128);
+
+static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
+ S160, S224, S256, S512};
+
+static std::initializer_list<LLT> AllS16Vectors{
+ V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
+
+static std::initializer_list<LLT> AllS32Vectors = {
+ V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
+ V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
+
+static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
+ V6S64, V7S64, V8S64, V16S64};
+
+// Checks whether a type is in the list of legal register types.
+static bool isRegisterClassType(LLT Ty) {
+ if (Ty.isPointerOrPointerVector())
+ Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
+
+ return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
+ is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
+}
+
+static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
+ return [TypeIdx](const LegalityQuery &Query) {
+ return isRegisterClassType(Query.Types[TypeIdx]);
+ };
+}
+
// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
@@ -416,11 +495,10 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
if (!Ty.isVector())
return true;
- LLT EltTy = Ty.getElementType();
- if (EltTy.isPointer())
+ if (Ty.isPointerVector())
return true;
- unsigned EltSize = EltTy.getSizeInBits();
+ unsigned EltSize = Ty.getScalarSizeInBits();
return EltSize != 32 && EltSize != 64;
}
@@ -578,52 +656,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
};
- const LLT S1 = LLT::scalar(1);
- const LLT S8 = LLT::scalar(8);
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
- const LLT S128 = LLT::scalar(128);
- const LLT S256 = LLT::scalar(256);
- const LLT S512 = LLT::scalar(512);
- const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
-
- const LLT V2S8 = LLT::fixed_vector(2, 8);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
- const LLT V4S16 = LLT::fixed_vector(4, 16);
-
- const LLT V2S32 = LLT::fixed_vector(2, 32);
- const LLT V3S32 = LLT::fixed_vector(3, 32);
- const LLT V4S32 = LLT::fixed_vector(4, 32);
- const LLT V5S32 = LLT::fixed_vector(5, 32);
- const LLT V6S32 = LLT::fixed_vector(6, 32);
- const LLT V7S32 = LLT::fixed_vector(7, 32);
- const LLT V8S32 = LLT::fixed_vector(8, 32);
- const LLT V9S32 = LLT::fixed_vector(9, 32);
- const LLT V10S32 = LLT::fixed_vector(10, 32);
- const LLT V11S32 = LLT::fixed_vector(11, 32);
- const LLT V12S32 = LLT::fixed_vector(12, 32);
- const LLT V13S32 = LLT::fixed_vector(13, 32);
- const LLT V14S32 = LLT::fixed_vector(14, 32);
- const LLT V15S32 = LLT::fixed_vector(15, 32);
- const LLT V16S32 = LLT::fixed_vector(16, 32);
- const LLT V32S32 = LLT::fixed_vector(32, 32);
-
- const LLT V2S64 = LLT::fixed_vector(2, 64);
- const LLT V3S64 = LLT::fixed_vector(3, 64);
- const LLT V4S64 = LLT::fixed_vector(4, 64);
- const LLT V5S64 = LLT::fixed_vector(5, 64);
- const LLT V6S64 = LLT::fixed_vector(6, 64);
- const LLT V7S64 = LLT::fixed_vector(7, 64);
- const LLT V8S64 = LLT::fixed_vector(8, 64);
- const LLT V16S64 = LLT::fixed_vector(16, 64);
-
- std::initializer_list<LLT> AllS32Vectors =
- {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
- V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
- std::initializer_list<LLT> AllS64Vectors =
- {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
-
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
@@ -836,10 +868,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
getActionDefinitionsBuilder(G_BITCAST)
- // Don't worry about the size constraint.
- .legalIf(all(isRegisterType(0), isRegisterType(1)))
- .lower();
-
+ // Don't worry about the size constraint.
+ .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
+ .lower();
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
@@ -1957,6 +1988,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
+ getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
+
getActionDefinitionsBuilder(G_FENCE)
.alwaysLegal();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 015c71080d67..bc58407a7329 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -145,7 +145,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
// Try to preload this argument into user SGPRs.
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
- !ST.needsKernargPreloadBackwardsCompatibility() &&
!Arg.getType()->isAggregateType())
if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
LastExplicitArgOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5323e4fc58de..b174d57bd576 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4051,6 +4051,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_CONSTANT:
case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR:
+ case AMDGPU::G_READSTEADYCOUNTER:
case AMDGPU::G_READCYCLECOUNTER: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index 6f1236fd3b7d..9d44b65d1698 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -139,10 +139,10 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
const GCNSubtarget *ST =
static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F));
- // Check the GPU isn't generic. Generic is used for testing only
- // and we don't want this pass to interfere with it.
+ // Check the GPU isn't generic or generic-hsa. Generic is used for testing
+ // only and we don't want this pass to interfere with it.
StringRef GPUName = ST->getCPU();
- if (GPUName.empty() || GPUName.contains("generic"))
+ if (GPUName.empty() || GPUName.starts_with("generic"))
return false;
// Try to fetch the GPU's info. If we can't, it's likely an unknown processor
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ebe0b8551b23..31077dbc0b2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1129,6 +1129,10 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
+ // Treat extractsubvector as single op permutation.
+ bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
+ if (IsExtractSubvector)
+ Kind = TTI::SK_PermuteSingleSrc;
if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
@@ -1146,6 +1150,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
}
}
}
+ // Restore optimal kind.
+ if (IsExtractSubvector)
+ Kind = TTI::SK_ExtractSubvector;
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 225e78158866..79ad6ddf7861 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -314,8 +314,9 @@ public:
return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
- bool isRegOrInlineImmWithFP16InputMods() const {
- return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16);
+ template <bool IsFake16> bool isRegOrInlineImmWithFP16InputMods() const {
+ return isRegOrInline(
+ IsFake16 ? AMDGPU::VS_32RegClassID : AMDGPU::VS_16RegClassID, MVT::f16);
}
bool isRegOrInlineImmWithFP32InputMods() const {
@@ -3298,11 +3299,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(isForcedSDWA() && !(TSFlags & SIInstrFlags::SDWA)) )
return Match_InvalidOperand;
- if ((TSFlags & SIInstrFlags::VOP3) &&
- (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) &&
- getForcedEncodingSize() != 64)
- return Match_PreferE32;
-
if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi ||
Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
// v_mac_f32/16 allow only dst_sel == DWORD;
@@ -8151,7 +8147,7 @@ ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
// Determines which bit DST_OP_SEL occupies in the op_sel operand according to
// the number of src operands present, then copies that bit into src0_modifiers.
-void cvtVOP3DstOpSelOnly(MCInst &Inst) {
+static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) {
int Opc = Inst.getOpcode();
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
if (OpSelIdx == -1)
@@ -8168,23 +8164,34 @@ void cvtVOP3DstOpSelOnly(MCInst &Inst) {
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
- if ((OpSel & (1 << SrcNum)) != 0) {
- int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
- uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
- Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL);
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DstIdx == -1)
+ return;
+
+ const MCOperand &DstOp = Inst.getOperand(DstIdx);
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+ if (DstOp.isReg() &&
+ MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) {
+ if (AMDGPU::isHi(DstOp.getReg(), MRI))
+ ModVal |= SISrcMods::DST_OP_SEL;
+ } else {
+ if ((OpSel & (1 << SrcNum)) != 0)
+ ModVal |= SISrcMods::DST_OP_SEL;
}
+ Inst.getOperand(ModIdx).setImm(ModVal);
}
void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst,
const OperandVector &Operands) {
cvtVOP3P(Inst, Operands);
- cvtVOP3DstOpSelOnly(Inst);
+ cvtVOP3DstOpSelOnly(Inst, *getMRI());
}
void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
cvtVOP3P(Inst, Operands, OptionalIdx);
- cvtVOP3DstOpSelOnly(Inst);
+ cvtVOP3DstOpSelOnly(Inst, *getMRI());
}
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
@@ -8433,8 +8440,17 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
uint32_t ModVal = 0;
- if ((OpSel & (1 << J)) != 0)
- ModVal |= SISrcMods::OP_SEL_0;
+ const MCOperand &SrcOp = Inst.getOperand(OpIdx);
+ if (SrcOp.isReg() && getMRI()
+ ->getRegClass(AMDGPU::VGPR_16RegClassID)
+ .contains(SrcOp.getReg())) {
+ bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI());
+ if (VGPRSuffixIsHi)
+ ModVal |= SISrcMods::OP_SEL_0;
+ } else {
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+ }
if ((OpSelHi & (1 << J)) != 0)
ModVal |= SISrcMods::OP_SEL_1;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fba9eb53c8a8..b307865275d0 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -178,8 +178,12 @@ static DecodeStatus decodeAV10(MCInst &Inst, unsigned Imm, uint64_t /* Addr */,
}
// Decoder for Src(9-bit encoding) registers only.
-#define DECODE_OPERAND_SRC_REG_9(RegClass, OpWidth) \
- DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm, false, 0)
+template <AMDGPUDisassembler::OpWidthTy OpWidth>
+static DecodeStatus decodeSrcReg9(MCInst &Inst, unsigned Imm,
+ uint64_t /* Addr */,
+ const MCDisassembler *Decoder) {
+ return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, false, 0, Decoder);
+}
// Decoder for Src(9-bit encoding) AGPR, register number encoded in 9bits, set
// Imm{9} to 1 (set acc) and decode using 'enum10' from decodeSrcOp, registers
@@ -204,22 +208,29 @@ static DecodeStatus decodeSrcAV10(MCInst &Inst, unsigned Imm,
// will be decoded and InstPrinter will report warning. Immediate will be
// decoded into constant of size ImmWidth, should match width of immediate used
// by OperandType (important for floating point types).
-#define DECODE_OPERAND_SRC_REG_OR_IMM_9(RegClass, OpWidth, ImmWidth) \
- DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \
- false, ImmWidth)
-
-#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \
- DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth)
+template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth>
+static DecodeStatus decodeSrcRegOrImm9(MCInst &Inst, unsigned Imm,
+ uint64_t /* Addr */,
+ const MCDisassembler *Decoder) {
+ return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, false, ImmWidth, Decoder);
+}
// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
// and decode using 'enum10' from decodeSrcOp.
-#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \
- DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, \
- Imm | 512, false, ImmWidth)
+template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth>
+static DecodeStatus decodeSrcRegOrImmA9(MCInst &Inst, unsigned Imm,
+ uint64_t /* Addr */,
+ const MCDisassembler *Decoder) {
+ return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm | 512, false, ImmWidth,
+ Decoder);
+}
-#define DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(RegClass, OpWidth, ImmWidth) \
- DECODE_SrcOp(decodeOperand_##RegClass##_Deferred##_Imm##ImmWidth, 9, \
- OpWidth, Imm, true, ImmWidth)
+template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth>
+static DecodeStatus decodeSrcRegOrImmDeferred9(MCInst &Inst, unsigned Imm,
+ uint64_t /* Addr */,
+ const MCDisassembler *Decoder) {
+ return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, true, ImmWidth, Decoder);
+}
// Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
// when RegisterClass is used as an operand. Most often used for destination
@@ -255,51 +266,6 @@ DECODE_OPERAND_REG_8(AReg_256)
DECODE_OPERAND_REG_8(AReg_512)
DECODE_OPERAND_REG_8(AReg_1024)
-// Decoders for register only source RegisterOperands that use use 9-bit Src
-// encoding: 'decodeOperand_<RegClass>'.
-
-DECODE_OPERAND_SRC_REG_9(VGPR_32, OPW32)
-DECODE_OPERAND_SRC_REG_9(VReg_64, OPW64)
-DECODE_OPERAND_SRC_REG_9(VReg_128, OPW128)
-DECODE_OPERAND_SRC_REG_9(VReg_256, OPW256)
-DECODE_OPERAND_SRC_REG_9(VRegOrLds_32, OPW32)
-
-// Decoders for register or immediate RegisterOperands that use 9-bit Src
-// encoding: 'decodeOperand_<RegClass>_Imm<ImmWidth>'.
-
-DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
-
-DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16)
-
-DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
-DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
-DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_512, OPW512, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32)
-
-DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
-DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)
-DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(SReg_32, OPW32, 32)
-
static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm,
uint64_t /*Addr*/,
const MCDisassembler *Decoder) {
@@ -913,6 +879,41 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI,
return Modifiers;
}
+// Instructions decode the op_sel/suffix bits into the src_modifier
+// operands. Copy those bits into the src operands for true16 VGPRs.
+void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ const MCRegisterClass &ConversionRC =
+ MRI.getRegClass(AMDGPU::VGPR_16RegClassID);
+ constexpr std::array<std::tuple<int, int, unsigned>, 4> OpAndOpMods = {
+ {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
+ SISrcMods::DST_OP_SEL}}};
+ for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName);
+ if (OpIdx == -1 || OpModsIdx == -1)
+ continue;
+ MCOperand &Op = MI.getOperand(OpIdx);
+ if (!Op.isReg())
+ continue;
+ if (!ConversionRC.contains(Op.getReg()))
+ continue;
+ unsigned OpEnc = MRI.getEncodingValue(Op.getReg());
+ const MCOperand &OpMods = MI.getOperand(OpModsIdx);
+ unsigned ModVal = OpMods.getImm();
+ if (ModVal & OpSelMask) { // isHi
+ unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1));
+ }
+ }
+}
+
// MAC opcodes have special old and src2 operands.
// src2 is tied to dst, while old is not tied (but assumed to be).
bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
@@ -968,6 +969,7 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ convertTrue16OpSel(MI);
auto Mods = collectVOPModifiers(MI);
insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
AMDGPU::OpName::op_sel);
@@ -991,6 +993,8 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);
+ convertTrue16OpSel(MI);
+
int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5a89b30f6fb3..02feaf553c0c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -203,6 +203,7 @@ public:
DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
void convertMacDPPInst(MCInst &MI) const;
+ void convertTrue16OpSel(MCInst &MI) const;
enum OpWidthTy {
OPW32,
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 96af1a6aab3d..4671e03d43b3 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -204,6 +204,11 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_2.Features
>;
+// [gfx900, gfx902, gfx904, gfx906, gfx909, gfx90c]
+def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
+ FeatureISAVersion9_Generic.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX10.
//===----------------------------------------------------------------------===//
@@ -252,6 +257,16 @@ def : ProcessorModel<"gfx1036", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
+// [gfx1010, gfx1011, gfx1012, gfx1013]
+def : ProcessorModel<"gfx10.1-generic", GFX10SpeedModel,
+ FeatureISAVersion10_1_Generic.Features
+>;
+
+// [gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036]
+def : ProcessorModel<"gfx10.3-generic", GFX10SpeedModel,
+ FeatureISAVersion10_3_Generic.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX11.
//===----------------------------------------------------------------------===//
@@ -280,10 +295,17 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
FeatureISAVersion11_5_1.Features
>;
+// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151]
+def : ProcessorModel<"gfx11-generic", GFX11SpeedModel,
+ FeatureISAVersion11_Generic.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX12.
//===----------------------------------------------------------------------===//
+// TODO: gfx12-generic ?
+
def : ProcessorModel<"gfx1200", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4f8eeaaf500b..c8e1b153e473 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -224,6 +224,8 @@ protected:
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;
+ bool RequiresCOV6 = false;
+
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -1165,6 +1167,8 @@ public:
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
+ bool requiresCodeObjectV6() const { return RequiresCOV6; }
+
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
/// Return if operations acting on VGPR tuples require even alignment.
@@ -1254,12 +1258,6 @@ public:
// \returns true if preloading kernel arguments is supported.
bool hasKernargPreload() const { return KernargPreload; }
- // \returns true if we need to generate backwards compatible code when
- // preloading kernel arguments.
- bool needsKernargPreloadBackwardsCompatibility() const {
- return hasKernargPreload() && !hasGFX940Insts();
- }
-
// \returns true if the target has split barriers feature
bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index abfa4a3531e8..4ab3aa5a0240 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -709,6 +709,7 @@ bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
unsigned OpNo) const {
return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) &&
(Desc.TSFlags & SIInstrFlags::VOPC) &&
+ !isVOPCAsmOnly(Desc.getOpcode()) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO));
}
@@ -725,7 +726,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
// printOperandAndIntInputMods will be called instead
if ((OpNo == 0 ||
(OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) &&
- (Desc.TSFlags & SIInstrFlags::VOPC) &&
+ (Desc.TSFlags & SIInstrFlags::VOPC) && !isVOPCAsmOnly(Desc.getOpcode()) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
printDefaultVccOperand(true, STI, O);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 5e9b1674d87d..4742b0b3e52e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -115,6 +115,10 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: AK = GK_GFX10_1_GENERIC; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: AK = GK_GFX10_3_GENERIC; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC: AK = GK_GFX11_GENERIC; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
default: AK = GK_NONE; break;
}
@@ -193,6 +197,10 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151;
case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
+ case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
+ case GK_GFX10_1_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC;
+ case GK_GFX10_3_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC;
+ case GK_GFX11_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
// clang-format on
@@ -659,6 +667,24 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV6() {
unsigned Flags = getEFlagsV4();
unsigned Version = ForceGenericVersion;
+ if (!Version) {
+ switch (parseArchAMDGCN(STI.getCPU())) {
+ case AMDGPU::GK_GFX9_GENERIC:
+ Version = GenericVersion::GFX9;
+ break;
+ case AMDGPU::GK_GFX10_1_GENERIC:
+ Version = GenericVersion::GFX10_1;
+ break;
+ case AMDGPU::GK_GFX10_3_GENERIC:
+ Version = GenericVersion::GFX10_3;
+ break;
+ case AMDGPU::GK_GFX11_GENERIC:
+ Version = GenericVersion::GFX11;
+ break;
+ default:
+ break;
+ }
+ }
// Versions start at 1.
if (Version) {
@@ -756,18 +782,26 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
}
bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
- const MCSubtargetInfo &STI) {
- for (int i = 0; i < 64; ++i) {
+ const MCSubtargetInfo &STI, bool TrapEnabled) {
+ const char *TrapInstr = TrapEnabled ? "\ts_trap 2" : "\ts_endpgm";
+ OS << TrapInstr
+ << " ; Trap with incompatible firmware that doesn't "
+ "support preloading kernel arguments.\n";
+ for (int i = 0; i < 63; ++i) {
OS << "\ts_nop 0\n";
}
return true;
}
bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool TrapEnabled) {
const uint32_t Encoded_s_nop = 0xbf800000;
+ const uint32_t Encoded_s_trap = 0xbf920002;
+ const uint32_t Encoded_s_endpgm = 0xbf810000;
+ const uint32_t TrapInstr = TrapEnabled ? Encoded_s_trap : Encoded_s_endpgm;
MCStreamer &OS = getStreamer();
- for (int i = 0; i < 64; ++i) {
+ OS.emitInt32(TrapInstr);
+ for (int i = 0; i < 63; ++i) {
OS.emitInt32(Encoded_s_nop);
}
return true;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index ad5f27a33fcb..5aa80ff578c6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -89,7 +89,8 @@ public:
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
/// \returns True on success, false on failure.
- virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
+ virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
+ bool TrapEnabled) {
return true;
}
@@ -146,7 +147,8 @@ public:
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
/// \returns True on success, false on failure.
- bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+ bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
+ bool TrapEnabled) override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
@@ -200,7 +202,8 @@ public:
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
/// \returns True on success, false on failure.
- bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+ bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
+ bool TrapEnabled) override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 19596d53b453..ca6728cf3ddc 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -105,10 +105,16 @@ enum : uint64_t {
WQM = UINT64_C(1) << 35,
DisableWQM = UINT64_C(1) << 36,
Gather4 = UINT64_C(1) << 37,
- SOPK_ZEXT = UINT64_C(1) << 38,
+
+ // Reserved, must be 0.
+ Reserved0 = UINT64_C(1) << 38,
+
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
- VOPAsmPrefer32Bit = UINT64_C(1) << 41,
+
+ // Reserved, must be 0.
+ Reserved1 = UINT64_C(1) << 41,
+
VOP3_OPSEL = UINT64_C(1) << 42,
maybeAtomic = UINT64_C(1) << 43,
renamedInGFX9 = UINT64_C(1) << 44,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a812cdc61500..8bf05682cbe7 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -756,14 +756,14 @@ void SIFoldOperands::foldOperand(
int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
- const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
- if (!isUseSafeToFold(*UseMI, UseOp))
+ if (!isUseSafeToFold(*UseMI, *UseOp))
return;
// FIXME: Fold operands with subregs.
- if (UseOp.isReg() && OpToFold.isReg() &&
- (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
+ if (UseOp->isReg() && OpToFold.isReg() &&
+ (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
return;
// Special case for REG_SEQUENCE: We can't fold literals into
@@ -859,7 +859,6 @@ void SIFoldOperands::foldOperand(
if (MovOp == AMDGPU::COPY)
return;
- UseMI->setDesc(TII->get(MovOp));
MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
while (ImpOpI != ImpOpE) {
@@ -867,6 +866,19 @@ void SIFoldOperands::foldOperand(
ImpOpI++;
UseMI->removeOperand(UseMI->getOperandNo(Tmp));
}
+ UseMI->setDesc(TII->get(MovOp));
+
+ if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+ const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+ MachineOperand NewSrcOp(SrcOp);
+ MachineFunction *MF = UseMI->getParent()->getParent();
+ UseMI->removeOperand(1);
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+ UseMI->addOperand(NewSrcOp); // src0
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+ UseOpIdx = 2;
+ UseOp = &UseMI->getOperand(UseOpIdx);
+ }
CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
@@ -1027,7 +1039,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.
- if (UseDesc.isVariadic() || UseOp.isImplicit() ||
+ if (UseDesc.isVariadic() || UseOp->isImplicit() ||
UseDesc.operands()[UseOpIdx].RegClass == -1)
return;
}
@@ -1062,17 +1074,17 @@ void SIFoldOperands::foldOperand(
TRI->getRegClass(FoldDesc.operands()[0].RegClass);
// Split 64-bit constants into 32-bits for folding.
- if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
- Register UseReg = UseOp.getReg();
+ if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
+ Register UseReg = UseOp->getReg();
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(*UseRC) != 64)
return;
APInt Imm(64, OpToFold.getImm());
- if (UseOp.getSubReg() == AMDGPU::sub0) {
+ if (UseOp->getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
- assert(UseOp.getSubReg() == AMDGPU::sub1);
+ assert(UseOp->getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a64a9e608f21..56f0e7164239 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -468,6 +468,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+
+ if (Subtarget->hasSMemRealTime() ||
+ Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
+ setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
if (Subtarget->has16BitInsts()) {
@@ -2826,8 +2830,7 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
- if (IsKernel && Subtarget->hasKernargPreload() &&
- !Subtarget->needsKernargPreloadBackwardsCompatibility())
+ if (IsKernel && Subtarget->hasKernargPreload())
allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index ab536f8f49d5..bdefcae278ef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -69,10 +69,6 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit Gather4 = 0;
- // Most sopk treat the immediate as a signed 16-bit, however some
- // use it as unsigned.
- field bit SOPKZext = 0;
-
// This is an s_store_dword* instruction that requires a cache flush
// on wave termination. It is necessary to distinguish from mayStore
// SMEM instructions like the cache flush ones.
@@ -82,10 +78,6 @@ class InstSI <dag outs, dag ins, string asm = "",
// instruction size.
field bit FixedSize = 0;
- // This bit tells the assembler to use the 32-bit encoding in case it
- // is unable to infer the encoding from the operands.
- field bit VOPAsmPrefer32Bit = 0;
-
// This bit indicates that this is a VOP3 opcode which supports op_sel
// modifier.
field bit VOP3_OPSEL = 0;
@@ -209,10 +201,15 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{36} = DisableWQM;
let TSFlags{37} = Gather4;
- let TSFlags{38} = SOPKZext;
+ // Reserved, must be 0.
+ let TSFlags{38} = 0;
+
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
- let TSFlags{41} = VOPAsmPrefer32Bit;
+
+ // Reserved, must be 0.
+ let TSFlags{41} = 0;
+
let TSFlags{42} = VOP3_OPSEL;
let TSFlags{43} = maybeAtomic;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c7628bd35430..f5ec831234f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4918,7 +4918,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
} else {
uint64_t Imm = Op->getImm();
- if (sopkIsZext(MI)) {
+ if (sopkIsZext(Opcode)) {
if (!isUInt<16>(Imm)) {
ErrInfo = "invalid immediate for SOPK instruction";
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2838a5c0791f..7a6c28421c8d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -842,12 +842,13 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
}
- static bool sopkIsZext(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT;
- }
-
- bool sopkIsZext(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT;
+ // Most sopk treat the immediate as a signed 16-bit, however some
+ // use it as unsigned.
+ static bool sopkIsZext(unsigned Opcode) {
+ return Opcode == AMDGPU::S_CMPK_EQ_U32 || Opcode == AMDGPU::S_CMPK_LG_U32 ||
+ Opcode == AMDGPU::S_CMPK_GT_U32 || Opcode == AMDGPU::S_CMPK_GE_U32 ||
+ Opcode == AMDGPU::S_CMPK_LT_U32 || Opcode == AMDGPU::S_CMPK_LE_U32 ||
+ Opcode == AMDGPU::S_GETREG_B32;
}
/// \returns true if this is an s_store_dword* instruction. This is more
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7edec5a7a550..22599773d562 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1148,7 +1148,13 @@ def FPT16InputModsMatchClass : FPInputModsMatchClass<16> {
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
-def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>;
+class FP16VCSrcInputModsMatchClass<bit IsFake16>
+ : FPVCSrcInputModsMatchClass<16> {
+ let Name = !if(IsFake16, "RegOrInlineImmWithFPFake16InputMods",
+ "RegOrInlineImmWithFPT16InputMods");
+ let PredicateMethod = "isRegOrInlineImmWithFP16InputMods<" #
+ !if(IsFake16, "true", "false") # ">";
+}
def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>;
class InputMods <AsmOperandClass matchClass> : Operand <i32> {
@@ -1166,7 +1172,8 @@ def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>;
def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
-def FP16VCSrcInputMods : FPInputMods<FP16VCSrcInputModsMatchClass>;
+class FP16VCSrcInputMods<bit IsFake16>
+ : FPInputMods<FP16VCSrcInputModsMatchClass<IsFake16>>;
def FP32VCSrcInputMods : FPInputMods<FP32VCSrcInputModsMatchClass>;
class IntInputModsMatchClass <int opSize> : AsmOperandClass {
@@ -1653,11 +1660,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
}
// Return type of input modifiers operand for specified input operand for DPP
-class getSrcModVOP3DPP <ValueType VT> {
+class getSrcModVOP3DPP <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
!if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
- FP16VCSrcInputMods, FP32VCSrcInputMods),
+ FP16VCSrcInputMods<IsFake16>, FP32VCSrcInputMods),
Int32VCSrcInputMods);
}
@@ -2450,6 +2457,10 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.
class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
+
+ let HasOpSel = 1;
+ let HasModifiers = 1; // All instructions at least have OpSel.
+
// Most DstVT are 16-bit, but not all.
let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
@@ -2461,6 +2472,10 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
+ let Src0VOP3DPP = VGPRSrc_16;
+ let Src0ModVOP3DPP = getSrcModVOP3DPP<Src0VT, 0 /*IsFake16*/>.ret;
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0 /*IsFake16*/>.ret;
+ let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0 /*IsFake16*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b94d143a75e5..52d6fe6c7ba5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -552,14 +552,10 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
const SIRegisterInfo &TRI) {
if (ScavengeFI)
return *ScavengeFI;
- if (isBottomOfStack()) {
- ScavengeFI = MFI.CreateFixedObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
- } else {
- ScavengeFI = MFI.CreateStackObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
- TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
- }
+
+ ScavengeFI =
+ MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
return *ScavengeFI;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 84b9330ef963..f62e808b33e4 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2030,8 +2030,11 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ // The order of invalidates matter here. We must invalidate "outer in"
+ // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
+ // invalidated.
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 1a22b777aeda..3664535b3259 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2287,9 +2287,6 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (FrameReg)
FIOp.ChangeToRegister(FrameReg, false);
- if (!Offset)
- return false;
-
MachineOperand *OffsetOp =
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
int64_t NewOffset = Offset + OffsetOp->getImm();
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c9dbe02037ef..d4a1e8d185a1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1101,96 +1101,106 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
-class RegOrImmOperand <string RegisterClassName, string OperandTypeName,
- string decoderImmSize>
- : RegisterOperand<!cast<RegisterClass>(RegisterClassName)> {
+class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName>
+ : RegisterOperand<RegClass> {
let OperandNamespace = "AMDGPU";
let OperandType = OperandTypeName;
let ParserMatchClass = RegImmMatcher<!subst("_Deferred", "", NAME)>;
- let DecoderMethod = "decodeOperand_" # RegisterClassName # decoderImmSize;
}
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
-def SSrc_b16 : RegOrImmOperand <"SReg_32", "OPERAND_REG_IMM_INT16", "_Imm16">;
-def SSrc_f16 : RegOrImmOperand <"SReg_32", "OPERAND_REG_IMM_FP16", "_Imm16">;
-def SSrc_b32 : RegOrImmOperand <"SReg_32", "OPERAND_REG_IMM_INT32", "_Imm32">;
-def SSrc_f32 : RegOrImmOperand <"SReg_32", "OPERAND_REG_IMM_FP32", "_Imm32">;
-def SSrc_b64 : RegOrImmOperand <"SReg_64", "OPERAND_REG_IMM_INT64", "_Imm64">;
+class SrcRegOrImm9<RegisterClass regClass, string opWidth, string operandType,
+ int immWidth> : RegOrImmOperand<regClass, operandType> {
+ let DecoderMethod = "decodeSrcRegOrImm9<AMDGPUDisassembler::" # opWidth #
+ ", " # immWidth # ">";
+}
+
+def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPW32", "OPERAND_REG_IMM_INT16", 16>;
+def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPW32", "OPERAND_REG_IMM_FP16", 16>;
+def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPW32", "OPERAND_REG_IMM_INT32", 32>;
+def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPW32", "OPERAND_REG_IMM_FP32", 32>;
+def SSrc_b64 : SrcRegOrImm9 <SReg_64, "OPW64", "OPERAND_REG_IMM_INT64", 64>;
-def SSrcOrLds_b32 : RegOrImmOperand <"SRegOrLds_32", "OPERAND_REG_IMM_INT32", "_Imm32">;
+def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPW32", "OPERAND_REG_IMM_INT32", 32>;
//===----------------------------------------------------------------------===//
// SSrc_32_Deferred Operands with an SGPR or a 32-bit immediate for use with
// FMAMK/FMAAK
//===----------------------------------------------------------------------===//
-def SSrc_f32_Deferred : RegOrImmOperand<"SReg_32", "OPERAND_REG_IMM_FP32_DEFERRED", "_Deferred_Imm32">;
+class SrcRegOrImmDeferred9<RegisterClass regClass, string opWidth,
+ string operandType, int immWidth>
+ : RegOrImmOperand<regClass, operandType> {
+ let DecoderMethod = "decodeSrcRegOrImmDeferred9<AMDGPUDisassembler::" #
+ opWidth # ", " # immWidth # ">";
+}
+
+def SSrc_f32_Deferred : SrcRegOrImmDeferred9<SReg_32, "OPW32", "OPERAND_REG_IMM_FP32_DEFERRED", 32>;
//===----------------------------------------------------------------------===//
// SCSrc_* Operands with an SGPR or a inline constant
//===----------------------------------------------------------------------===//
-def SCSrc_b32 : RegOrImmOperand <"SReg_32", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def SCSrc_b64 : RegOrImmOperand <"SReg_64", "OPERAND_REG_INLINE_C_INT64", "_Imm64">;
+def SCSrc_b32 : SrcRegOrImm9 <SReg_32, "OPW32", "OPERAND_REG_INLINE_C_INT32", 32>;
+def SCSrc_b64 : SrcRegOrImm9 <SReg_64, "OPW64", "OPERAND_REG_INLINE_C_INT64", 64>;
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
// The current and temporary future default used case for VOP3.
-def VSrc_b16 : RegOrImmOperand <"VS_32", "OPERAND_REG_IMM_INT16", "_Imm16">;
-def VSrc_f16 : RegOrImmOperand <"VS_32", "OPERAND_REG_IMM_FP16", "_Imm16">;
+def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_IMM_INT16", 16>;
+def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_IMM_FP16", 16>;
// True16 VOP3 operands.
-def VSrcT_b16 : RegOrImmOperand <"VS_16", "OPERAND_REG_IMM_INT16", "_Imm16"> {
+def VSrcT_b16 : RegOrImmOperand <VS_16, "OPERAND_REG_IMM_INT16"> {
let EncoderMethod = "getMachineOpValueT16";
let DecoderMethod = "decodeOperand_VSrcT16";
}
-def VSrcT_f16 : RegOrImmOperand <"VS_16", "OPERAND_REG_IMM_FP16", "_Imm16"> {
+def VSrcT_f16 : RegOrImmOperand <VS_16, "OPERAND_REG_IMM_FP16"> {
let EncoderMethod = "getMachineOpValueT16";
let DecoderMethod = "decodeOperand_VSrcT16";
}
// True16 VOP1/2/C operands.
-def VSrcT_b16_Lo128 : RegOrImmOperand <"VS_16_Lo128", "OPERAND_REG_IMM_INT16", "_Imm16"> {
+def VSrcT_b16_Lo128 : RegOrImmOperand <VS_16_Lo128, "OPERAND_REG_IMM_INT16"> {
let EncoderMethod = "getMachineOpValueT16Lo128";
let DecoderMethod = "decodeOperand_VSrcT16_Lo128";
}
-def VSrcT_f16_Lo128 : RegOrImmOperand <"VS_16_Lo128", "OPERAND_REG_IMM_FP16", "_Imm16"> {
+def VSrcT_f16_Lo128 : RegOrImmOperand <VS_16_Lo128, "OPERAND_REG_IMM_FP16"> {
let EncoderMethod = "getMachineOpValueT16Lo128";
let DecoderMethod = "decodeOperand_VSrcT16_Lo128";
}
// The current and temporary future default used case for fake VOP1/2/C.
// For VOP1,2,C True16 instructions. _Lo128 use first 128 32-bit VGPRs only.
-def VSrcFake16_b16_Lo128 : RegOrImmOperand <"VS_32_Lo128", "OPERAND_REG_IMM_INT16", "_Imm16">;
-def VSrcFake16_f16_Lo128 : RegOrImmOperand <"VS_32_Lo128", "OPERAND_REG_IMM_FP16", "_Imm16">;
-
-def VSrc_b32 : RegOrImmOperand <"VS_32", "OPERAND_REG_IMM_INT32", "_Imm32">;
-def VSrc_f32 : RegOrImmOperand <"VS_32", "OPERAND_REG_IMM_FP32", "_Imm32">;
-def VSrc_v2b16 : RegOrImmOperand <"VS_32", "OPERAND_REG_IMM_V2INT16", "_ImmV2I16">;
-def VSrc_v2f16 : RegOrImmOperand <"VS_32", "OPERAND_REG_IMM_V2FP16", "_ImmV2F16">;
-def VSrc_b64 : RegOrImmOperand <"VS_64", "OPERAND_REG_IMM_INT64", "_Imm64">;
-def VSrc_f64 : RegOrImmOperand <"VS_64", "OPERAND_REG_IMM_FP64", "_Imm64"> {
+def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPW16", "OPERAND_REG_IMM_INT16", 16>;
+def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPW16", "OPERAND_REG_IMM_FP16", 16>;
+
+def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_IMM_INT32", 32>;
+def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_IMM_FP32", 32>;
+def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_IMM_V2INT16", 32>;
+def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_IMM_V2FP16", 16>;
+def VSrc_b64 : SrcRegOrImm9 <VS_64, "OPW64", "OPERAND_REG_IMM_INT64", 64>;
+def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPW64", "OPERAND_REG_IMM_FP64", 64> {
let DecoderMethod = "decodeOperand_VSrc_f64";
}
-def VSrc_v2b32 : RegOrImmOperand <"VS_64", "OPERAND_REG_IMM_V2INT32", "_Imm32">;
-def VSrc_v2f32 : RegOrImmOperand <"VS_64", "OPERAND_REG_IMM_V2FP32", "_Imm32">;
+def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPW64", "OPERAND_REG_IMM_V2INT32", 32>;
+def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPW64", "OPERAND_REG_IMM_V2FP32", 32>;
//===----------------------------------------------------------------------===//
// VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use
// with FMAMK/FMAAK
//===----------------------------------------------------------------------===//
-def VSrc_f16_Deferred : RegOrImmOperand<"VS_32", "OPERAND_REG_IMM_FP16_DEFERRED", "_Deferred_Imm16">;
-def VSrc_f32_Deferred : RegOrImmOperand<"VS_32", "OPERAND_REG_IMM_FP32_DEFERRED", "_Deferred_Imm32">;
+def VSrc_f16_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW16", "OPERAND_REG_IMM_FP16_DEFERRED", 16>;
+def VSrc_f32_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW32", "OPERAND_REG_IMM_FP32_DEFERRED", 32>;
-def VSrcFake16_f16_Lo128_Deferred : RegOrImmOperand<"VS_32_Lo128",
- "OPERAND_REG_IMM_FP16_DEFERRED",
- "_Deferred_Imm16">;
+def VSrcFake16_f16_Lo128_Deferred
+ : SrcRegOrImmDeferred9<VS_32_Lo128, "OPW16", "OPERAND_REG_IMM_FP16_DEFERRED", 16>;
//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
@@ -1198,25 +1208,15 @@ def VSrcFake16_f16_Lo128_Deferred : RegOrImmOperand<"VS_32_Lo128",
// This is for operands with the enum(9), VSrc encoding restriction,
// but only allows VGPRs.
-def VRegSrc_32 : RegisterOperand<VGPR_32> {
- let DecoderMethod = "decodeOperand_VGPR_32";
-}
-
-def VRegSrc_64 : RegisterOperand<VReg_64> {
- let DecoderMethod = "decodeOperand_VReg_64";
-}
-
-def VRegSrc_128 : RegisterOperand<VReg_128> {
- let DecoderMethod = "decodeOperand_VReg_128";
-}
-
-def VRegSrc_256 : RegisterOperand<VReg_256> {
- let DecoderMethod = "decodeOperand_VReg_256";
+class SrcReg9<RegisterClass regClass, string width> : RegisterOperand<regClass> {
+ let DecoderMethod = "decodeSrcReg9<AMDGPUDisassembler::" # width # ">";
}
-def VRegOrLdsSrc_32 : RegisterOperand<VRegOrLds_32> {
- let DecoderMethod = "decodeOperand_VRegOrLds_32";
-}
+def VRegSrc_32 : SrcReg9<VGPR_32, "OPW32">;
+def VRegSrc_64 : SrcReg9<VReg_64, "OPW64">;
+def VRegSrc_128: SrcReg9<VReg_128, "OPW128">;
+def VRegSrc_256: SrcReg9<VReg_256, "OPW256">;
+def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">;
//===----------------------------------------------------------------------===//
// VGPRSrc_*
@@ -1235,6 +1235,12 @@ def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
let EncoderMethod = "getMachineOpValueT16Lo128";
}
+// True 16 operands.
+def VGPRSrc_16 : RegisterOperand<VGPR_16> {
+ let DecoderMethod = "DecodeVGPR_16RegisterClass";
+ let EncoderMethod = "getMachineOpValueT16";
+}
+
//===----------------------------------------------------------------------===//
// ASrc_* Operands with an AccVGPR
//===----------------------------------------------------------------------===//
@@ -1251,30 +1257,30 @@ def ARegSrc_32 : AVOperand<AGPR_32, "decodeSrcA9", "OPW32">;
// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VCSrc_b16 : RegOrImmOperand <"VS_32", "OPERAND_REG_INLINE_C_INT16", "_Imm16">;
-def VCSrc_f16 : RegOrImmOperand <"VS_32", "OPERAND_REG_INLINE_C_FP16", "_Imm16">;
-def VCSrc_b32 : RegOrImmOperand <"VS_32", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def VCSrc_f32 : RegOrImmOperand <"VS_32", "OPERAND_REG_INLINE_C_FP32", "_Imm32">;
-def VCSrc_v2b16 : RegOrImmOperand <"VS_32", "OPERAND_REG_INLINE_C_V2INT16", "_ImmV2I16">;
-def VCSrc_v2f16 : RegOrImmOperand <"VS_32", "OPERAND_REG_INLINE_C_V2FP16", "_ImmV2F16">;
+def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_INLINE_C_INT16", 16>;
+def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_INLINE_C_FP16", 16>;
+def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_INLINE_C_INT32", 32>;
+def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_INLINE_C_FP32", 32>;
+def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_INLINE_C_V2INT16", 32>;
+def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPW32", "OPERAND_REG_INLINE_C_V2FP16", 16>;
//===----------------------------------------------------------------------===//
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VISrc_64_f16 : RegOrImmOperand <"VReg_64", "OPERAND_REG_INLINE_C_FP16", "_Imm16">;
-def VISrc_64_b32 : RegOrImmOperand <"VReg_64", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def VISrc_64_f64 : RegOrImmOperand <"VReg_64", "OPERAND_REG_INLINE_C_FP64", "_Imm64">;
-def VISrc_128_f16 : RegOrImmOperand <"VReg_128", "OPERAND_REG_INLINE_C_FP16", "_Imm16">;
-def VISrc_128_b32 : RegOrImmOperand <"VReg_128", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def VISrc_128_f32 : RegOrImmOperand <"VReg_128", "OPERAND_REG_INLINE_C_FP32", "_Imm32">;
-def VISrc_256_b32 : RegOrImmOperand <"VReg_256", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def VISrc_256_f32 : RegOrImmOperand <"VReg_256", "OPERAND_REG_INLINE_C_FP32", "_Imm32">;
-def VISrc_256_f64 : RegOrImmOperand <"VReg_256", "OPERAND_REG_INLINE_C_FP64", "_Imm64">;
-def VISrc_512_b32 : RegOrImmOperand <"VReg_512", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def VISrc_512_f32 : RegOrImmOperand <"VReg_512", "OPERAND_REG_INLINE_C_FP32", "_Imm32">;
-def VISrc_1024_b32 : RegOrImmOperand <"VReg_1024", "OPERAND_REG_INLINE_C_INT32", "_Imm32">;
-def VISrc_1024_f32 : RegOrImmOperand <"VReg_1024", "OPERAND_REG_INLINE_C_FP32", "_Imm32">;
+def VISrc_64_f16 : SrcRegOrImm9 <VReg_64, "OPW64", "OPERAND_REG_INLINE_C_FP16", 16>;
+def VISrc_64_b32 : SrcRegOrImm9 <VReg_64, "OPW64", "OPERAND_REG_INLINE_C_INT32", 32>;
+def VISrc_64_f64 : SrcRegOrImm9 <VReg_64, "OPW64", "OPERAND_REG_INLINE_C_FP64", 64>;
+def VISrc_128_f16 : SrcRegOrImm9 <VReg_128, "OPW128", "OPERAND_REG_INLINE_C_FP16", 16>;
+def VISrc_128_b32 : SrcRegOrImm9 <VReg_128, "OPW128", "OPERAND_REG_INLINE_C_INT32", 32>;
+def VISrc_128_f32 : SrcRegOrImm9 <VReg_128, "OPW128", "OPERAND_REG_INLINE_C_FP32", 32>;
+def VISrc_256_b32 : SrcRegOrImm9 <VReg_256, "OPW256", "OPERAND_REG_INLINE_C_INT32", 32>;
+def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPW256", "OPERAND_REG_INLINE_C_FP32", 32>;
+def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPW256", "OPERAND_REG_INLINE_C_FP64", 64>;
+def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPW512", "OPERAND_REG_INLINE_C_INT32", 32>;
+def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPW512", "OPERAND_REG_INLINE_C_FP32", 32>;
+def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPW1024", "OPERAND_REG_INLINE_C_INT32", 32>;
+def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPW1024", "OPERAND_REG_INLINE_C_FP32", 32>;
//===----------------------------------------------------------------------===//
// AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR
@@ -1306,11 +1312,18 @@ def AVLdSt_160 : AVLdStOperand<AV_160, "OPW160">;
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
-def AISrc_64_f64 : RegOrImmOperand <"AReg_64", "OPERAND_REG_INLINE_AC_FP64", "_Imm64">;
-def AISrc_128_f32 : RegOrImmOperand <"AReg_128", "OPERAND_REG_INLINE_AC_FP32", "_Imm32">;
-def AISrc_128_b32 : RegOrImmOperand <"AReg_128", "OPERAND_REG_INLINE_AC_INT32", "_Imm32">;
-def AISrc_256_f64 : RegOrImmOperand <"AReg_256", "OPERAND_REG_INLINE_AC_FP64", "_Imm64">;
-def AISrc_512_f32 : RegOrImmOperand <"AReg_512", "OPERAND_REG_INLINE_AC_FP32", "_Imm32">;
-def AISrc_512_b32 : RegOrImmOperand <"AReg_512", "OPERAND_REG_INLINE_AC_INT32", "_Imm32">;
-def AISrc_1024_f32 : RegOrImmOperand <"AReg_1024", "OPERAND_REG_INLINE_AC_FP32", "_Imm32">;
-def AISrc_1024_b32 : RegOrImmOperand <"AReg_1024", "OPERAND_REG_INLINE_AC_INT32", "_Imm32">;
+class SrcRegOrImmA9<RegisterClass regClass, string opWidth, string operandType,
+ int immWidth>
+ : RegOrImmOperand<regClass, operandType> {
+ let DecoderMethod = "decodeSrcRegOrImmA9<AMDGPUDisassembler::" # opWidth #
+ ", " # immWidth # ">";
+}
+
+def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64, "OPW64", "OPERAND_REG_INLINE_AC_FP64", 64>;
+def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128, "OPW128", "OPERAND_REG_INLINE_AC_FP32", 32>;
+def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128, "OPW128", "OPERAND_REG_INLINE_AC_INT32", 32>;
+def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256, "OPW256", "OPERAND_REG_INLINE_AC_FP64", 64>;
+def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPW512", "OPERAND_REG_INLINE_AC_FP32", 32>;
+def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPW512", "OPERAND_REG_INLINE_AC_INT32", 32>;
+def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPW1024", "OPERAND_REG_INLINE_AC_FP32", 32>;
+def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPW1024", "OPERAND_REG_INLINE_AC_INT32", 32>;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index d290dd82b760..3c6f6ddfd89d 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -251,9 +251,9 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
- if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
- (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
- if (!TII->sopkIsZext(SOPKOpc))
+ if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
+ (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
+ if (!SIInstrInfo::sopkIsZext(SOPKOpc))
Src1.setImm(SignExtend64(Src1.getImm(), 32));
MI.setDesc(NewDesc);
}
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index f3096962e2f3..29651a839039 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1068,6 +1068,20 @@ def : GCNPat <
}
} // let OtherPredicates = [HasShaderCyclesRegister]
+let OtherPredicates = [HasSMemRealTime] in {
+def : GCNPat <
+ (i64 (readsteadycounter)),
+ (S_MEMREALTIME)
+>;
+} // let OtherPredicates = [HasSMemRealTime]
+
+let SubtargetPredicate = isGFX11Plus in {
+def : GCNPat <
+ (i64 (readsteadycounter)),
+ (S_SENDMSG_RTN_B64 (i32 /*MSG_RTN_GET_REALTIME=*/0x83))
+>;
+} // let SubtargetPredicate = [isGFX11Plus]
+
def i32imm_zero : TImmLeaf <i32, [{
return Imm == 0;
}]>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index c8e8ad2034dc..1486df04ed95 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1078,14 +1078,12 @@ def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>;
def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>;
def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>;
-let SOPKZext = 1 in {
def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>;
def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>;
def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>;
def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>;
def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>;
def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>;
-} // End SOPKZext = 1
} // End isCompare = 1
let isCommutable = 1, DisableEncoding = "$src0",
@@ -1111,7 +1109,6 @@ def S_GETREG_B32 : SOPK_Pseudo <
(outs SReg_32:$sdst), (ins hwreg:$simm16),
"$sdst, $simm16",
[(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
- let SOPKZext = 1;
let hasSideEffects = 1;
}
@@ -2620,7 +2617,8 @@ defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>;
defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>;
defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>;
defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>;
-defm S_BARRIER : SOPP_Real_32_gfx11_gfx12<0x03d>;
+
+defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>;
//===----------------------------------------------------------------------===//
// SOPP - GFX1150, GFX12.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 33335ac75df7..800dfcf3076d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -342,6 +342,14 @@ struct VOPC64DPPInfo {
uint16_t Opcode;
};
+struct VOPCDPPAsmOnlyInfo {
+ uint16_t Opcode;
+};
+
+struct VOP3CDPPAsmOnlyInfo {
+ uint16_t Opcode;
+};
+
struct VOPDComponentInfo {
uint16_t BaseVOP;
uint16_t VOPDOp;
@@ -376,6 +384,10 @@ struct VOPTrue16Info {
#define GET_VOPC64DPPTable_IMPL
#define GET_VOPC64DPP8Table_DECL
#define GET_VOPC64DPP8Table_IMPL
+#define GET_VOPCAsmOnlyInfoTable_DECL
+#define GET_VOPCAsmOnlyInfoTable_IMPL
+#define GET_VOP3CAsmOnlyInfoTable_DECL
+#define GET_VOP3CAsmOnlyInfoTable_IMPL
#define GET_VOPDComponentTable_DECL
#define GET_VOPDComponentTable_IMPL
#define GET_VOPDPairs_DECL
@@ -477,6 +489,10 @@ bool isVOPC64DPP(unsigned Opc) {
return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc);
}
+bool isVOPCAsmOnly(unsigned Opc) {
+ return isVOPCAsmOnlyOpcodeHelper(Opc) || isVOP3CAsmOnlyOpcodeHelper(Opc);
+}
+
bool getMAIIsDGEMM(unsigned Opc) {
const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
return Info ? Info->is_dgemm : false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f24b9f0e3615..b56025f55519 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -42,6 +42,17 @@ namespace AMDGPU {
struct IsaVersion;
+/// Generic target versions emitted by this version of LLVM.
+///
+/// These numbers are incremented every time a codegen breaking change occurs
+/// within a generic family.
+namespace GenericVersion {
+static constexpr unsigned GFX9 = 1;
+static constexpr unsigned GFX10_1 = 1;
+static constexpr unsigned GFX10_3 = 1;
+static constexpr unsigned GFX11 = 1;
+} // namespace GenericVersion
+
enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5, AMDHSA_COV6 = 6 };
/// \returns True if \p STI is AMDHSA.
@@ -489,6 +500,9 @@ bool getVOP3IsSingle(unsigned Opc);
LLVM_READONLY
bool isVOPC64DPP(unsigned Opc);
+LLVM_READONLY
+bool isVOPCAsmOnly(unsigned Opc);
+
/// Returns true if MAI operation is a double precision GEMM.
LLVM_READONLY
bool getMAIIsDGEMM(unsigned Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 58b67b21e274..41a03bb1e73c 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -217,9 +217,7 @@ def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_Fake16<VOP_I16_F16> {
// VOP1 Instructions
//===----------------------------------------------------------------------===//
-let VOPAsmPrefer32Bit = 1 in {
defm V_NOP : VOP1Inst <"v_nop", VOP_NOP_PROFILE>;
-}
def VOPProfile_MOV : VOPProfile <[i32, i32, untyped, untyped]> {
let InsVOPDX = (ins Src0RC32:$src0X);
@@ -368,9 +366,7 @@ defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amd
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
} // End isReMaterializable = 1
-let VOPAsmPrefer32Bit = 1 in {
defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
-}
// Restrict src0 to be VGPR
def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 861c9f75e866..fe52a0e39e4f 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1165,6 +1165,8 @@ class VOPC_DPPe_Common<bits<8> op> : Enc64 {
class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P>
: VOP_DPP_Base<OpName, P, P.InsDPP16, " " #P.AsmDPP16>,
VOPC_DPPe_Common<op> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+
bits<2> src0_modifiers;
bits<8> src0;
bits<2> src1_modifiers;
@@ -1194,6 +1196,8 @@ class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P>
class VOPC_DPP8_Base<bits<8> op, string OpName, VOPProfile P>
: VOP_DPP8_Base<OpName, P, P.InsDPP8, " " #P.AsmDPP8>,
VOPC_DPPe_Common<op> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+
bits<8> src0;
bits<24> dpp8;
bits<9> fi;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 20d7c88fb7e5..f2bb58ed4c3b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -16,7 +16,6 @@ class LetDummies {
bit isMoveImm;
bit isReMaterializable;
bit isAsCheapAsAMove;
- bit VOPAsmPrefer32Bit;
bit FPDPRounding;
Predicate SubtargetPredicate;
string Constraints;
@@ -1669,6 +1668,19 @@ class VOPC64Table <string Format> : GenericTable {
def VOPC64DPPTable : VOPC64Table<"DPP">;
def VOPC64DPP8Table : VOPC64Table<"DPP8">;
+class AsmOnlyInfoTable <string Format, string Class>: GenericTable {
+ let FilterClass = Class;
+ let FilterClassField = "isAsmParserOnly";
+ let CppTypeName = Format # "DPPAsmOnlyInfo";
+ let Fields = ["Opcode"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "is" # Format # "AsmOnlyOpcodeHelper";
+}
+
+def VOPCAsmOnlyInfoTable : AsmOnlyInfoTable <"VOPC", "VOPC_DPPe_Common">;
+def VOP3CAsmOnlyInfoTable : AsmOnlyInfoTable <"VOP3C", "VOP3_DPPe_Common_Base">;
+
def VOPTrue16Table : GenericTable {
let FilterClass = "VOP_Pseudo";
let CppTypeName = "VOPTrue16Info";
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b5c4a8a322ea..b98006ed0cb3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6702,8 +6702,8 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
// If the shift amount is greater than 32 or has a greater bitwidth than 64
// then do the default optimisation
- if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
- (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
+ if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
+ (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
return SDValue();
// Extract the lower 32 bits of the shift amount if it's not an i32
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 67187c4b1b3a..c5199aab7527 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -223,7 +223,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.libcallForCartesianProduct({s32, s64}, {s32});
- getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).libcall();
+ getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV})
+ .libcall();
}
// Just expand whatever loads and stores are left.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index a1b11016de33..3be894ad3bef 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1214,6 +1214,10 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+ // Treat extractsubvector as single op permutation.
+ bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
+ if (IsExtractSubvector)
+ Kind = TTI::SK_PermuteSingleSrc;
if (ST->hasNEON()) {
if (Kind == TTI::SK_Broadcast) {
static const CostTblEntry NEONDupTbl[] = {
@@ -1308,6 +1312,9 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
}
}
+ // Restore optimal kind.
+ if (IsExtractSubvector)
+ Kind = TTI::SK_ExtractSubvector;
int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
: 1;
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 90697c6645be..0d1eef60c3b5 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -229,6 +229,7 @@ public:
return StringSwitch<bool>(Name.lower())
.Case("if", true)
.Case("call", true)
+ .Case("callx", true)
.Case("goto", true)
.Case("gotol", true)
.Case("*", true)
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 7d443a344901..690d53420718 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -622,9 +622,9 @@ class CALLX<string OpcodeStr>
(ins GPR:$BrDst),
!strconcat(OpcodeStr, " $BrDst"),
[]> {
- bits<32> BrDst;
+ bits<4> BrDst;
- let Inst{31-0} = BrDst;
+ let Inst{51-48} = BrDst;
let BPFClass = BPF_JMP;
}
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index aec64607e246..52158139a258 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -12,34 +12,52 @@
//===----------------------------------------------------------------------===//
include "llvm/IR/Intrinsics.td"
+include "llvm/IR/Attributes.td"
// Abstract representation of the class a DXIL Operation belongs to.
-class DxilOpClass<string name> {
+class DXILOpClass<string name> {
string Name = name;
}
// Abstract representation of the category a DXIL Operation belongs to
-class DxilOpCategory<string name> {
+class DXILOpCategory<string name> {
string Name = name;
}
-def UnaryClass : DxilOpClass<"Unary">;
-def BinaryClass : DxilOpClass<"Binary">;
-def FlattenedThreadIdInGroupClass : DxilOpClass<"FlattenedThreadIdInGroup">;
-def ThreadIdInGroupClass : DxilOpClass<"ThreadIdInGroup">;
-def ThreadIdClass : DxilOpClass<"ThreadId">;
-def GroupIdClass : DxilOpClass<"GroupId">;
+def UnaryClass : DXILOpClass<"Unary">;
+def BinaryClass : DXILOpClass<"Binary">;
+def FlattenedThreadIdInGroupClass : DXILOpClass<"FlattenedThreadIdInGroup">;
+def ThreadIdInGroupClass : DXILOpClass<"ThreadIdInGroup">;
+def ThreadIdClass : DXILOpClass<"ThreadId">;
+def GroupIdClass : DXILOpClass<"GroupId">;
-def BinaryUintCategory : DxilOpCategory<"Binary uint">;
-def UnaryFloatCategory : DxilOpCategory<"Unary float">;
-def ComputeIDCategory : DxilOpCategory<"Compute/Mesh/Amplification shader">;
+def BinaryUintCategory : DXILOpCategory<"Binary uint">;
+def UnaryFloatCategory : DXILOpCategory<"Unary float">;
+def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">;
+
+// Following are the scalar types supported by DXIL operations and are synonymous
+// to llvm_*_ty defined for readability and ease of use in the context of this file.
+
+def voidTy : LLVMType<isVoid>;
+
+// Floating point types
+def f16Ty : LLVMType<f16>;
+def f32Ty : LLVMType<f32>;
+def f64Ty : LLVMType<f64>;
+
+// Integer types
+def i1Ty : LLVMType<i1>;
+def i8Ty : LLVMType<i8>;
+def i16Ty : LLVMType<i16>;
+def i32Ty : LLVMType<i32>;
+def i64Ty : LLVMType<i64>;
// The parameter description for a DXIL operation
-class DxilOpParameter<int pos, string type, string name, string doc,
+class DXILOpParameter<int pos, string type, string name, string doc,
bit isConstant = 0, string enumName = "",
int maxValue = 0> {
int Pos = pos; // Position in parameter list
- string LLVMType = type; // LLVM type name, $o for overload, $r for resource
+ string Type = type; // LLVM type name, $o for overload, $r for resource
// type, $cb for legacy cbuffer, $u4 for u4 struct
string Name = name; // Short, unique parameter name
string Doc = doc; // Description of this parameter
@@ -49,19 +67,18 @@ class DxilOpParameter<int pos, string type, string name, string doc,
}
// A representation for a DXIL operation
-class DxilOperationDesc<string name> {
- // TODO : Appears redundant. OpName should serve the same purpose
- string Name = name; // short, unique name
-
+class DXILOperationDesc {
string OpName = ""; // Name of DXIL operation
int OpCode = 0; // Unique non-negative integer associated with the operation
- DxilOpClass OpClass; // Class of the operation
- DxilOpCategory OpCategory; // Category of the operation
+ DXILOpClass OpClass; // Class of the operation
+ DXILOpCategory OpCategory; // Category of the operation
string Doc = ""; // Description of the operation
- list<DxilOpParameter> Params = []; // Parameter list of the operation
- string OverloadTypes = ""; // Overload types, if applicable
- string Attributes = ""; // Attribute shorthands: rn=does not access
- // memory,ro=only reads from memory,
+ list<DXILOpParameter> Params = []; // Parameter list of the operation
+ list<LLVMType> OverloadTypes = []; // Overload types, if applicable
+ EnumAttr Attribute; // Operation Attribute. Leverage attributes defined in Attributes.td
+ // ReadNone - operation does not access memory.
+ // ReadOnly - only reads from memory.
+ // "ReadMemory" - reads memory
bit IsDerivative = 0; // Whether this is some kind of derivative
bit IsGradient = 0; // Whether this requires a gradient calculation
bit IsFeedback = 0; // Whether this is a sampler feedback operation
@@ -73,9 +90,9 @@ class DxilOperationDesc<string name> {
list<string> StatsGroup = [];
}
-class DxilOperation<string name, int opCode, DxilOpClass opClass, DxilOpCategory opCategory, string doc,
- string oloadTypes, string attrs, list<DxilOpParameter> params,
- list<string> statsGroup = []> : DxilOperationDesc<name> {
+class DXILOperation<string name, int opCode, DXILOpClass opClass, DXILOpCategory opCategory, string doc,
+ list<LLVMType> oloadTypes, EnumAttr attrs, list<DXILOpParameter> params,
+ list<string> statsGroup = []> : DXILOperationDesc {
let OpName = name;
let OpCode = opCode;
let Doc = doc;
@@ -83,63 +100,63 @@ class DxilOperation<string name, int opCode, DxilOpClass opClass, DxilOpCategory
let OpClass = opClass;
let OpCategory = opCategory;
let OverloadTypes = oloadTypes;
- let Attributes = attrs;
+ let Attribute = attrs;
let StatsGroup = statsGroup;
}
// LLVM intrinsic that DXIL operation maps to.
class LLVMIntrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; }
-def Sin : DxilOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.",
- "half;float;", "rn",
+def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.",
+ [f16Ty,f32Ty], ReadNone,
[
- DxilOpParameter<0, "$o", "", "operation result">,
- DxilOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DxilOpParameter<2, "$o", "value", "input value">
+ DXILOpParameter<0, "$o", "", "operation result">,
+ DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
+ DXILOpParameter<2, "$o", "value", "input value">
],
["floats"]>,
LLVMIntrinsic<int_sin>;
-def UMax : DxilOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
- "i16;i32;i64;", "rn",
+def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
+ [i16Ty,i32Ty,i64Ty], ReadNone,
[
- DxilOpParameter<0, "$o", "", "operation result">,
- DxilOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DxilOpParameter<2, "$o", "a", "input value">,
- DxilOpParameter<3, "$o", "b", "input value">
+ DXILOpParameter<0, "$o", "", "operation result">,
+ DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
+ DXILOpParameter<2, "$o", "a", "input value">,
+ DXILOpParameter<3, "$o", "b", "input value">
],
["uints"]>,
LLVMIntrinsic<int_umax>;
-def ThreadId : DxilOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", "i32;", "rn",
+def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone,
[
- DxilOpParameter<0, "i32", "", "thread ID component">,
- DxilOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DxilOpParameter<2, "i32", "component", "component to read (x,y,z)">
+ DXILOpParameter<0, "i32", "", "thread ID component">,
+ DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
+ DXILOpParameter<2, "i32", "component", "component to read (x,y,z)">
]>,
LLVMIntrinsic<int_dx_thread_id>;
-def GroupId : DxilOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", "i32;", "rn",
+def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone,
[
- DxilOpParameter<0, "i32", "", "group ID component">,
- DxilOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DxilOpParameter<2, "i32", "component", "component to read">
+ DXILOpParameter<0, "i32", "", "group ID component">,
+ DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
+ DXILOpParameter<2, "i32", "component", "component to read">
]>,
LLVMIntrinsic<int_dx_group_id>;
-def ThreadIdInGroup : DxilOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
- "reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn",
+def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory,
+ "reads the thread ID within the group (SV_GroupThreadID)", [i32Ty], ReadNone,
[
- DxilOpParameter<0, "i32", "", "thread ID in group component">,
- DxilOpParameter<1, "i32", "opcode", "DXIL opcode">,
- DxilOpParameter<2, "i32", "component", "component to read (x,y,z)">
+ DXILOpParameter<0, "i32", "", "thread ID in group component">,
+ DXILOpParameter<1, "i32", "opcode", "DXIL opcode">,
+ DXILOpParameter<2, "i32", "component", "component to read (x,y,z)">
]>,
LLVMIntrinsic<int_dx_thread_id_in_group>;
-def FlattenedThreadIdInGroup : DxilOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
- "provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn",
+def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory,
+ "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone,
[
- DxilOpParameter<0, "i32", "", "result">,
- DxilOpParameter<1, "i32", "opcode", "DXIL opcode">
+ DXILOpParameter<0, "i32", "", "result">,
+ DXILOpParameter<1, "i32", "opcode", "DXIL opcode">
]>,
LLVMIntrinsic<int_dx_flattened_thread_id_in_group>;
diff --git a/llvm/lib/Target/DirectX/DXILMetadata.cpp b/llvm/lib/Target/DirectX/DXILMetadata.cpp
index db55f25c5077..2d94490a7f24 100644
--- a/llvm/lib/Target/DirectX/DXILMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILMetadata.cpp
@@ -213,7 +213,7 @@ public:
// FIXME: add signature for profile other than CS.
// See https://github.com/llvm/llvm-project/issues/57928.
MDTuple *Signatures = nullptr;
- return emitDxilEntryPointTuple(
+ return emitDXILEntryPointTuple(
&F, F.getName().str(), Signatures, Resources,
Props.emitDXILEntryProps(RawShaderFlag, Ctx, /*IsLib*/ false), Ctx);
}
@@ -222,7 +222,7 @@ public:
// FIXME: add signature for profile other than CS.
// See https://github.com/llvm/llvm-project/issues/57928.
MDTuple *Signatures = nullptr;
- return emitDxilEntryPointTuple(
+ return emitDXILEntryPointTuple(
&F, F.getName().str(), Signatures,
/*entry in lib doesn't need resources metadata*/ nullptr,
Props.emitDXILEntryProps(RawShaderFlag, Ctx, /*IsLib*/ true), Ctx);
@@ -233,13 +233,13 @@ public:
static MDTuple *emitEmptyEntryForLib(MDTuple *Resources,
uint64_t RawShaderFlag,
LLVMContext &Ctx) {
- return emitDxilEntryPointTuple(
+ return emitDXILEntryPointTuple(
nullptr, "", nullptr, Resources,
EntryProps::emitEntryPropsForEmptyEntry(RawShaderFlag, Ctx), Ctx);
}
private:
- static MDTuple *emitDxilEntryPointTuple(Function *Fn, const std::string &Name,
+ static MDTuple *emitDXILEntryPointTuple(Function *Fn, const std::string &Name,
MDTuple *Signatures,
MDTuple *Resources,
MDTuple *Properties,
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 812e5f7ad930..232651132d6e 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -2688,3 +2688,67 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
return false;
}
+
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct HexagonFrameSortingObject {
+ bool IsValid = false;
+ unsigned Index = 0; // Index of Object into MFI list.
+ unsigned Size = 0;
+ Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
+};
+
+struct HexagonFrameSortingComparator {
+ inline bool operator()(const HexagonFrameSortingObject &A,
+ const HexagonFrameSortingObject &B) const {
+ return std::make_tuple(!A.IsValid, A.ObjectAlignment, A.Size) <
+ std::make_tuple(!B.IsValid, B.ObjectAlignment, B.Size);
+ }
+};
+} // namespace
+
+// Sort objects on the stack by alignment value and then by size to minimize
+// padding.
+void HexagonFrameLowering::orderFrameObjects(
+ const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+
+ if (ObjectsToAllocate.empty())
+ return;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ int NObjects = ObjectsToAllocate.size();
+
+ // Create an array of all MFI objects.
+ SmallVector<HexagonFrameSortingObject> SortingObjects(
+ MFI.getObjectIndexEnd());
+
+ for (int i = 0, j = 0, e = MFI.getObjectIndexEnd(); i < e && j != NObjects;
+ ++i) {
+ if (i != ObjectsToAllocate[j])
+ continue;
+ j++;
+
+ // A variable size object has size equal to 0. Since Hexagon sets
+ // getUseLocalStackAllocationBlock() to true, a local block is allocated
+ // earlier. This case is not handled here for now.
+ int Size = MFI.getObjectSize(i);
+ if (Size == 0)
+ return;
+
+ SortingObjects[i].IsValid = true;
+ SortingObjects[i].Index = i;
+ SortingObjects[i].Size = Size;
+ SortingObjects[i].ObjectAlignment = MFI.getObjectAlign(i);
+ }
+
+ // Sort objects by alignment and then by size.
+ llvm::stable_sort(SortingObjects, HexagonFrameSortingComparator());
+
+ // Modify the original list to represent the final order.
+ int i = NObjects;
+ for (auto &Obj : SortingObjects) {
+ if (i == 0)
+ break;
+ ObjectsToAllocate[--i] = Obj.Index;
+ }
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index b2222f08a710..98e69dcc4b39 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -35,6 +35,10 @@ public:
explicit HexagonFrameLowering()
: TargetFrameLowering(StackGrowsDown, Align(8), 0, Align(1), true) {}
+ void
+ orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
// All of the prolog/epilog functionality, including saving and restoring
// callee-saved registers is handled in emitPrologue. This is to have the
// logic for shrink-wrapping in one place.
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index aa31762a9696..0e82bf6e5331 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -47,6 +47,8 @@ static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit",
cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode "
"optimization"));
+extern cl::opt<unsigned> RDFFuncBlockLimit;
+
namespace llvm {
FunctionPass *createHexagonOptAddrMode();
@@ -856,6 +858,14 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
+ // Perform RDF optimizations only if number of basic blocks in the
+ // function is less than the limit
+ if (MF.size() > RDFFuncBlockLimit) {
+ LLVM_DEBUG(dbgs() << "Skipping " << getPassName()
+ << ": too many basic blocks\n");
+ return false;
+ }
+
bool Changed = false;
auto &HST = MF.getSubtarget<HexagonSubtarget>();
MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 7eccbd2cb023..4131f2a31755 100644
--- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -50,6 +50,9 @@ static unsigned RDFCount = 0;
static cl::opt<unsigned>
RDFLimit("hexagon-rdf-limit",
cl::init(std::numeric_limits<unsigned>::max()));
+
+extern cl::opt<unsigned> RDFFuncBlockLimit;
+
static cl::opt<bool> RDFDump("hexagon-rdf-dump", cl::Hidden);
static cl::opt<bool> RDFTrackReserved("hexagon-rdf-track-reserved", cl::Hidden);
@@ -285,6 +288,14 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
+ // Perform RDF optimizations only if number of basic blocks in the
+ // function is less than the limit
+ if (MF.size() > RDFFuncBlockLimit) {
+ if (RDFDump)
+ dbgs() << "Skipping " << getPassName() << ": too many basic blocks\n";
+ return false;
+ }
+
if (RDFLimit.getPosition()) {
if (RDFCount >= RDFLimit)
return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index e7a692d67ba0..7d4b420071c4 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -39,6 +39,10 @@ static cl::opt<bool>
static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::init(true),
cl::desc("Enable RDF-based optimizations"));
+cl::opt<unsigned> RDFFuncBlockLimit(
+ "rdf-bb-limit", cl::Hidden, cl::init(1000),
+ cl::desc("Basic block limit for a function for RDF optimizations"));
+
static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
index 4478a574a240..654f29d08af0 100644
--- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -357,13 +357,6 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
.addImm(0);
break;
}
- case G_BRCOND: {
- MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::BNE))
- .add(I.getOperand(0))
- .addUse(Mips::ZERO)
- .add(I.getOperand(1));
- break;
- }
case G_BRJT: {
unsigned EntrySize =
MF.getJumpTableInfo()->getEntrySize(MF.getDataLayout());
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 6c4879ba183c..2219d9f6619a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -57,6 +57,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
@@ -605,14 +606,33 @@ void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
O << getVirtualRegisterName(vr);
}
+void NVPTXAsmPrinter::emitAliasDeclaration(const GlobalAlias *GA,
+ raw_ostream &O) {
+ const Function *F = dyn_cast_or_null<Function>(GA->getAliaseeObject());
+ if (!F || isKernelFunction(*F) || F->isDeclaration())
+ report_fatal_error(
+ "NVPTX aliasee must be a non-kernel function definition");
+
+ if (GA->hasLinkOnceLinkage() || GA->hasWeakLinkage() ||
+ GA->hasAvailableExternallyLinkage() || GA->hasCommonLinkage())
+ report_fatal_error("NVPTX aliasee must not be '.weak'");
+
+ emitDeclarationWithName(F, getSymbol(GA), O);
+}
+
void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
+ emitDeclarationWithName(F, getSymbol(F), O);
+}
+
+void NVPTXAsmPrinter::emitDeclarationWithName(const Function *F, MCSymbol *S,
+ raw_ostream &O) {
emitLinkageDirective(F, O);
if (isKernelFunction(*F))
O << ".entry ";
else
O << ".func ";
printReturnValStr(F, O);
- getSymbol(F)->print(O, MAI);
+ S->print(O, MAI);
O << "\n";
emitFunctionParamList(F, O);
O << "\n";
@@ -759,6 +779,8 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
}
seenMap[&F] = true;
}
+ for (const GlobalAlias &GA : M.aliases())
+ emitAliasDeclaration(&GA, O);
}
static bool isEmptyXXStructor(GlobalVariable *GV) {
@@ -853,25 +875,9 @@ void NVPTXAsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) {
raw_svector_ostream OS(Str);
MCSymbol *Name = getSymbol(&GA);
- const Function *F = dyn_cast<Function>(GA.getAliasee());
- if (!F || isKernelFunction(*F))
- report_fatal_error("NVPTX aliasee must be a non-kernel function");
-
- if (GA.hasLinkOnceLinkage() || GA.hasWeakLinkage() ||
- GA.hasAvailableExternallyLinkage() || GA.hasCommonLinkage())
- report_fatal_error("NVPTX aliasee must not be '.weak'");
-
- OS << "\n";
- emitLinkageDirective(F, OS);
- OS << ".func ";
- printReturnValStr(F, OS);
- OS << Name->getName();
- emitFunctionParamList(F, OS);
- if (shouldEmitPTXNoReturn(F, TM))
- OS << "\n.noreturn";
- OS << ";\n";
- OS << ".alias " << Name->getName() << ", " << F->getName() << ";\n";
+ OS << ".alias " << Name->getName() << ", " << GA.getAliaseeObject()->getName()
+ << ";\n";
OutStreamer->emitRawText(OS.str());
}
@@ -932,16 +938,6 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
GlobalsEmitted = true;
}
- // If we have any aliases we emit them at the end.
- SmallVector<GlobalAlias *> AliasesToRemove;
- for (GlobalAlias &Alias : M.aliases()) {
- emitGlobalAlias(M, Alias);
- AliasesToRemove.push_back(&Alias);
- }
-
- for (GlobalAlias *A : AliasesToRemove)
- A->eraseFromParent();
-
// call doFinalization
bool ret = AsmPrinter::doFinalization(M);
@@ -1530,7 +1526,6 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (isKernelFunction(*F)) {
if (isSampler(*I) || isImage(*I)) {
if (isImage(*I)) {
- std::string sname = std::string(I->getName());
if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
if (hasImageHandles)
O << "\t.param .u64 .ptr .surfref ";
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 7f0f37e7207d..979d185a97f7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -27,6 +27,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCExpr.h"
@@ -174,7 +175,7 @@ private:
void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
bool processDemoted, const NVPTXSubtarget &STI);
void emitGlobals(const Module &M);
- void emitGlobalAlias(const Module &M, const GlobalAlias &GA);
+ void emitGlobalAlias(const Module &M, const GlobalAlias &GA) override;
void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI);
void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
void emitVirtualRegister(unsigned int vr, raw_ostream &);
@@ -222,6 +223,8 @@ private:
void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
void emitDeclarations(const Module &, raw_ostream &O);
void emitDeclaration(const Function *, raw_ostream &O);
+ void emitAliasDeclaration(const GlobalAlias *, raw_ostream &O);
+ void emitDeclarationWithName(const Function *, MCSymbol *, raw_ostream &O);
void emitDemotedVars(const Function *, raw_ostream &);
bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 24e0be2b9566..7f58b312e7a2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -489,6 +489,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+ if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
+ setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
+
setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
@@ -858,623 +862,432 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+
+#define MAKE_CASE(V) \
+ case V: \
+ return #V;
+
switch ((NVPTXISD::NodeType)Opcode) {
case NVPTXISD::FIRST_NUMBER:
break;
- case NVPTXISD::CALL:
- return "NVPTXISD::CALL";
- case NVPTXISD::RET_GLUE:
- return "NVPTXISD::RET_GLUE";
- case NVPTXISD::LOAD_PARAM:
- return "NVPTXISD::LOAD_PARAM";
- case NVPTXISD::Wrapper:
- return "NVPTXISD::Wrapper";
- case NVPTXISD::DeclareParam:
- return "NVPTXISD::DeclareParam";
- case NVPTXISD::DeclareScalarParam:
- return "NVPTXISD::DeclareScalarParam";
- case NVPTXISD::DeclareRet:
- return "NVPTXISD::DeclareRet";
- case NVPTXISD::DeclareScalarRet:
- return "NVPTXISD::DeclareScalarRet";
- case NVPTXISD::DeclareRetParam:
- return "NVPTXISD::DeclareRetParam";
- case NVPTXISD::PrintCall:
- return "NVPTXISD::PrintCall";
- case NVPTXISD::PrintConvergentCall:
- return "NVPTXISD::PrintConvergentCall";
- case NVPTXISD::PrintCallUni:
- return "NVPTXISD::PrintCallUni";
- case NVPTXISD::PrintConvergentCallUni:
- return "NVPTXISD::PrintConvergentCallUni";
- case NVPTXISD::LoadParam:
- return "NVPTXISD::LoadParam";
- case NVPTXISD::LoadParamV2:
- return "NVPTXISD::LoadParamV2";
- case NVPTXISD::LoadParamV4:
- return "NVPTXISD::LoadParamV4";
- case NVPTXISD::StoreParam:
- return "NVPTXISD::StoreParam";
- case NVPTXISD::StoreParamV2:
- return "NVPTXISD::StoreParamV2";
- case NVPTXISD::StoreParamV4:
- return "NVPTXISD::StoreParamV4";
- case NVPTXISD::StoreParamS32:
- return "NVPTXISD::StoreParamS32";
- case NVPTXISD::StoreParamU32:
- return "NVPTXISD::StoreParamU32";
- case NVPTXISD::CallArgBegin:
- return "NVPTXISD::CallArgBegin";
- case NVPTXISD::CallArg:
- return "NVPTXISD::CallArg";
- case NVPTXISD::LastCallArg:
- return "NVPTXISD::LastCallArg";
- case NVPTXISD::CallArgEnd:
- return "NVPTXISD::CallArgEnd";
- case NVPTXISD::CallVoid:
- return "NVPTXISD::CallVoid";
- case NVPTXISD::CallVal:
- return "NVPTXISD::CallVal";
- case NVPTXISD::CallSymbol:
- return "NVPTXISD::CallSymbol";
- case NVPTXISD::Prototype:
- return "NVPTXISD::Prototype";
- case NVPTXISD::MoveParam:
- return "NVPTXISD::MoveParam";
- case NVPTXISD::StoreRetval:
- return "NVPTXISD::StoreRetval";
- case NVPTXISD::StoreRetvalV2:
- return "NVPTXISD::StoreRetvalV2";
- case NVPTXISD::StoreRetvalV4:
- return "NVPTXISD::StoreRetvalV4";
- case NVPTXISD::PseudoUseParam:
- return "NVPTXISD::PseudoUseParam";
- case NVPTXISD::RETURN:
- return "NVPTXISD::RETURN";
- case NVPTXISD::CallSeqBegin:
- return "NVPTXISD::CallSeqBegin";
- case NVPTXISD::CallSeqEnd:
- return "NVPTXISD::CallSeqEnd";
- case NVPTXISD::CallPrototype:
- return "NVPTXISD::CallPrototype";
- case NVPTXISD::ProxyReg:
- return "NVPTXISD::ProxyReg";
- case NVPTXISD::LoadV2:
- return "NVPTXISD::LoadV2";
- case NVPTXISD::LoadV4:
- return "NVPTXISD::LoadV4";
- case NVPTXISD::LDGV2:
- return "NVPTXISD::LDGV2";
- case NVPTXISD::LDGV4:
- return "NVPTXISD::LDGV4";
- case NVPTXISD::LDUV2:
- return "NVPTXISD::LDUV2";
- case NVPTXISD::LDUV4:
- return "NVPTXISD::LDUV4";
- case NVPTXISD::StoreV2:
- return "NVPTXISD::StoreV2";
- case NVPTXISD::StoreV4:
- return "NVPTXISD::StoreV4";
- case NVPTXISD::FUN_SHFL_CLAMP:
- return "NVPTXISD::FUN_SHFL_CLAMP";
- case NVPTXISD::FUN_SHFR_CLAMP:
- return "NVPTXISD::FUN_SHFR_CLAMP";
- case NVPTXISD::IMAD:
- return "NVPTXISD::IMAD";
- case NVPTXISD::BFE:
- return "NVPTXISD::BFE";
- case NVPTXISD::BFI:
- return "NVPTXISD::BFI";
- case NVPTXISD::PRMT:
- return "NVPTXISD::PRMT";
- case NVPTXISD::SETP_F16X2:
- return "NVPTXISD::SETP_F16X2";
- case NVPTXISD::SETP_BF16X2:
- return "NVPTXISD::SETP_BF16X2";
- case NVPTXISD::Dummy:
- return "NVPTXISD::Dummy";
- case NVPTXISD::MUL_WIDE_SIGNED:
- return "NVPTXISD::MUL_WIDE_SIGNED";
- case NVPTXISD::MUL_WIDE_UNSIGNED:
- return "NVPTXISD::MUL_WIDE_UNSIGNED";
- case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
- case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
- case NVPTXISD::Tex1DFloatFloatLevel:
- return "NVPTXISD::Tex1DFloatFloatLevel";
- case NVPTXISD::Tex1DFloatFloatGrad:
- return "NVPTXISD::Tex1DFloatFloatGrad";
- case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
- case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
- case NVPTXISD::Tex1DS32FloatLevel:
- return "NVPTXISD::Tex1DS32FloatLevel";
- case NVPTXISD::Tex1DS32FloatGrad:
- return "NVPTXISD::Tex1DS32FloatGrad";
- case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
- case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
- case NVPTXISD::Tex1DU32FloatLevel:
- return "NVPTXISD::Tex1DU32FloatLevel";
- case NVPTXISD::Tex1DU32FloatGrad:
- return "NVPTXISD::Tex1DU32FloatGrad";
- case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
- case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
- case NVPTXISD::Tex1DArrayFloatFloatLevel:
- return "NVPTXISD::Tex1DArrayFloatFloatLevel";
- case NVPTXISD::Tex1DArrayFloatFloatGrad:
- return "NVPTXISD::Tex1DArrayFloatFloatGrad";
- case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
- case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
- case NVPTXISD::Tex1DArrayS32FloatLevel:
- return "NVPTXISD::Tex1DArrayS32FloatLevel";
- case NVPTXISD::Tex1DArrayS32FloatGrad:
- return "NVPTXISD::Tex1DArrayS32FloatGrad";
- case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
- case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
- case NVPTXISD::Tex1DArrayU32FloatLevel:
- return "NVPTXISD::Tex1DArrayU32FloatLevel";
- case NVPTXISD::Tex1DArrayU32FloatGrad:
- return "NVPTXISD::Tex1DArrayU32FloatGrad";
- case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
- case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
- case NVPTXISD::Tex2DFloatFloatLevel:
- return "NVPTXISD::Tex2DFloatFloatLevel";
- case NVPTXISD::Tex2DFloatFloatGrad:
- return "NVPTXISD::Tex2DFloatFloatGrad";
- case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
- case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
- case NVPTXISD::Tex2DS32FloatLevel:
- return "NVPTXISD::Tex2DS32FloatLevel";
- case NVPTXISD::Tex2DS32FloatGrad:
- return "NVPTXISD::Tex2DS32FloatGrad";
- case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
- case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
- case NVPTXISD::Tex2DU32FloatLevel:
- return "NVPTXISD::Tex2DU32FloatLevel";
- case NVPTXISD::Tex2DU32FloatGrad:
- return "NVPTXISD::Tex2DU32FloatGrad";
- case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
- case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
- case NVPTXISD::Tex2DArrayFloatFloatLevel:
- return "NVPTXISD::Tex2DArrayFloatFloatLevel";
- case NVPTXISD::Tex2DArrayFloatFloatGrad:
- return "NVPTXISD::Tex2DArrayFloatFloatGrad";
- case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
- case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
- case NVPTXISD::Tex2DArrayS32FloatLevel:
- return "NVPTXISD::Tex2DArrayS32FloatLevel";
- case NVPTXISD::Tex2DArrayS32FloatGrad:
- return "NVPTXISD::Tex2DArrayS32FloatGrad";
- case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
- case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
- case NVPTXISD::Tex2DArrayU32FloatLevel:
- return "NVPTXISD::Tex2DArrayU32FloatLevel";
- case NVPTXISD::Tex2DArrayU32FloatGrad:
- return "NVPTXISD::Tex2DArrayU32FloatGrad";
- case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
- case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
- case NVPTXISD::Tex3DFloatFloatLevel:
- return "NVPTXISD::Tex3DFloatFloatLevel";
- case NVPTXISD::Tex3DFloatFloatGrad:
- return "NVPTXISD::Tex3DFloatFloatGrad";
- case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
- case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
- case NVPTXISD::Tex3DS32FloatLevel:
- return "NVPTXISD::Tex3DS32FloatLevel";
- case NVPTXISD::Tex3DS32FloatGrad:
- return "NVPTXISD::Tex3DS32FloatGrad";
- case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
- case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
- case NVPTXISD::Tex3DU32FloatLevel:
- return "NVPTXISD::Tex3DU32FloatLevel";
- case NVPTXISD::Tex3DU32FloatGrad:
- return "NVPTXISD::Tex3DU32FloatGrad";
- case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
- case NVPTXISD::TexCubeFloatFloatLevel:
- return "NVPTXISD::TexCubeFloatFloatLevel";
- case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
- case NVPTXISD::TexCubeS32FloatLevel:
- return "NVPTXISD::TexCubeS32FloatLevel";
- case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
- case NVPTXISD::TexCubeU32FloatLevel:
- return "NVPTXISD::TexCubeU32FloatLevel";
- case NVPTXISD::TexCubeArrayFloatFloat:
- return "NVPTXISD::TexCubeArrayFloatFloat";
- case NVPTXISD::TexCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexCubeArrayFloatFloatLevel";
- case NVPTXISD::TexCubeArrayS32Float:
- return "NVPTXISD::TexCubeArrayS32Float";
- case NVPTXISD::TexCubeArrayS32FloatLevel:
- return "NVPTXISD::TexCubeArrayS32FloatLevel";
- case NVPTXISD::TexCubeArrayU32Float:
- return "NVPTXISD::TexCubeArrayU32Float";
- case NVPTXISD::TexCubeArrayU32FloatLevel:
- return "NVPTXISD::TexCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4R2DFloatFloat:
- return "NVPTXISD::Tld4R2DFloatFloat";
- case NVPTXISD::Tld4G2DFloatFloat:
- return "NVPTXISD::Tld4G2DFloatFloat";
- case NVPTXISD::Tld4B2DFloatFloat:
- return "NVPTXISD::Tld4B2DFloatFloat";
- case NVPTXISD::Tld4A2DFloatFloat:
- return "NVPTXISD::Tld4A2DFloatFloat";
- case NVPTXISD::Tld4R2DS64Float:
- return "NVPTXISD::Tld4R2DS64Float";
- case NVPTXISD::Tld4G2DS64Float:
- return "NVPTXISD::Tld4G2DS64Float";
- case NVPTXISD::Tld4B2DS64Float:
- return "NVPTXISD::Tld4B2DS64Float";
- case NVPTXISD::Tld4A2DS64Float:
- return "NVPTXISD::Tld4A2DS64Float";
- case NVPTXISD::Tld4R2DU64Float:
- return "NVPTXISD::Tld4R2DU64Float";
- case NVPTXISD::Tld4G2DU64Float:
- return "NVPTXISD::Tld4G2DU64Float";
- case NVPTXISD::Tld4B2DU64Float:
- return "NVPTXISD::Tld4B2DU64Float";
- case NVPTXISD::Tld4A2DU64Float:
- return "NVPTXISD::Tld4A2DU64Float";
-
- case NVPTXISD::TexUnified1DFloatS32:
- return "NVPTXISD::TexUnified1DFloatS32";
- case NVPTXISD::TexUnified1DFloatFloat:
- return "NVPTXISD::TexUnified1DFloatFloat";
- case NVPTXISD::TexUnified1DFloatFloatLevel:
- return "NVPTXISD::TexUnified1DFloatFloatLevel";
- case NVPTXISD::TexUnified1DFloatFloatGrad:
- return "NVPTXISD::TexUnified1DFloatFloatGrad";
- case NVPTXISD::TexUnified1DS32S32:
- return "NVPTXISD::TexUnified1DS32S32";
- case NVPTXISD::TexUnified1DS32Float:
- return "NVPTXISD::TexUnified1DS32Float";
- case NVPTXISD::TexUnified1DS32FloatLevel:
- return "NVPTXISD::TexUnified1DS32FloatLevel";
- case NVPTXISD::TexUnified1DS32FloatGrad:
- return "NVPTXISD::TexUnified1DS32FloatGrad";
- case NVPTXISD::TexUnified1DU32S32:
- return "NVPTXISD::TexUnified1DU32S32";
- case NVPTXISD::TexUnified1DU32Float:
- return "NVPTXISD::TexUnified1DU32Float";
- case NVPTXISD::TexUnified1DU32FloatLevel:
- return "NVPTXISD::TexUnified1DU32FloatLevel";
- case NVPTXISD::TexUnified1DU32FloatGrad:
- return "NVPTXISD::TexUnified1DU32FloatGrad";
- case NVPTXISD::TexUnified1DArrayFloatS32:
- return "NVPTXISD::TexUnified1DArrayFloatS32";
- case NVPTXISD::TexUnified1DArrayFloatFloat:
- return "NVPTXISD::TexUnified1DArrayFloatFloat";
- case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified1DArrayS32S32:
- return "NVPTXISD::TexUnified1DArrayS32S32";
- case NVPTXISD::TexUnified1DArrayS32Float:
- return "NVPTXISD::TexUnified1DArrayS32Float";
- case NVPTXISD::TexUnified1DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
- case NVPTXISD::TexUnified1DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
- case NVPTXISD::TexUnified1DArrayU32S32:
- return "NVPTXISD::TexUnified1DArrayU32S32";
- case NVPTXISD::TexUnified1DArrayU32Float:
- return "NVPTXISD::TexUnified1DArrayU32Float";
- case NVPTXISD::TexUnified1DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
- case NVPTXISD::TexUnified1DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
- case NVPTXISD::TexUnified2DFloatS32:
- return "NVPTXISD::TexUnified2DFloatS32";
- case NVPTXISD::TexUnified2DFloatFloat:
- return "NVPTXISD::TexUnified2DFloatFloat";
- case NVPTXISD::TexUnified2DFloatFloatLevel:
- return "NVPTXISD::TexUnified2DFloatFloatLevel";
- case NVPTXISD::TexUnified2DFloatFloatGrad:
- return "NVPTXISD::TexUnified2DFloatFloatGrad";
- case NVPTXISD::TexUnified2DS32S32:
- return "NVPTXISD::TexUnified2DS32S32";
- case NVPTXISD::TexUnified2DS32Float:
- return "NVPTXISD::TexUnified2DS32Float";
- case NVPTXISD::TexUnified2DS32FloatLevel:
- return "NVPTXISD::TexUnified2DS32FloatLevel";
- case NVPTXISD::TexUnified2DS32FloatGrad:
- return "NVPTXISD::TexUnified2DS32FloatGrad";
- case NVPTXISD::TexUnified2DU32S32:
- return "NVPTXISD::TexUnified2DU32S32";
- case NVPTXISD::TexUnified2DU32Float:
- return "NVPTXISD::TexUnified2DU32Float";
- case NVPTXISD::TexUnified2DU32FloatLevel:
- return "NVPTXISD::TexUnified2DU32FloatLevel";
- case NVPTXISD::TexUnified2DU32FloatGrad:
- return "NVPTXISD::TexUnified2DU32FloatGrad";
- case NVPTXISD::TexUnified2DArrayFloatS32:
- return "NVPTXISD::TexUnified2DArrayFloatS32";
- case NVPTXISD::TexUnified2DArrayFloatFloat:
- return "NVPTXISD::TexUnified2DArrayFloatFloat";
- case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified2DArrayS32S32:
- return "NVPTXISD::TexUnified2DArrayS32S32";
- case NVPTXISD::TexUnified2DArrayS32Float:
- return "NVPTXISD::TexUnified2DArrayS32Float";
- case NVPTXISD::TexUnified2DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
- case NVPTXISD::TexUnified2DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
- case NVPTXISD::TexUnified2DArrayU32S32:
- return "NVPTXISD::TexUnified2DArrayU32S32";
- case NVPTXISD::TexUnified2DArrayU32Float:
- return "NVPTXISD::TexUnified2DArrayU32Float";
- case NVPTXISD::TexUnified2DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
- case NVPTXISD::TexUnified2DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
- case NVPTXISD::TexUnified3DFloatS32:
- return "NVPTXISD::TexUnified3DFloatS32";
- case NVPTXISD::TexUnified3DFloatFloat:
- return "NVPTXISD::TexUnified3DFloatFloat";
- case NVPTXISD::TexUnified3DFloatFloatLevel:
- return "NVPTXISD::TexUnified3DFloatFloatLevel";
- case NVPTXISD::TexUnified3DFloatFloatGrad:
- return "NVPTXISD::TexUnified3DFloatFloatGrad";
- case NVPTXISD::TexUnified3DS32S32:
- return "NVPTXISD::TexUnified3DS32S32";
- case NVPTXISD::TexUnified3DS32Float:
- return "NVPTXISD::TexUnified3DS32Float";
- case NVPTXISD::TexUnified3DS32FloatLevel:
- return "NVPTXISD::TexUnified3DS32FloatLevel";
- case NVPTXISD::TexUnified3DS32FloatGrad:
- return "NVPTXISD::TexUnified3DS32FloatGrad";
- case NVPTXISD::TexUnified3DU32S32:
- return "NVPTXISD::TexUnified3DU32S32";
- case NVPTXISD::TexUnified3DU32Float:
- return "NVPTXISD::TexUnified3DU32Float";
- case NVPTXISD::TexUnified3DU32FloatLevel:
- return "NVPTXISD::TexUnified3DU32FloatLevel";
- case NVPTXISD::TexUnified3DU32FloatGrad:
- return "NVPTXISD::TexUnified3DU32FloatGrad";
- case NVPTXISD::TexUnifiedCubeFloatFloat:
- return "NVPTXISD::TexUnifiedCubeFloatFloat";
- case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeS32Float:
- return "NVPTXISD::TexUnifiedCubeS32Float";
- case NVPTXISD::TexUnifiedCubeS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeU32Float:
- return "NVPTXISD::TexUnifiedCubeU32Float";
- case NVPTXISD::TexUnifiedCubeU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayS32Float:
- return "NVPTXISD::TexUnifiedCubeArrayS32Float";
- case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayU32Float:
- return "NVPTXISD::TexUnifiedCubeArrayU32Float";
- case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
- case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
- return "NVPTXISD::TexUnifiedCubeFloatFloatGrad";
- case NVPTXISD::TexUnifiedCubeS32FloatGrad:
- return "NVPTXISD::TexUnifiedCubeS32FloatGrad";
- case NVPTXISD::TexUnifiedCubeU32FloatGrad:
- return "NVPTXISD::TexUnifiedCubeU32FloatGrad";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad";
- case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
- return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad";
- case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
- return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad";
- case NVPTXISD::Tld4UnifiedR2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
- case NVPTXISD::Tld4UnifiedG2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
- case NVPTXISD::Tld4UnifiedB2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
- case NVPTXISD::Tld4UnifiedA2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
- case NVPTXISD::Tld4UnifiedR2DS64Float:
- return "NVPTXISD::Tld4UnifiedR2DS64Float";
- case NVPTXISD::Tld4UnifiedG2DS64Float:
- return "NVPTXISD::Tld4UnifiedG2DS64Float";
- case NVPTXISD::Tld4UnifiedB2DS64Float:
- return "NVPTXISD::Tld4UnifiedB2DS64Float";
- case NVPTXISD::Tld4UnifiedA2DS64Float:
- return "NVPTXISD::Tld4UnifiedA2DS64Float";
- case NVPTXISD::Tld4UnifiedR2DU64Float:
- return "NVPTXISD::Tld4UnifiedR2DU64Float";
- case NVPTXISD::Tld4UnifiedG2DU64Float:
- return "NVPTXISD::Tld4UnifiedG2DU64Float";
- case NVPTXISD::Tld4UnifiedB2DU64Float:
- return "NVPTXISD::Tld4UnifiedB2DU64Float";
- case NVPTXISD::Tld4UnifiedA2DU64Float:
- return "NVPTXISD::Tld4UnifiedA2DU64Float";
-
- case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
- case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
- case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
- case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
- case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
- case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
- case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
- case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
- case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
- case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
- case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
-
- case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
- case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
- case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
- case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
- case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
- case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
- case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
- case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
- case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
- case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
- case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
-
- case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
- case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
- case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
- case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
- case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
- case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
- case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
- case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
- case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
- case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
- case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
-
- case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
- case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
- case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
- case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
- case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
- case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
- case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
- case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
- case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
- case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
- case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
-
- case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
- case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
- case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
- case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
- case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
- case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
- case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
- case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
- case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
- case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
- case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
-
- case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
- case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
- case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
- case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
- case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
- case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
- case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
- case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
- case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
- case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
- case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
-
- case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
- case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
- case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
- case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
- case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
- case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
- case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
- case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
- case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
- case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
- case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
-
- case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
- case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
- case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
- case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
- case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
- case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
- case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
- case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
- case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
- case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
- case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
-
- case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
- case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
- case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
- case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
- case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
- case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
- case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
- case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
- case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
- case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
- case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
-
- case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
- case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
- case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
- case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
- case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
- case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
- case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
- case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
- case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
- case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
- case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
-
- case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
- case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
- case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
- case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
- case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
- case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
- case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
- case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
- case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
- case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
- case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
-
- case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
- case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
- case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
- case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
- case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
- case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
- case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
- case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
- case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
- case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
- case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
-
- case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
- case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
- case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
- case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
- case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
- case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
- case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
- case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
- case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
- case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
- case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
-
- case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
- case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
- case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
- case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
- case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
- case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
- case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
- case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
- case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
- case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
- case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
-
- case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
- case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
- case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
- case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
- case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
- case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
- case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
- case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
- case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
- case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
- case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
+
+ MAKE_CASE(NVPTXISD::CALL)
+ MAKE_CASE(NVPTXISD::RET_GLUE)
+ MAKE_CASE(NVPTXISD::LOAD_PARAM)
+ MAKE_CASE(NVPTXISD::Wrapper)
+ MAKE_CASE(NVPTXISD::DeclareParam)
+ MAKE_CASE(NVPTXISD::DeclareScalarParam)
+ MAKE_CASE(NVPTXISD::DeclareRet)
+ MAKE_CASE(NVPTXISD::DeclareScalarRet)
+ MAKE_CASE(NVPTXISD::DeclareRetParam)
+ MAKE_CASE(NVPTXISD::PrintCall)
+ MAKE_CASE(NVPTXISD::PrintConvergentCall)
+ MAKE_CASE(NVPTXISD::PrintCallUni)
+ MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
+ MAKE_CASE(NVPTXISD::LoadParam)
+ MAKE_CASE(NVPTXISD::LoadParamV2)
+ MAKE_CASE(NVPTXISD::LoadParamV4)
+ MAKE_CASE(NVPTXISD::StoreParam)
+ MAKE_CASE(NVPTXISD::StoreParamV2)
+ MAKE_CASE(NVPTXISD::StoreParamV4)
+ MAKE_CASE(NVPTXISD::StoreParamS32)
+ MAKE_CASE(NVPTXISD::StoreParamU32)
+ MAKE_CASE(NVPTXISD::CallArgBegin)
+ MAKE_CASE(NVPTXISD::CallArg)
+ MAKE_CASE(NVPTXISD::LastCallArg)
+ MAKE_CASE(NVPTXISD::CallArgEnd)
+ MAKE_CASE(NVPTXISD::CallVoid)
+ MAKE_CASE(NVPTXISD::CallVal)
+ MAKE_CASE(NVPTXISD::CallSymbol)
+ MAKE_CASE(NVPTXISD::Prototype)
+ MAKE_CASE(NVPTXISD::MoveParam)
+ MAKE_CASE(NVPTXISD::StoreRetval)
+ MAKE_CASE(NVPTXISD::StoreRetvalV2)
+ MAKE_CASE(NVPTXISD::StoreRetvalV4)
+ MAKE_CASE(NVPTXISD::PseudoUseParam)
+ MAKE_CASE(NVPTXISD::RETURN)
+ MAKE_CASE(NVPTXISD::CallSeqBegin)
+ MAKE_CASE(NVPTXISD::CallSeqEnd)
+ MAKE_CASE(NVPTXISD::CallPrototype)
+ MAKE_CASE(NVPTXISD::ProxyReg)
+ MAKE_CASE(NVPTXISD::LoadV2)
+ MAKE_CASE(NVPTXISD::LoadV4)
+ MAKE_CASE(NVPTXISD::LDGV2)
+ MAKE_CASE(NVPTXISD::LDGV4)
+ MAKE_CASE(NVPTXISD::LDUV2)
+ MAKE_CASE(NVPTXISD::LDUV4)
+ MAKE_CASE(NVPTXISD::StoreV2)
+ MAKE_CASE(NVPTXISD::StoreV4)
+ MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
+ MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
+ MAKE_CASE(NVPTXISD::IMAD)
+ MAKE_CASE(NVPTXISD::BFE)
+ MAKE_CASE(NVPTXISD::BFI)
+ MAKE_CASE(NVPTXISD::PRMT)
+ MAKE_CASE(NVPTXISD::SETP_F16X2)
+ MAKE_CASE(NVPTXISD::SETP_BF16X2)
+ MAKE_CASE(NVPTXISD::Dummy)
+ MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
+ MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
+ MAKE_CASE(NVPTXISD::Tex1DFloatS32)
+ MAKE_CASE(NVPTXISD::Tex1DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::Tex1DS32S32)
+ MAKE_CASE(NVPTXISD::Tex1DS32Float)
+ MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex1DU32S32)
+ MAKE_CASE(NVPTXISD::Tex1DU32Float)
+ MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32)
+ MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat)
+ MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::Tex1DArrayS32S32)
+ MAKE_CASE(NVPTXISD::Tex1DArrayS32Float)
+ MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex1DArrayU32S32)
+ MAKE_CASE(NVPTXISD::Tex1DArrayU32Float)
+ MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex2DFloatS32)
+ MAKE_CASE(NVPTXISD::Tex2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::Tex2DS32S32)
+ MAKE_CASE(NVPTXISD::Tex2DS32Float)
+ MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex2DU32S32)
+ MAKE_CASE(NVPTXISD::Tex2DU32Float)
+ MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32)
+ MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat)
+ MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::Tex2DArrayS32S32)
+ MAKE_CASE(NVPTXISD::Tex2DArrayS32Float)
+ MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex2DArrayU32S32)
+ MAKE_CASE(NVPTXISD::Tex2DArrayU32Float)
+ MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex3DFloatS32)
+ MAKE_CASE(NVPTXISD::Tex3DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::Tex3DS32S32)
+ MAKE_CASE(NVPTXISD::Tex3DS32Float)
+ MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tex3DU32S32)
+ MAKE_CASE(NVPTXISD::Tex3DU32Float)
+ MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexCubeFloatFloat)
+ MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexCubeS32Float)
+ MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexCubeU32Float)
+ MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat)
+ MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexCubeArrayS32Float)
+ MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexCubeArrayU32Float)
+ MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel)
+ MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4R2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4G2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4B2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4A2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4R2DU64Float)
+ MAKE_CASE(NVPTXISD::Tld4G2DU64Float)
+ MAKE_CASE(NVPTXISD::Tld4B2DU64Float)
+ MAKE_CASE(NVPTXISD::Tld4A2DU64Float)
+
+ MAKE_CASE(NVPTXISD::TexUnified1DFloatS32)
+ MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified1DS32S32)
+ MAKE_CASE(NVPTXISD::TexUnified1DS32Float)
+ MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified1DU32S32)
+ MAKE_CASE(NVPTXISD::TexUnified1DU32Float)
+ MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified2DFloatS32)
+ MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified2DS32S32)
+ MAKE_CASE(NVPTXISD::TexUnified2DS32Float)
+ MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified2DU32S32)
+ MAKE_CASE(NVPTXISD::TexUnified2DU32Float)
+ MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified3DFloatS32)
+ MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified3DS32S32)
+ MAKE_CASE(NVPTXISD::TexUnified3DS32Float)
+ MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnified3DU32S32)
+ MAKE_CASE(NVPTXISD::TexUnified3DU32Float)
+ MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad)
+ MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float)
+ MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float)
+
+ MAKE_CASE(NVPTXISD::Suld1DI8Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DI16Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DI32Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DI64Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp)
+
+ MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp)
+
+ MAKE_CASE(NVPTXISD::Suld2DI8Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DI16Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DI32Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DI64Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp)
+
+ MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp)
+
+ MAKE_CASE(NVPTXISD::Suld3DI8Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DI16Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DI32Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DI64Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp)
+ MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp)
+
+ MAKE_CASE(NVPTXISD::Suld1DI8Trap)
+ MAKE_CASE(NVPTXISD::Suld1DI16Trap)
+ MAKE_CASE(NVPTXISD::Suld1DI32Trap)
+ MAKE_CASE(NVPTXISD::Suld1DI64Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV2I8Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV2I16Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV2I32Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV2I64Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV4I8Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV4I16Trap)
+ MAKE_CASE(NVPTXISD::Suld1DV4I32Trap)
+
+ MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap)
+
+ MAKE_CASE(NVPTXISD::Suld2DI8Trap)
+ MAKE_CASE(NVPTXISD::Suld2DI16Trap)
+ MAKE_CASE(NVPTXISD::Suld2DI32Trap)
+ MAKE_CASE(NVPTXISD::Suld2DI64Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV2I8Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV2I16Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV2I32Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV2I64Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV4I8Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV4I16Trap)
+ MAKE_CASE(NVPTXISD::Suld2DV4I32Trap)
+
+ MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap)
+
+ MAKE_CASE(NVPTXISD::Suld3DI8Trap)
+ MAKE_CASE(NVPTXISD::Suld3DI16Trap)
+ MAKE_CASE(NVPTXISD::Suld3DI32Trap)
+ MAKE_CASE(NVPTXISD::Suld3DI64Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV2I8Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV2I16Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV2I32Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV2I64Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV4I8Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV4I16Trap)
+ MAKE_CASE(NVPTXISD::Suld3DV4I32Trap)
+
+ MAKE_CASE(NVPTXISD::Suld1DI8Zero)
+ MAKE_CASE(NVPTXISD::Suld1DI16Zero)
+ MAKE_CASE(NVPTXISD::Suld1DI32Zero)
+ MAKE_CASE(NVPTXISD::Suld1DI64Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV2I8Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV2I16Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV2I32Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV2I64Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV4I8Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV4I16Zero)
+ MAKE_CASE(NVPTXISD::Suld1DV4I32Zero)
+
+ MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero)
+ MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero)
+
+ MAKE_CASE(NVPTXISD::Suld2DI8Zero)
+ MAKE_CASE(NVPTXISD::Suld2DI16Zero)
+ MAKE_CASE(NVPTXISD::Suld2DI32Zero)
+ MAKE_CASE(NVPTXISD::Suld2DI64Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV2I8Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV2I16Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV2I32Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV2I64Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV4I8Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV4I16Zero)
+ MAKE_CASE(NVPTXISD::Suld2DV4I32Zero)
+
+ MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero)
+ MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero)
+
+ MAKE_CASE(NVPTXISD::Suld3DI8Zero)
+ MAKE_CASE(NVPTXISD::Suld3DI16Zero)
+ MAKE_CASE(NVPTXISD::Suld3DI32Zero)
+ MAKE_CASE(NVPTXISD::Suld3DI64Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV2I8Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV2I16Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV2I32Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV2I64Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV4I8Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV4I16Zero)
+ MAKE_CASE(NVPTXISD::Suld3DV4I32Zero)
}
return nullptr;
+
+#undef MAKE_CASE
}
TargetLoweringBase::LegalizeTypeAction
@@ -3070,8 +2883,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// See similar issue in LowerCall.
unsigned InsIdx = 0;
- int idx = 0;
- for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+ for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
Type *Ty = argTypes[i];
if (theArgs[i]->use_empty()) {
@@ -3107,10 +2919,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
continue;
}
- // In the following cases, assign a node order of "idx+1"
+ // In the following cases, assign a node order of "i+1"
// to newly created nodes. The SDNodes for params have to
// appear in the same order as their order of appearance
- // in the original function. "idx+1" holds that order.
+ // in the original function. "i+1" holds that order.
if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
bool aggregateIsPacked = false;
if (StructType *STy = dyn_cast<StructType>(Ty))
@@ -3125,7 +2937,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
auto VectorInfo =
VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ SDValue Arg = getParamSymbol(DAG, i, PtrVT);
int VecIdx = -1; // Index of the first element of the current vector.
for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
if (VectorInfo[parti] & PVF_FIRST) {
@@ -3159,7 +2971,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
+ P.getNode()->setIROrder(i + 1);
for (unsigned j = 0; j < NumElts; ++j) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
DAG.getIntPtrConstant(j, dl));
@@ -3208,10 +3020,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
EVT ObjectVT = getValueType(DL, Ty);
assert(ObjectVT == Ins[InsIdx].VT &&
"Ins type did not match function type");
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ SDValue Arg = getParamSymbol(DAG, i, PtrVT);
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
+ p.getNode()->setIROrder(i + 1);
InVals.push_back(p);
}
@@ -5546,10 +5358,11 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
- // extract_vector_elt, except for v4i8.
- // Don't mess with singletons or v2*16 types, we already handle them OK.
+ // extract_vector_elt.
+ // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
+ // handle them OK.
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
- VectorVT == MVT::v4i8)
+ VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
return SDValue();
uint64_t VectorBits = VectorVT.getSizeInBits();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 365afc6bd8c6..631136ad6214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1886,10 +1886,14 @@ multiclass PRMT<ValueType T, RegisterClass RC> {
}
let hasSideEffects = false in {
- defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
+ // order is somewhat important here. signed/unsigned variants match
+ // the same patterns, so the first one wins. Having unsigned byte extraction
+ // has the benefit of always having zero in unused bits, which makes some
+ // optimizations easier (e.g. no need to mask them).
defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
- defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
+ defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
+ defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
@@ -2259,27 +2263,69 @@ def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
(NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
// comparisons of i8 extracted with BFE as i32
-def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
- (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
-def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
- (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
-def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
- (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
-def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
- (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;
-
-def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
-def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
-def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
-def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
-def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
-def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
+// It's faster to do comparison directly on i32 extracted by BFE,
+// instead of the long conversion and sign extending.
+def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+ (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+ (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>;
+def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+ (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+ (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>;
+def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+ (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+ (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>;
+def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+ (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+ (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>;
+def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+ (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+ (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>;
+def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+ (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+ (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>;
+def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+ (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+ (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>;
+def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+ (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+ (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>;
+
+def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>;
+def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>;
+def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>;
+def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>;
+def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>;
+def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>;
+def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>;
+def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>;
+def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>;
+def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>;
+def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>;
+def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+ (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+ (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>;
// i1 compare -> i32
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
@@ -3759,7 +3805,6 @@ def CALL_PROTOTYPE :
include "NVPTXIntrinsics.td"
-
//-----------------------------------
// Notes
//-----------------------------------
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 2330d7213c26..477789a164ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6372,12 +6372,17 @@ def INT_PTX_SREG_LANEMASK_GE :
def INT_PTX_SREG_LANEMASK_GT :
PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>;
+let hasSideEffects = 1 in {
def INT_PTX_SREG_CLOCK :
PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
def INT_PTX_SREG_CLOCK64 :
PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
def INT_PTX_SREG_GLOBALTIMER :
PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>;
+}
+
+def: Pat <(i64 (readcyclecounter)), (INT_PTX_SREG_CLOCK64)>;
+def: Pat <(i64 (readsteadycounter)), (INT_PTX_SREG_GLOBALTIMER)>;
def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 7d2678ae5927..29c95e4226bf 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -20,6 +20,7 @@
#include "NVPTX.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -36,6 +37,9 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
#include <sstream>
#include <string>
#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
@@ -87,6 +91,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
}
SmallVector<Instruction *, 4> ToRemove;
+ SmallVector<Instruction *, 4> ToSimplify;
// Go through the calls in this function. Each call to __nvvm_reflect or
// llvm.nvvm.reflect should be a CallInst with a ConstantArray argument.
@@ -171,10 +176,42 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
} else if (ReflectArg == "__CUDA_ARCH") {
ReflectVal = SmVersion * 10;
}
+
+ // If the immediate user is a simple comparison we want to simplify it.
+ for (User *U : Call->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ ToSimplify.push_back(I);
+
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
ToRemove.push_back(Call);
}
+ // The code guarded by __nvvm_reflect may be invalid for the target machine.
+ // Traverse the use-def chain, continually simplifying constant expressions
+ // until we find a terminator that we can then remove.
+ while (!ToSimplify.empty()) {
+ Instruction *I = ToSimplify.pop_back_val();
+ if (Constant *C =
+ ConstantFoldInstruction(I, F.getParent()->getDataLayout())) {
+ for (User *U : I->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ ToSimplify.push_back(I);
+
+ I->replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(I)) {
+ ToRemove.push_back(I);
+ }
+ } else if (I->isTerminator()) {
+ ConstantFoldTerminator(I->getParent());
+ }
+ }
+
+ // Removing via isInstructionTriviallyDead may add duplicates to the ToRemove
+ // array. Filter out the duplicates before starting to erase from parent.
+ std::sort(ToRemove.begin(), ToRemove.end());
+ auto NewLastIter = std::unique(ToRemove.begin(), ToRemove.end());
+ ToRemove.erase(NewLastIter, ToRemove.end());
+
for (Instruction *I : ToRemove)
I->eraseFromParent();
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index ae02e86baf6e..262e8e56d27e 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -11,11 +11,14 @@
//===----------------------------------------------------------------------===//
#include "RISCVLegalizerInfo.h"
+#include "MCTargetDesc/RISCVMatInt.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -182,7 +185,13 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
CTPOPActions.maxScalar(0, sXLen).scalarSameSizeAs(1, 0).lower();
}
- getActionDefinitionsBuilder({G_CONSTANT, G_IMPLICIT_DEF})
+ auto &ConstantActions = getActionDefinitionsBuilder(G_CONSTANT);
+ ConstantActions.legalFor({s32, p0});
+ if (ST.is64Bit())
+ ConstantActions.customFor({s64});
+ ConstantActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen);
+
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
.legalFor({s32, sXLen, p0})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, sXLen);
@@ -451,17 +460,94 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,
return true;
}
+bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm,
+ bool ShouldOptForSize) const {
+ assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64);
+ int64_t Imm = APImm.getSExtValue();
+ // All simm32 constants should be handled by isel.
+ // NOTE: The getMaxBuildIntsCost call below should return a value >= 2 making
+ // this check redundant, but small immediates are common so this check
+ // should have better compile time.
+ if (isInt<32>(Imm))
+ return false;
+
+ // We only need to cost the immediate, if constant pool lowering is enabled.
+ if (!STI.useConstantPoolForLargeInts())
+ return false;
+
+ RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Imm, STI);
+ if (Seq.size() <= STI.getMaxBuildIntsCost())
+ return false;
+
+ // Optimizations below are disabled for opt size. If we're optimizing for
+ // size, use a constant pool.
+ if (ShouldOptForSize)
+ return true;
+ //
+ // Special case. See if we can build the constant as (ADD (SLLI X, C), X) do
+ // that if it will avoid a constant pool.
+ // It will require an extra temporary register though.
+ // If we have Zba we can use (ADD_UW X, (SLLI X, 32)) to handle cases where
+ // low and high 32 bits are the same and bit 31 and 63 are set.
+ unsigned ShiftAmt, AddOpc;
+ RISCVMatInt::InstSeq SeqLo =
+ RISCVMatInt::generateTwoRegInstSeq(Imm, STI, ShiftAmt, AddOpc);
+ return !(!SeqLo.empty() && (SeqLo.size() + 2) <= STI.getMaxBuildIntsCost());
+}
+
+// TODO: This is almost the same as LegalizerHelper::lowerFConstant and is
+// target-independent. Should we move this to LegalizeHelper?
+bool RISCVLegalizerInfo::emitLoadFromConstantPool(
+ Register DstReg, const Constant *ConstVal,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ MachineFunction &MF = MIRBuilder.getMF();
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
+ LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
+ LLT DstLLT = MRI.getType(DstReg);
+
+ Align Alignment(DL.getABITypeAlign(getTypeForLLT(DstLLT, Ctx)));
+
+ auto Addr = MIRBuilder.buildConstantPool(
+ AddrPtrTy,
+ MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment));
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
+ MachineMemOperand::MOLoad, DstLLT, Alignment);
+
+ MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, DstReg, Addr, *MMO);
+ return true;
+}
+
bool RISCVLegalizerInfo::legalizeCustom(
LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
+ MachineFunction &MF = *MI.getParent()->getParent();
switch (MI.getOpcode()) {
default:
// No idea what to do.
return false;
case TargetOpcode::G_ABS:
return Helper.lowerAbsToMaxNeg(MI);
+ // TODO: G_FCONSTANT
+ case TargetOpcode::G_CONSTANT: {
+ const Function &F = MF.getFunction();
+ // TODO: if PSI and BFI are present, add " ||
+ // llvm::shouldOptForSize(*CurMBB, PSI, BFI)".
+ bool ShouldOptForSize = F.hasOptSize() || F.hasMinSize();
+ const ConstantInt *ConstVal = MI.getOperand(1).getCImm();
+ if (!shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize))
+ return true;
+ emitLoadFromConstantPool(MI.getOperand(0).getReg(),
+ MI.getOperand(1).getCImm(), MIRBuilder);
+ MI.eraseFromParent();
+ return true;
+ }
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index f3ec6be16734..046555f3a3e9 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/Register.h"
namespace llvm {
@@ -36,6 +37,9 @@ public:
MachineInstr &MI) const override;
private:
+ bool shouldBeInConstantPool(APInt APImm, bool ShouldOptForSize) const;
+ bool emitLoadFromConstantPool(Register DstReg, const Constant *CPVal,
+ MachineIRBuilder &MIRBuilder) const;
bool legalizeShlAshrLshr(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const;
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 27d52c16a4f3..575bd4c9d356 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -44,6 +44,7 @@ include "RISCVSchedRocket.td"
include "RISCVSchedSiFive7.td"
include "RISCVSchedSiFiveP400.td"
include "RISCVSchedSyntacoreSCR1.td"
+include "RISCVSchedXiangShanNanHu.td"
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 03e098040210..5b8d51ff6f61 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1227,3 +1227,8 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"AllowTaggedGlobals",
"true", "Use an instruction sequence for taking the address of a global "
"that allows a memory tag in the upper address bits">;
+
+def FeatureForcedSWShadowStack : SubtargetFeature<
+ "forced-sw-shadow-stack", "HasForcedSWShadowStack", "true",
+ "Implement shadow stack with software.">;
+def HasForcedSWShadowStack : Predicate<"Subtarget->hasForcedSWShadowStack()">;
diff --git a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
index 271d28f86a67..fddbaa97d063 100644
--- a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
@@ -47,10 +47,10 @@ public:
StringRef getPassName() const override { return "RISC-V Fold Masks"; }
private:
- bool convertToUnmasked(MachineInstr &MI, MachineInstr *MaskDef);
- bool convertVMergeToVMv(MachineInstr &MI, MachineInstr *MaskDef);
+ bool convertToUnmasked(MachineInstr &MI, MachineInstr *MaskDef) const;
+ bool convertVMergeToVMv(MachineInstr &MI, MachineInstr *MaskDef) const;
- bool isAllOnesMask(MachineInstr *MaskDef);
+ bool isAllOnesMask(MachineInstr *MaskDef) const;
};
} // namespace
@@ -59,7 +59,7 @@ char RISCVFoldMasks::ID = 0;
INITIALIZE_PASS(RISCVFoldMasks, DEBUG_TYPE, "RISC-V Fold Masks", false, false)
-bool RISCVFoldMasks::isAllOnesMask(MachineInstr *MaskDef) {
+bool RISCVFoldMasks::isAllOnesMask(MachineInstr *MaskDef) const {
if (!MaskDef)
return false;
assert(MaskDef->isCopy() && MaskDef->getOperand(0).getReg() == RISCV::V0);
@@ -89,7 +89,8 @@ bool RISCVFoldMasks::isAllOnesMask(MachineInstr *MaskDef) {
// Transform (VMERGE_VVM_<LMUL> false, false, true, allones, vl, sew) to
// (VMV_V_V_<LMUL> false, true, vl, sew). It may decrease uses of VMSET.
-bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI, MachineInstr *V0Def) {
+bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI,
+ MachineInstr *V0Def) const {
#define CASE_VMERGE_TO_VMV(lmul) \
case RISCV::PseudoVMERGE_VVM_##lmul: \
NewOpc = RISCV::PseudoVMV_V_V_##lmul; \
@@ -133,7 +134,7 @@ bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI, MachineInstr *V0Def) {
}
bool RISCVFoldMasks::convertToUnmasked(MachineInstr &MI,
- MachineInstr *MaskDef) {
+ MachineInstr *MaskDef) const {
const RISCV::RISCVMaskedPseudoInfo *I =
RISCV::getMaskedPseudoInfo(MI.getOpcode());
if (!I)
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index e5b510349e3d..37672dd047f2 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -36,13 +36,13 @@ static Align getABIStackAlignment(RISCVABI::ABI ABI) {
}
RISCVFrameLowering::RISCVFrameLowering(const RISCVSubtarget &STI)
- : TargetFrameLowering(StackGrowsDown,
- getABIStackAlignment(STI.getTargetABI()),
- /*LocalAreaOffset=*/0,
- /*TransientStackAlignment=*/Align(16)),
+ : TargetFrameLowering(
+ StackGrowsDown, getABIStackAlignment(STI.getTargetABI()),
+ /*LocalAreaOffset=*/0,
+ /*TransientStackAlignment=*/getABIStackAlignment(STI.getTargetABI())),
STI(STI) {}
-static const Register AllPopRegs[] = {
+static const MCPhysReg AllPopRegs[] = {
RISCV::X1, RISCV::X8, RISCV::X9, RISCV::X18, RISCV::X19,
RISCV::X20, RISCV::X21, RISCV::X22, RISCV::X23, RISCV::X24,
RISCV::X25, RISCV::X26, RISCV::X27};
@@ -66,9 +66,14 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; }))
return;
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ if (!STI.hasForcedSWShadowStack() && STI.hasStdExtZicfiss()) {
+ BuildMI(MBB, MI, DL, TII->get(RISCV::SSPUSH)).addReg(RAReg);
+ return;
+ }
+
Register SCSPReg = RISCVABI::getSCSPReg();
- const RISCVInstrInfo *TII = STI.getInstrInfo();
bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);
int64_t SlotSize = STI.getXLen() / 8;
// Store return address to shadow call stack
@@ -121,9 +126,14 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; }))
return;
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ if (!STI.hasForcedSWShadowStack() && STI.hasStdExtZicfiss()) {
+ BuildMI(MBB, MI, DL, TII->get(RISCV::SSPOPCHK)).addReg(RAReg);
+ return;
+ }
+
Register SCSPReg = RISCVABI::getSCSPReg();
- const RISCVInstrInfo *TII = STI.getInstrInfo();
bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);
int64_t SlotSize = STI.getXLen() / 8;
// Load return address from shadow call stack
@@ -1003,9 +1013,7 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
};
for (auto Reg : CSRegs)
- // Only save x0-x15 for RVE.
- if (Reg < RISCV::X16 || !Subtarget.isRVE())
- SavedRegs.set(Reg);
+ SavedRegs.set(Reg);
// According to psABI, if ilp32e/lp64e ABIs are used with an ISA that
// has any of the registers x16-x31 and f0-f31, then these registers are
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 27037f4d5c5c..73492c28815b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2173,19 +2173,34 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
if (ResVT.isScalableVector() || SrcVT.isScalableVector())
return false;
+ EVT EltVT = ResVT.getVectorElementType();
+ assert(EltVT == SrcVT.getVectorElementType() && "Should hold for node");
+
+ // The smallest type we can slide is i8.
+ // TODO: We can extract index 0 from a mask vector without a slide.
+ if (EltVT == MVT::i1)
+ return false;
+
unsigned ResElts = ResVT.getVectorNumElements();
unsigned SrcElts = SrcVT.getVectorNumElements();
+ unsigned MinVLen = Subtarget.getRealMinVLen();
+ unsigned MinVLMAX = MinVLen / EltVT.getSizeInBits();
+
+ // If we're extracting only data from the first VLEN bits of the source
+ // then we can always do this with an m1 vslidedown.vx. Restricting the
+ // Index ensures we can use a vslidedown.vi.
+ // TODO: We can generalize this when the exact VLEN is known.
+ if (Index + ResElts <= MinVLMAX && Index < 31)
+ return true;
+
// Convervatively only handle extracting half of a vector.
- // TODO: Relax this.
+ // TODO: For sizes which aren't multiples of VLEN sizes, this may not be
+ // a cheap extract. However, this case is important in practice for
+ // shuffled extracts of longer vectors. How resolve?
if ((ResElts * 2) != SrcElts)
return false;
- // The smallest type we can slide is i8.
- // TODO: We can extract index 0 from a mask vector without a slide.
- if (ResVT.getVectorElementType() == MVT::i1)
- return false;
-
// Slide can support arbitrary index, but we only treat vslidedown.vi as
// cheap.
if (Index >= 32)
@@ -9712,12 +9727,15 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
if (OrigIdx == 0)
return Op;
- // If the subvector vector is a fixed-length type, we cannot use subregister
- // manipulation to simplify the codegen; we don't know which register of a
- // LMUL group contains the specific subvector as we only know the minimum
- // register size. Therefore we must slide the vector group down the full
- // amount.
- if (SubVecVT.isFixedLengthVector()) {
+ const unsigned MinVLen = Subtarget.getRealMinVLen();
+ const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+
+ // If the subvector vector is a fixed-length type and we don't know VLEN
+ // exactly, we cannot use subregister manipulation to simplify the codegen; we
+ // don't know which register of a LMUL group contains the specific subvector
+ // as we only know the minimum register size. Therefore we must slide the
+ // vector group down the full amount.
+ if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) {
MVT ContainerVT = VecVT;
if (VecVT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VecVT);
@@ -9749,19 +9767,48 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBitcast(Op.getValueType(), Slidedown);
}
+ if (VecVT.isFixedLengthVector()) {
+ VecVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(VecVT, Vec, DAG, Subtarget);
+ }
+
+ MVT ContainerSubVecVT = SubVecVT;
+ if (SubVecVT.isFixedLengthVector())
+ ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT);
+
unsigned SubRegIdx, RemIdx;
- std::tie(SubRegIdx, RemIdx) =
- RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
- VecVT, SubVecVT, OrigIdx, TRI);
+ // extract_subvector scales the index by vscale is the subvector is scalable,
+ // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
+ // we have a fixed length subvector, we need to adjust the index by 1/vscale.
+ if (SubVecVT.isFixedLengthVector()) {
+ assert(MinVLen == MaxVLen);
+ unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock;
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
+ RemIdx = (RemIdx * Vscale) + (OrigIdx % Vscale);
+ } else {
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ VecVT, ContainerSubVecVT, OrigIdx, TRI);
+ }
// If the Idx has been completely eliminated then this is a subvector extract
// which naturally aligns to a vector register. These can easily be handled
// using subregister manipulation.
- if (RemIdx == 0)
+ if (RemIdx == 0) {
+ if (SubVecVT.isFixedLengthVector()) {
+ Vec = DAG.getTargetExtractSubreg(SubRegIdx, DL, ContainerSubVecVT, Vec);
+ return convertFromScalableVector(SubVecVT, Vec, DAG, Subtarget);
+ }
return Op;
+ }
- // Else SubVecVT is a fractional LMUL and may need to be slid down.
- assert(RISCVVType::decodeVLMUL(getLMUL(SubVecVT)).second);
+ // Else SubVecVT is M1 or smaller and may need to be slid down: if SubVecVT
+ // was > M1 then the index would need to be a multiple of VLMAX, and so would
+ // divide exactly.
+ assert(RISCVVType::decodeVLMUL(getLMUL(ContainerSubVecVT)).second ||
+ getLMUL(ContainerSubVecVT) == RISCVII::VLMUL::LMUL_1);
// If the vector type is an LMUL-group type, extract a subvector equal to the
// nearest full vector register type.
@@ -9776,10 +9823,17 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
// Slide this vector register down by the desired number of elements in order
// to place the desired subvector starting at element 0.
- SDValue SlidedownAmt =
- DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), RemIdx));
+ SDValue SlidedownAmt;
+ if (SubVecVT.isFixedLengthVector())
+ SlidedownAmt = DAG.getConstant(RemIdx, DL, Subtarget.getXLenVT());
+ else
+ SlidedownAmt =
+ DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), RemIdx));
auto [Mask, VL] = getDefaultScalableVLOps(InterSubVT, DL, DAG, Subtarget);
+ if (SubVecVT.isFixedLengthVector())
+ VL = getVLOp(SubVecVT.getVectorNumElements(), InterSubVT, DL, DAG,
+ Subtarget);
SDValue Slidedown =
getVSlidedown(DAG, Subtarget, DL, InterSubVT, DAG.getUNDEF(InterSubVT),
Vec, SlidedownAmt, Mask, VL);
@@ -14956,6 +15010,11 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
if (!TLI.isOperationLegalOrCustom(Opcode, VT) || !TLI.isTypeLegal(VT))
return SDValue();
+ // This BUILD_VECTOR involves an implicit truncation, and sinking
+ // truncates through binops is non-trivial.
+ if (N->op_begin()->getValueType() != VT.getVectorElementType())
+ return SDValue();
+
SmallVector<SDValue> LHSOps;
SmallVector<SDValue> RHSOps;
for (SDValue Op : N->ops()) {
@@ -14983,6 +15042,7 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
// have different LHS and RHS types.
if (Op.getOperand(0).getValueType() != Op.getOperand(1).getValueType())
return SDValue();
+
RHSOps.push_back(Op.getOperand(1));
}
@@ -15812,7 +15872,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
MGN->getMemOperand(), IndexType, MGN->getExtensionType());
if (Index.getOpcode() == ISD::BUILD_VECTOR &&
- MGN->getExtensionType() == ISD::NON_EXTLOAD) {
+ MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) {
if (std::optional<VIDSequence> SimpleVID = isSimpleVIDSequence(Index);
SimpleVID && SimpleVID->StepDenominator == 1) {
const int64_t StepNumerator = SimpleVID->StepNumerator;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 89eb71d91742..225a9db8f3ee 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1115,7 +1115,7 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// FIXME: A virtual register must be used initially, as the register
// scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch
// uses the same workaround).
- Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRJALRRegClass);
auto II = MBB.end();
// We may also update the jump target to RestoreBB later.
MachineInstr &MI = *BuildMI(MBB, II, DL, get(RISCV::PseudoJump))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 518982441e7c..7fe9b626b66d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1260,14 +1260,14 @@ def : PatGprSimm12<or_is_add, ADDI>;
// negate of low bit can be done via two (compressible) shifts. The negate
// is never compressible since rs1 and rd can't be the same register.
def : Pat<(XLenVT (sub 0, (and_oneuse GPR:$rs, 1))),
- (SRAI (SLLI $rs, (ImmSubFromXLen (XLenVT 1))),
+ (SRAI (XLenVT (SLLI $rs, (ImmSubFromXLen (XLenVT 1)))),
(ImmSubFromXLen (XLenVT 1)))>;
// AND with leading/trailing ones mask exceeding simm32/simm12.
def : Pat<(i64 (and GPR:$rs, LeadingOnesMask:$mask)),
- (SLLI (SRLI $rs, LeadingOnesMask:$mask), LeadingOnesMask:$mask)>;
+ (SLLI (i64 (SRLI $rs, LeadingOnesMask:$mask)), LeadingOnesMask:$mask)>;
def : Pat<(XLenVT (and GPR:$rs, TrailingOnesMask:$mask)),
- (SRLI (SLLI $rs, TrailingOnesMask:$mask), TrailingOnesMask:$mask)>;
+ (SRLI (XLenVT (SLLI $rs, TrailingOnesMask:$mask)), TrailingOnesMask:$mask)>;
// Match both a plain shift and one where the shift amount is masked (this is
// typically introduced when the legalizer promotes the shift amount and
@@ -1380,7 +1380,7 @@ defm Select_GPR : SelectCC_GPR_rrirr<GPR, XLenVT>;
class SelectCompressOpt<CondCode Cond>
: Pat<(riscv_selectcc_frag:$select (XLenVT GPR:$lhs), simm12_no6:$Constant, Cond,
(XLenVT GPR:$truev), GPR:$falsev),
- (Select_GPR_Using_CC_GPR (ADDI GPR:$lhs, (NegImm simm12:$Constant)), (XLenVT X0),
+ (Select_GPR_Using_CC_GPR (XLenVT (ADDI GPR:$lhs, (NegImm simm12:$Constant))), (XLenVT X0),
(IntCCtoRISCVCC $select), GPR:$truev, GPR:$falsev)>;
def OptForMinSize : Predicate<"MF ? MF->getFunction().hasMinSize() : false">;
@@ -1728,12 +1728,12 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
/// RV64 patterns
let Predicates = [IsRV64, NotHasStdExtZba] in {
-def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (SLLI GPR:$rs1, 32), 32)>;
+def : Pat<(i64 (and GPR:$rs1, 0xffffffff)), (SRLI (i64 (SLLI GPR:$rs1, 32)), 32)>;
// If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2
// shifts instead of 3. This can occur when unsigned is used to index an array.
def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
- (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
+ (SRLI (i64 (SLLI GPR:$rs1, 32)), (ImmSubFrom32 uimm5:$shamt))>;
}
class binop_allhusers<SDPatternOperator operator>
@@ -1768,7 +1768,7 @@ def u32simm12 : ImmLeaf<XLenVT, [{
let Predicates = [IsRV64] in {
def : Pat<(i64 (and GPR:$rs, LeadingOnesWMask:$mask)),
- (SLLI (SRLIW $rs, LeadingOnesWMask:$mask), LeadingOnesWMask:$mask)>;
+ (SLLI (i64 (SRLIW $rs, LeadingOnesWMask:$mask)), LeadingOnesWMask:$mask)>;
/// sext and zext
@@ -1864,13 +1864,13 @@ def KCFI_CHECK
/// Simple optimization
def : Pat<(XLenVT (add GPR:$rs1, (AddiPair:$rs2))),
- (ADDI (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)),
+ (ADDI (XLenVT (ADDI GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2))),
(AddiPairImmSmall GPR:$rs2))>;
let Predicates = [IsRV64] in {
// Select W instructions if only the lower 32-bits of the result are used.
def : Pat<(binop_allwusers<add> GPR:$rs1, (AddiPair:$rs2)),
- (ADDIW (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2)),
+ (ADDIW (i64 (ADDIW GPR:$rs1, (AddiPairImmLarge AddiPair:$rs2))),
(AddiPairImmSmall AddiPair:$rs2))>;
}
@@ -1929,7 +1929,7 @@ def : PatGprImm<srl, SRLIW, uimm5, i32>;
def : PatGprImm<sra, SRAIW, uimm5, i32>;
def : Pat<(i32 (and GPR:$rs, TrailingOnesMask:$mask)),
- (SRLI (SLLI $rs, (i64 (XLenSubTrailingOnes $mask))),
+ (SRLI (i32 (SLLI $rs, (i64 (XLenSubTrailingOnes $mask)))),
(i64 (XLenSubTrailingOnes $mask)))>;
// Use sext if the sign bit of the input is 0.
@@ -1937,12 +1937,12 @@ def : Pat<(zext_is_sext GPR:$src), (ADDIW GPR:$src, 0)>;
}
let Predicates = [IsRV64, NotHasStdExtZba] in {
-def : Pat<(zext GPR:$src), (SRLI (SLLI GPR:$src, 32), 32)>;
+def : Pat<(zext GPR:$src), (SRLI (i64 (SLLI GPR:$src, 32)), 32)>;
// If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2
// shifts instead of 3. This can occur when unsigned is used to index an array.
def : Pat<(shl (zext GPR:$rs), uimm5:$shamt),
- (SRLI (SLLI GPR:$rs, 32), (ImmSubFrom32 uimm5:$shamt))>;
+ (SRLI (i64 (SLLI GPR:$rs, 32)), (ImmSubFrom32 uimm5:$shamt))>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index fec43d814098..9b4f93d55e33 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -410,11 +410,11 @@ foreach Ext = DExts in {
let Predicates = [HasStdExtD] in {
// Match signaling FEQ_D
def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETEQ)),
- (AND (FLE_D $rs1, $rs2),
- (FLE_D $rs2, $rs1))>;
+ (AND (XLenVT (FLE_D $rs1, $rs2)),
+ (XLenVT (FLE_D $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETOEQ)),
- (AND (FLE_D $rs1, $rs2),
- (FLE_D $rs2, $rs1))>;
+ (AND (XLenVT (FLE_D $rs1, $rs2)),
+ (XLenVT (FLE_D $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETEQ)),
(FLE_D $rs1, $rs1)>;
@@ -430,11 +430,11 @@ def : PatSetCC<FPR64, any_fsetccs, SETOLE, FLE_D, f64>;
let Predicates = [HasStdExtZdinx, IsRV64] in {
// Match signaling FEQ_D
def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETEQ)),
- (AND (FLE_D_INX $rs1, $rs2),
- (FLE_D_INX $rs2, $rs1))>;
+ (AND (XLenVT (FLE_D_INX $rs1, $rs2)),
+ (XLenVT (FLE_D_INX $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs2, SETOEQ)),
- (AND (FLE_D_INX $rs1, $rs2),
- (FLE_D_INX $rs2, $rs1))>;
+ (AND (XLenVT (FLE_D_INX $rs1, $rs2)),
+ (XLenVT (FLE_D_INX $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs (f64 FPR64INX:$rs1), FPR64INX:$rs1, SETEQ)),
(FLE_D_INX $rs1, $rs1)>;
@@ -450,11 +450,11 @@ def : PatSetCC<FPR64INX, any_fsetccs, SETOLE, FLE_D_INX, f64>;
let Predicates = [HasStdExtZdinx, IsRV32] in {
// Match signaling FEQ_D
def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETEQ)),
- (AND (FLE_D_IN32X $rs1, $rs2),
- (FLE_D_IN32X $rs2, $rs1))>;
+ (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)),
+ (XLenVT (FLE_D_IN32X $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETOEQ)),
- (AND (FLE_D_IN32X $rs1, $rs2),
- (FLE_D_IN32X $rs2, $rs1))>;
+ (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)),
+ (XLenVT (FLE_D_IN32X $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETEQ)),
(FLE_D_IN32X $rs1, $rs1)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 52eadbdec255..7d89608de122 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -617,11 +617,11 @@ foreach Ext = FExts in {
let Predicates = [HasStdExtF] in {
// Match signaling FEQ_S
def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETEQ)),
- (AND (FLE_S $rs1, $rs2),
- (FLE_S $rs2, $rs1))>;
+ (AND (XLenVT (FLE_S $rs1, $rs2)),
+ (XLenVT (FLE_S $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETOEQ)),
- (AND (FLE_S $rs1, $rs2),
- (FLE_S $rs2, $rs1))>;
+ (AND (XLenVT (FLE_S $rs1, $rs2)),
+ (XLenVT (FLE_S $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETEQ)),
(FLE_S $rs1, $rs1)>;
@@ -632,11 +632,11 @@ def : Pat<(XLenVT (strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETOEQ)),
let Predicates = [HasStdExtZfinx] in {
// Match signaling FEQ_S
def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs2, SETEQ)),
- (AND (FLE_S_INX $rs1, $rs2),
- (FLE_S_INX $rs2, $rs1))>;
+ (AND (XLenVT (FLE_S_INX $rs1, $rs2)),
+ (XLenVT (FLE_S_INX $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs2, SETOEQ)),
- (AND (FLE_S_INX $rs1, $rs2),
- (FLE_S_INX $rs2, $rs1))>;
+ (AND (XLenVT (FLE_S_INX $rs1, $rs2)),
+ (XLenVT (FLE_S_INX $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs FPR32INX:$rs1, FPR32INX:$rs1, SETEQ)),
(FLE_S_INX $rs1, $rs1)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index f9890ca4b0ee..6b43d4393f76 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -112,7 +112,7 @@ let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in {
// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish
// zeroing the upper 32 bits.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
- (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
+ (MULHU (i64 (SLLI GPR:$rs1, 32)), (i64 (SLLI GPR:$rs2, 32)))>;
} // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba]
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index ff474e4616bd..79ced3864363 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -548,65 +548,66 @@ def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)),
(TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 1)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 1)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 1)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 1)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 1)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 1)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 2)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 2)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 2)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 1), 3)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 3)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 2), 3)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 3)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2),
- (TH_ADDSL GPR:$rs2, (TH_ADDSL GPR:$rs1, GPR:$rs1, 3), 3)>;
+ (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 3)>;
def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i),
- (TH_ADDSL GPR:$r, (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i)), 2)>;
+ (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), 2)>;
def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i),
- (TH_ADDSL GPR:$r, (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i)), 3)>;
+ (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), 3)>;
def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i),
- (SLLI (TH_ADDSL GPR:$r, GPR:$r, 1),
+ (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)),
(TrailingZeros C3LeftShift:$i))>;
def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i),
- (SLLI (TH_ADDSL GPR:$r, GPR:$r, 2),
+ (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
(TrailingZeros C5LeftShift:$i))>;
def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i),
- (SLLI (TH_ADDSL GPR:$r, GPR:$r, 3),
+ (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)),
(TrailingZeros C9LeftShift:$i))>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 1)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 1)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 1)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 1)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 1), 2)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)), 2)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 2)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 2)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 2)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)),
- (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 2), (TH_ADDSL GPR:$r, GPR:$r, 2), 2)>;
+ (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
+ (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 2), 3)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 3)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)),
- (TH_ADDSL GPR:$r, (TH_ADDSL GPR:$r, GPR:$r, 3), 3)>;
+ (TH_ADDSL GPR:$r, (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 3)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)),
- (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 1)>;
+ (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 1)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)),
- (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 2)>;
+ (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 2)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)),
- (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 3), (TH_ADDSL GPR:$r, GPR:$r, 3), 3)>;
+ (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)), 3)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)),
- (SLLI (TH_ADDSL (TH_ADDSL GPR:$r, GPR:$r, 2),
- (TH_ADDSL GPR:$r, GPR:$r, 2), 2), 3)>;
+ (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
+ (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)), 3)>;
} // Predicates = [HasVendorXTHeadBa]
let Predicates = [HasVendorXTHeadBb] in {
@@ -633,14 +634,14 @@ def : Pat<(sra (bswap i64:$rs1), (i64 32)),
def : Pat<(binop_allwusers<srl> (bswap i64:$rs1), (i64 32)),
(TH_REVW i64:$rs1)>;
def : Pat<(riscv_clzw i64:$rs1),
- (TH_FF0 (SLLI (XORI i64:$rs1, -1), 32))>;
+ (TH_FF0 (i64 (SLLI (i64 (XORI i64:$rs1, -1)), 32)))>;
} // Predicates = [HasVendorXTHeadBb, IsRV64]
let Predicates = [HasVendorXTHeadBs] in {
def : Pat<(and (srl (XLenVT GPR:$rs1), uimmlog2xlen:$shamt), 1),
(TH_TST GPR:$rs1, uimmlog2xlen:$shamt)>;
def : Pat<(XLenVT (seteq (and (XLenVT GPR:$rs1), SingleBitSetMask:$mask), 0)),
- (TH_TST (XORI GPR:$rs1, -1), SingleBitSetMask:$mask)>;
+ (TH_TST (XLenVT (XORI GPR:$rs1, -1)), SingleBitSetMask:$mask)>;
} // Predicates = [HasVendorXTHeadBs]
let Predicates = [HasVendorXTHeadCondMov] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index f8938c0a98d1..f0f8494dd9a3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -555,7 +555,7 @@ def : Pat<(XLenVT (and (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)), 1)),
def : Pat<(XLenVT (shiftop<shl> 1, (XLenVT GPR:$rs2))),
(BSET (XLenVT X0), GPR:$rs2)>;
def : Pat<(XLenVT (not (shiftop<shl> -1, (XLenVT GPR:$rs2)))),
- (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>;
+ (ADDI (XLenVT (BSET (XLenVT X0), GPR:$rs2)), -1)>;
def : Pat<(XLenVT (and GPR:$rs1, BCLRMask:$mask)),
(BCLRI GPR:$rs1, BCLRMask:$mask)>;
@@ -568,25 +568,25 @@ def : Pat<(XLenVT (and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1))),
(BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
def : Pat<(XLenVT (seteq (XLenVT (and GPR:$rs1, SingleBitSetMask:$mask)), 0)),
- (BEXTI (XORI GPR:$rs1, -1), SingleBitSetMask:$mask)>;
+ (BEXTI (XLenVT (XORI GPR:$rs1, -1)), SingleBitSetMask:$mask)>;
def : Pat<(XLenVT (or GPR:$r, BSETINVTwoBitsMask:$i)),
- (BSETI (BSETI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i)),
+ (BSETI (XLenVT (BSETI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i))),
(BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>;
def : Pat<(XLenVT (xor GPR:$r, BSETINVTwoBitsMask:$i)),
- (BINVI (BINVI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i)),
+ (BINVI (XLenVT (BINVI GPR:$r, (TrailingZeros BSETINVTwoBitsMask:$i))),
(BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>;
def : Pat<(XLenVT (or GPR:$r, BSETINVORIMask:$i)),
- (BSETI (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)),
+ (BSETI (XLenVT (ORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i))),
(BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>;
def : Pat<(XLenVT (xor GPR:$r, BSETINVORIMask:$i)),
- (BINVI (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i)),
+ (BINVI (XLenVT (XORI GPR:$r, (BSETINVORIMaskLow BSETINVORIMask:$i))),
(BSETINVTwoBitsMaskHigh BSETINVORIMask:$i))>;
def : Pat<(XLenVT (and GPR:$r, BCLRITwoBitsMask:$i)),
- (BCLRI (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i)),
+ (BCLRI (XLenVT (BCLRI GPR:$r, (BCLRITwoBitsMaskLow BCLRITwoBitsMask:$i))),
(BCLRITwoBitsMaskHigh BCLRITwoBitsMask:$i))>;
def : Pat<(XLenVT (and GPR:$r, BCLRIANDIMask:$i)),
- (BCLRI (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i)),
+ (BCLRI (XLenVT (ANDI GPR:$r, (BCLRIANDIMaskLow BCLRIANDIMask:$i))),
(BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>;
} // Predicates = [HasStdExtZbs]
@@ -614,7 +614,7 @@ def : PatGpr<riscv_ctzw, CTZW>;
def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
def : Pat<(i64 (riscv_absw GPR:$rs1)),
- (MAX GPR:$rs1, (SUBW (XLenVT X0), GPR:$rs1))>;
+ (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbb] in {
@@ -686,63 +686,66 @@ foreach i = {1,2,3} in {
}
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2),
- (SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH1ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2),
- (SH1ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH1ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2),
- (SH1ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH1ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2),
- (SH2ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH2ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2),
- (SH2ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH2ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2),
- (SH2ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH2ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2),
- (SH3ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH3ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2),
- (SH3ADD (SH2ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH3ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2),
- (SH3ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+ (SH3ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i),
- (SH2ADD (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i)),
+ (SH2ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))),
GPR:$r)>;
def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i),
- (SH3ADD (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i)),
+ (SH3ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))),
GPR:$r)>;
def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i),
- (SLLI (SH1ADD GPR:$r, GPR:$r),
+ (SLLI (XLenVT (SH1ADD GPR:$r, GPR:$r)),
(TrailingZeros C3LeftShift:$i))>;
def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i),
- (SLLI (SH2ADD GPR:$r, GPR:$r),
+ (SLLI (XLenVT (SH2ADD GPR:$r, GPR:$r)),
(TrailingZeros C5LeftShift:$i))>;
def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i),
- (SLLI (SH3ADD GPR:$r, GPR:$r),
+ (SLLI (XLenVT (SH3ADD GPR:$r, GPR:$r)),
(TrailingZeros C9LeftShift:$i))>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)),
- (SH1ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH1ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)),
- (SH1ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH1ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)),
- (SH2ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH2ADD (XLenVT (SH1ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)),
- (SH2ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH2ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)),
- (SH2ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH2ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)),
- (SH3ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH3ADD (XLenVT (SH1ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)),
- (SH3ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH3ADD (XLenVT (SH2ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)),
- (SH3ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>;
+ (SH3ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)), GPR:$r)>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)),
- (SH1ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>;
+ (SH1ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)),
+ (XLenVT (SH3ADD GPR:$r, GPR:$r)))>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)),
- (SH2ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>;
+ (SH2ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)),
+ (XLenVT (SH3ADD GPR:$r, GPR:$r)))>;
def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)),
- (SH3ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>;
+ (SH3ADD (XLenVT (SH3ADD GPR:$r, GPR:$r)),
+ (XLenVT (SH3ADD GPR:$r, GPR:$r)))>;
} // Predicates = [HasStdExtZba]
let Predicates = [HasStdExtZba, IsRV64] in {
@@ -751,7 +754,7 @@ def : Pat<(i64 (shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt)),
// Match a shifted 0xffffffff mask. Use SRLI to clear the LSBs and SLLI_UW to
// mask and shift.
def : Pat<(i64 (and GPR:$rs1, Shifted32OnesMask:$mask)),
- (SLLI_UW (SRLI GPR:$rs1, Shifted32OnesMask:$mask),
+ (SLLI_UW (XLenVT (SRLI GPR:$rs1, Shifted32OnesMask:$mask)),
Shifted32OnesMask:$mask)>;
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFF), GPR:$rs2)),
(ADD_UW GPR:$rs1, GPR:$rs2)>;
@@ -781,29 +784,29 @@ foreach i = {1,2,3} in {
}
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFE), (XLenVT GPR:$rs2))),
- (SH1ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>;
+ (SH1ADD (XLenVT (SRLIW GPR:$rs1, 1)), GPR:$rs2)>;
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFFC), (XLenVT GPR:$rs2))),
- (SH2ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>;
+ (SH2ADD (XLenVT (SRLIW GPR:$rs1, 2)), GPR:$rs2)>;
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0xFFFFFFF8), (XLenVT GPR:$rs2))),
- (SH3ADD (SRLIW GPR:$rs1, 3), GPR:$rs2)>;
+ (SH3ADD (XLenVT (SRLIW GPR:$rs1, 3)), GPR:$rs2)>;
// Use SRLI to clear the LSBs and SHXADD_UW to mask and shift.
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x1FFFFFFFE), (XLenVT GPR:$rs2))),
- (SH1ADD_UW (SRLI GPR:$rs1, 1), GPR:$rs2)>;
+ (SH1ADD_UW (XLenVT (SRLI GPR:$rs1, 1)), GPR:$rs2)>;
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x3FFFFFFFC), (XLenVT GPR:$rs2))),
- (SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>;
+ (SH2ADD_UW (XLenVT (SRLI GPR:$rs1, 2)), GPR:$rs2)>;
def : Pat<(i64 (add_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8), (XLenVT GPR:$rs2))),
- (SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>;
+ (SH3ADD_UW (XLenVT (SRLI GPR:$rs1, 3)), GPR:$rs2)>;
def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C3LeftShiftUW:$i)),
- (SH1ADD (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i)),
- (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i)))>;
+ (SH1ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))),
+ (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))))>;
def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C5LeftShiftUW:$i)),
- (SH2ADD (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i)),
- (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i)))>;
+ (SH2ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))),
+ (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))))>;
def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C9LeftShiftUW:$i)),
- (SH3ADD (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i)),
- (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i)))>;
+ (SH3ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))),
+ (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))))>;
} // Predicates = [HasStdExtZba, IsRV64]
let Predicates = [HasStdExtZbcOrZbkc] in {
@@ -888,23 +891,23 @@ foreach i = {1,2,3} in {
}
let Predicates = [HasStdExtZbs, IsRV64] in {
-def : Pat<(i32 (and (not (shiftop<shl> 1, (XLenVT GPR:$rs2))), GPR:$rs1)),
+def : Pat<(i32 (and (not (shiftop<shl> 1, (i64 GPR:$rs2))), GPR:$rs1)),
(BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (and (rotl -2, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (and (rotl -2, (i64 GPR:$rs2)), GPR:$rs1)),
(BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (or (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (or (shiftop<shl> 1, (i64 GPR:$rs2)), GPR:$rs1)),
(BSET GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (xor (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
+def : Pat<(i32 (xor (shiftop<shl> 1, (i64 GPR:$rs2)), GPR:$rs1)),
(BINV GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (and (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)), 1)),
+def : Pat<(i32 (and (shiftop<srl> GPR:$rs1, (i64 GPR:$rs2)), 1)),
(BEXT GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i64 (and (anyext (i32 (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)))), 1)),
+def : Pat<(i64 (and (anyext (i32 (shiftop<srl> GPR:$rs1, (i64 GPR:$rs2)))), 1)),
(BEXT GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i32 (shiftop<shl> 1, (XLenVT GPR:$rs2))),
+def : Pat<(i32 (shiftop<shl> 1, (i64 GPR:$rs2))),
(BSET (XLenVT X0), GPR:$rs2)>;
-def : Pat<(i32 (not (shiftop<shl> -1, (XLenVT GPR:$rs2)))),
- (ADDI (BSET (XLenVT X0), GPR:$rs2), -1)>;
+def : Pat<(i32 (not (shiftop<shl> -1, (i64 GPR:$rs2)))),
+ (ADDI (i32 (BSET (XLenVT X0), GPR:$rs2)), -1)>;
def : Pat<(i32 (and (srl GPR:$rs1, uimm5:$shamt), (i32 1))),
(BEXTI GPR:$rs1, uimm5:$shamt)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index 2e0f754cdf7c..e0f1c7154834 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -366,11 +366,11 @@ foreach Ext = ZfhExts in {
let Predicates = [HasStdExtZfh] in {
// Match signaling FEQ_H
def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), FPR16:$rs2, SETEQ)),
- (AND (FLE_H $rs1, $rs2),
- (FLE_H $rs2, $rs1))>;
+ (AND (XLenVT (FLE_H $rs1, $rs2)),
+ (XLenVT (FLE_H $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), FPR16:$rs2, SETOEQ)),
- (AND (FLE_H $rs1, $rs2),
- (FLE_H $rs2, $rs1))>;
+ (AND (XLenVT (FLE_H $rs1, $rs2)),
+ (XLenVT (FLE_H $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), (f16 FPR16:$rs1), SETEQ)),
(FLE_H $rs1, $rs1)>;
@@ -381,11 +381,11 @@ def : Pat<(XLenVT (strict_fsetccs (f16 FPR16:$rs1), (f16 FPR16:$rs1), SETOEQ)),
let Predicates = [HasStdExtZhinx] in {
// Match signaling FEQ_H
def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs2, SETEQ)),
- (AND (FLE_H_INX $rs1, $rs2),
- (FLE_H_INX $rs2, $rs1))>;
+ (AND (XLenVT (FLE_H_INX $rs1, $rs2)),
+ (XLenVT (FLE_H_INX $rs2, $rs1)))>;
def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs2, SETOEQ)),
- (AND (FLE_H_INX $rs1, $rs2),
- (FLE_H_INX $rs2, $rs1))>;
+ (AND (XLenVT (FLE_H_INX $rs1, $rs2)),
+ (XLenVT (FLE_H_INX $rs2, $rs1)))>;
// If both operands are the same, use a single FLE.
def : Pat<(XLenVT (strict_fsetccs FPR16INX:$rs1, FPR16INX:$rs1, SETEQ)),
(FLE_H_INX $rs1, $rs1)>;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 59bb811058d4..8c75df41f5e3 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -330,7 +330,7 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
TuneLDADDFusion]>;
def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
- NoSchedModel,
+ XiangShanNanHuModel,
[Feature64Bit,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
@@ -348,4 +348,8 @@ def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
FeatureStdExtZksh,
FeatureStdExtSvinval,
FeatureStdExtZicbom,
- FeatureStdExtZicboz]>;
+ FeatureStdExtZicboz],
+ [TuneNoDefaultUnroll,
+ TuneZExtHFusion,
+ TuneZExtWFusion,
+ TuneShiftedZExtWFusion]>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
new file mode 100644
index 000000000000..667b5983cb40
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
@@ -0,0 +1,308 @@
+//==- RISCVSchedXiangShanNanHu.td - XS-NanHu Scheduling Defs -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// XiangShan is a high-performance open-source RISC-V processor developed by
+// the Institute of Computing Technology (ICT), Chinese Academy of Sciences.
+// Source: https://github.com/OpenXiangShan/XiangShan
+// Documentation: https://github.com/OpenXiangShan/XiangShan-doc
+
+// XiangShan-NanHu is the second generation of XiangShan processor series.
+// Overview: https://xiangshan-doc.readthedocs.io/zh-cn/latest/integration/overview/
+
+def XiangShanNanHuModel : SchedMachineModel {
+ let MicroOpBufferSize = 256;
+ let LoopMicroOpBufferSize = 48; // Instruction queue size
+ let IssueWidth = 6; // 6-way decode and dispatch
+ let LoadLatency = 4;
+ let MispredictPenalty = 11; // Based on estimate of pipeline depth.
+ let CompleteModel = 0;
+ let UnsupportedFeatures = [HasStdExtZcmt, HasStdExtZkr, HasVInstructions,
+ HasVInstructionsI64];
+}
+
+let SchedModel = XiangShanNanHuModel in {
+
+// The reservation stations are distributed and grouped as 32-entry or 16-entry smaller ones.
+let BufferSize = 16 in {
+ def XS2ALU : ProcResource<4>;
+ def XS2MDU : ProcResource<2>;
+ def XS2MISC : ProcResource<1>;
+
+ def XS2FMAC : ProcResource<4>;
+ def XS2FMISC : ProcResource<2>;
+
+ // Load/Store queues are ignored.
+ def XS2LD : ProcResource<2>;
+ def XS2ST : ProcResource<2>;
+}
+
+// Branching
+def : WriteRes<WriteJmp, [XS2MISC]>;
+def : WriteRes<WriteJal, [XS2MISC]>;
+def : WriteRes<WriteJalr, [XS2MISC]>;
+
+// Integer arithmetic and logic
+let Latency = 1 in {
+def : WriteRes<WriteIALU, [XS2ALU]>;
+def : WriteRes<WriteIALU32, [XS2ALU]>;
+def : WriteRes<WriteShiftImm, [XS2ALU]>;
+def : WriteRes<WriteShiftImm32, [XS2ALU]>;
+def : WriteRes<WriteShiftReg, [XS2ALU]>;
+def : WriteRes<WriteShiftReg32, [XS2ALU]>;
+}
+
+// Integer multiplication
+let Latency = 3 in {
+def : WriteRes<WriteIMul, [XS2MDU]>;
+def : WriteRes<WriteIMul32, [XS2MDU]>;
+}
+
+// Integer division
+// SRT16 algorithm
+let Latency = 20, ReleaseAtCycles = [20] in {
+def : WriteRes<WriteIDiv32, [XS2MDU]>;
+def : WriteRes<WriteIDiv, [XS2MDU]>;
+}
+
+// Zb*
+let Latency = 1 in {
+// Zba
+def : WriteRes<WriteSHXADD, [XS2ALU]>;
+def : WriteRes<WriteSHXADD32, [XS2ALU]>;
+
+// Zbb
+def : WriteRes<WriteRotateImm, [XS2ALU]>;
+def : WriteRes<WriteRotateImm32, [XS2ALU]>;
+def : WriteRes<WriteRotateReg, [XS2ALU]>;
+def : WriteRes<WriteRotateReg32, [XS2ALU]>;
+def : WriteRes<WriteORCB, [XS2ALU]>;
+def : WriteRes<WriteREV8, [XS2ALU]>;
+
+// Zbkb
+def : WriteRes<WriteBREV8, [XS2ALU]>;
+def : WriteRes<WritePACK, [XS2ALU]>;
+def : WriteRes<WritePACK32, [XS2ALU]>;
+def : WriteRes<WriteZIP, [XS2ALU]>;
+
+// Zbs
+def : WriteRes<WriteSingleBit, [XS2ALU]>;
+def : WriteRes<WriteSingleBitImm, [XS2ALU]>;
+def : WriteRes<WriteBEXT, [XS2ALU]>;
+def : WriteRes<WriteBEXTI, [XS2ALU]>;
+}
+
+let Latency = 3 in {
+// Zbb
+def : WriteRes<WriteCLZ, [XS2MDU]>;
+def : WriteRes<WriteCLZ32, [XS2MDU]>;
+def : WriteRes<WriteCTZ, [XS2MDU]>;
+def : WriteRes<WriteCTZ32, [XS2MDU]>;
+def : WriteRes<WriteCPOP, [XS2MDU]>;
+def : WriteRes<WriteCPOP32, [XS2MDU]>;
+
+// Zbkc
+def : WriteRes<WriteCLMUL, [XS2MDU]>;
+
+// Zbkx
+def : WriteRes<WriteXPERM, [XS2MDU]>;
+}
+
+// Memory
+def : WriteRes<WriteSTB, [XS2ST]>;
+def : WriteRes<WriteSTH, [XS2ST]>;
+def : WriteRes<WriteSTW, [XS2ST]>;
+def : WriteRes<WriteSTD, [XS2ST]>;
+def : WriteRes<WriteFST32, [XS2ST]>;
+def : WriteRes<WriteFST64, [XS2ST]>;
+def : WriteRes<WriteAtomicSTW, [XS2ST]>;
+def : WriteRes<WriteAtomicSTD, [XS2ST]>;
+
+let Latency = 5 in {
+def : WriteRes<WriteLDB, [XS2LD]>;
+def : WriteRes<WriteLDH, [XS2LD]>;
+def : WriteRes<WriteLDW, [XS2LD]>;
+def : WriteRes<WriteLDD, [XS2LD]>;
+
+def : WriteRes<WriteAtomicW, [XS2LD]>;
+def : WriteRes<WriteAtomicD, [XS2LD]>;
+def : WriteRes<WriteAtomicLDW, [XS2LD]>;
+def : WriteRes<WriteAtomicLDD, [XS2LD]>;
+
+def : WriteRes<WriteFLD32, [XS2LD]>;
+def : WriteRes<WriteFLD64, [XS2LD]>;
+}
+
+// XiangShan-NanHu uses FuDian FPU instead of Berkeley HardFloat.
+// Documentation: https://github.com/OpenXiangShan/fudian
+
+let Latency = 3 in {
+def : WriteRes<WriteFAdd32, [XS2FMAC]>;
+def : WriteRes<WriteFSGNJ32, [XS2FMAC]>;
+def : WriteRes<WriteFMinMax32, [XS2FMAC]>;
+def : WriteRes<WriteFAdd64, [XS2FMAC]>;
+def : WriteRes<WriteFSGNJ64, [XS2FMAC]>;
+def : WriteRes<WriteFMinMax64, [XS2FMAC]>;
+
+def : WriteRes<WriteFCvtI32ToF32, [XS2FMAC]>;
+def : WriteRes<WriteFCvtI32ToF64, [XS2FMAC]>;
+def : WriteRes<WriteFCvtI64ToF32, [XS2FMAC]>;
+def : WriteRes<WriteFCvtI64ToF64, [XS2FMAC]>;
+def : WriteRes<WriteFCvtF32ToI32, [XS2FMAC]>;
+def : WriteRes<WriteFCvtF32ToI64, [XS2FMAC]>;
+def : WriteRes<WriteFCvtF64ToI32, [XS2FMAC]>;
+def : WriteRes<WriteFCvtF64ToI64, [XS2FMAC]>;
+def : WriteRes<WriteFCvtF32ToF64, [XS2FMAC]>;
+def : WriteRes<WriteFCvtF64ToF32, [XS2FMAC]>;
+
+def : WriteRes<WriteFClass32, [XS2FMAC]>;
+def : WriteRes<WriteFClass64, [XS2FMAC]>;
+def : WriteRes<WriteFCmp32, [XS2FMAC]>;
+def : WriteRes<WriteFCmp64, [XS2FMAC]>;
+def : WriteRes<WriteFMovF32ToI32, [XS2FMAC]>;
+def : WriteRes<WriteFMovI32ToF32, [XS2FMAC]>;
+def : WriteRes<WriteFMovF64ToI64, [XS2FMAC]>;
+def : WriteRes<WriteFMovI64ToF64, [XS2FMAC]>;
+}
+
+// FP multiplication
+let Latency = 3 in {
+def : WriteRes<WriteFMul32, [XS2FMAC]>;
+def : WriteRes<WriteFMul64, [XS2FMAC]>;
+}
+
+let Latency = 5 in {
+def : WriteRes<WriteFMA32, [XS2FMAC]>;
+def : WriteRes<WriteFMA64, [XS2FMAC]>;
+}
+
+// FP division
+def : WriteRes<WriteFDiv32, [XS2FMISC]> {
+ let Latency = 11;
+}
+def : WriteRes<WriteFDiv64, [XS2FMISC]> {
+ let Latency = 18;
+}
+
+def : WriteRes<WriteFSqrt32, [XS2FMISC]> {
+ let Latency = 17;
+}
+def : WriteRes<WriteFSqrt64, [XS2FMISC]> {
+ let Latency = 31;
+}
+
+// Others
+def : WriteRes<WriteCSR, [XS2MISC]>;
+def : WriteRes<WriteNop, []>;
+
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
+// Bypass and advance
+
+class XS2LoadToALUBypass<SchedRead read>
+ : ReadAdvance<read, 1, [WriteLDB, WriteLDH, WriteLDW, WriteLDD, WriteAtomicW, WriteAtomicD, WriteAtomicLDW, WriteAtomicLDD]>;
+
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : XS2LoadToALUBypass<ReadIALU>;
+def : XS2LoadToALUBypass<ReadIALU32>;
+def : XS2LoadToALUBypass<ReadShiftImm>;
+def : XS2LoadToALUBypass<ReadShiftImm32>;
+def : XS2LoadToALUBypass<ReadShiftReg>;
+def : XS2LoadToALUBypass<ReadShiftReg32>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFStoreData, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFAdd32, 0>;
+def : ReadAdvance<ReadFAdd64, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
+def : ReadAdvance<ReadFMA32Addend, 2>; // Cascade FMA
+def : ReadAdvance<ReadFMA64, 0>;
+def : ReadAdvance<ReadFMA64Addend, 2>; // Cascade FMA
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+
+// Zb*
+// Zba
+def : XS2LoadToALUBypass<ReadSHXADD>;
+def : XS2LoadToALUBypass<ReadSHXADD32>;
+// Zbb
+def : XS2LoadToALUBypass<ReadRotateImm>;
+def : XS2LoadToALUBypass<ReadRotateImm32>;
+def : XS2LoadToALUBypass<ReadRotateReg>;
+def : XS2LoadToALUBypass<ReadRotateReg32>;
+def : ReadAdvance<ReadCLZ, 0>;
+def : ReadAdvance<ReadCLZ32, 0>;
+def : ReadAdvance<ReadCTZ, 0>;
+def : ReadAdvance<ReadCTZ32, 0>;
+def : ReadAdvance<ReadCPOP, 0>;
+def : ReadAdvance<ReadCPOP32, 0>;
+def : XS2LoadToALUBypass<ReadORCB>;
+def : XS2LoadToALUBypass<ReadREV8>;
+// Zbkc
+def : ReadAdvance<ReadCLMUL, 0>;
+// Zbs
+def : XS2LoadToALUBypass<ReadSingleBit>;
+def : XS2LoadToALUBypass<ReadSingleBitImm>;
+// Zbkb
+def : XS2LoadToALUBypass<ReadBREV8>;
+def : XS2LoadToALUBypass<ReadPACK>;
+def : XS2LoadToALUBypass<ReadPACK32>;
+def : XS2LoadToALUBypass<ReadZIP>;
+// Zbkx
+def : ReadAdvance<ReadXPERM, 0>;
+
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedV;
+defm : UnsupportedSchedZfa;
+defm : UnsupportedSchedZfh;
+defm : UnsupportedSchedSFB;
+defm : UnsupportedSchedZabha;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 4c3da3ad3111..adef40e19cba 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -366,6 +366,7 @@ public:
void addIRPasses() override;
bool addPreISel() override;
+ void addCodeGenPrepare() override;
bool addInstSelector() override;
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
@@ -452,6 +453,12 @@ bool RISCVPassConfig::addPreISel() {
return false;
}
+void RISCVPassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(createTypePromotionLegacyPass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool RISCVPassConfig::addInstSelector() {
addPass(createRISCVISelDag(getRISCVTargetMachine(), getOptLevel()));
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index e4593e7db90e..8721b900c8be 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -13,6 +13,7 @@
#include "SPIRVBuiltins.h"
#include "SPIRV.h"
+#include "SPIRVSubtarget.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -82,6 +83,16 @@ struct GroupBuiltin {
#define GET_GroupBuiltins_DECL
#define GET_GroupBuiltins_IMPL
+struct IntelSubgroupsBuiltin {
+ StringRef Name;
+ uint32_t Opcode;
+ bool IsBlock;
+ bool IsWrite;
+};
+
+#define GET_IntelSubgroupsBuiltins_DECL
+#define GET_IntelSubgroupsBuiltins_IMPL
+
struct GetBuiltin {
StringRef Name;
InstructionSet::InstructionSet Set;
@@ -549,6 +560,7 @@ static bool buildAtomicCompareExchangeInst(const SPIRV::IncomingCall *Call,
assert(GR->getSPIRVTypeForVReg(ObjectPtr)->getOpcode() ==
SPIRV::OpTypePointer);
unsigned ExpectedType = GR->getSPIRVTypeForVReg(ExpectedArg)->getOpcode();
+ (void)ExpectedType;
assert(IsCmpxchg ? ExpectedType == SPIRV::OpTypeInt
: ExpectedType == SPIRV::OpTypePointer);
assert(GR->isScalarOfType(Desired, SPIRV::OpTypeInt));
@@ -849,6 +861,7 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call,
if (GroupBuiltin->HasBoolArg) {
Register ConstRegister = Call->Arguments[0];
auto ArgInstruction = getDefInstrMaybeConstant(ConstRegister, MRI);
+ (void)ArgInstruction;
// TODO: support non-constant bool values.
assert(ArgInstruction->getOpcode() == TargetOpcode::G_CONSTANT &&
"Only constant bool value args are supported");
@@ -900,6 +913,67 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call,
return true;
}
+static bool generateIntelSubgroupsInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ MachineFunction &MF = MIRBuilder.getMF();
+ const auto *ST = static_cast<const SPIRVSubtarget *>(&MF.getSubtarget());
+ if (!ST->canUseExtension(SPIRV::Extension::SPV_INTEL_subgroups)) {
+ std::string DiagMsg = std::string(Builtin->Name) +
+ ": the builtin requires the following SPIR-V "
+ "extension: SPV_INTEL_subgroups";
+ report_fatal_error(DiagMsg.c_str(), false);
+ }
+ const SPIRV::IntelSubgroupsBuiltin *IntelSubgroups =
+ SPIRV::lookupIntelSubgroupsBuiltin(Builtin->Name);
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+
+ uint32_t OpCode = IntelSubgroups->Opcode;
+ if (IntelSubgroups->IsBlock) {
+ // Minimal number or arguments set in TableGen records is 1
+ if (SPIRVType *Arg0Type = GR->getSPIRVTypeForVReg(Call->Arguments[0])) {
+ if (Arg0Type->getOpcode() == SPIRV::OpTypeImage) {
+ // TODO: add required validation from the specification:
+ // "'Image' must be an object whose type is OpTypeImage with a 'Sampled'
+ // operand of 0 or 2. If the 'Sampled' operand is 2, then some
+ // dimensions require a capability."
+ switch (OpCode) {
+ case SPIRV::OpSubgroupBlockReadINTEL:
+ OpCode = SPIRV::OpSubgroupImageBlockReadINTEL;
+ break;
+ case SPIRV::OpSubgroupBlockWriteINTEL:
+ OpCode = SPIRV::OpSubgroupImageBlockWriteINTEL;
+ break;
+ }
+ }
+ }
+ }
+
+ // TODO: opaque pointers types should be eventually resolved in such a way
+ // that validation of block read is enabled with respect to the following
+ // specification requirement:
+ // "'Result Type' may be a scalar or vector type, and its component type must
+ // be equal to the type pointed to by 'Ptr'."
+ // For example, function parameter type should not be default i8 pointer, but
+ // depend on the result type of the instruction where it is used as a pointer
+ // argument of OpSubgroupBlockReadINTEL
+
+ // Build Intel subgroups instruction
+ MachineInstrBuilder MIB =
+ IntelSubgroups->IsWrite
+ ? MIRBuilder.buildInstr(OpCode)
+ : MIRBuilder.buildInstr(OpCode)
+ .addDef(Call->ReturnRegister)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType));
+ for (size_t i = 0; i < Call->Arguments.size(); ++i) {
+ MIB.addUse(Call->Arguments[i]);
+ MRI->setRegClass(Call->Arguments[i], &SPIRV::IDRegClass);
+ }
+
+ return true;
+}
+
// These queries ask for a single size_t result for a given dimension index, e.g
// size_t get_global_id(uint dimindex). In SPIR-V, the builtins corresonding to
// these values are all vec3 types, so we need to extract the correct index or
@@ -1199,6 +1273,7 @@ static bool generateImageMiscQueryInst(const SPIRV::IncomingCall *Call,
MIRBuilder.getMRI()->setRegClass(Image, &SPIRV::IDRegClass);
SPIRV::Dim::Dim ImageDimensionality = static_cast<SPIRV::Dim::Dim>(
GR->getSPIRVTypeForVReg(Image)->getOperand(2).getImm());
+ (void)ImageDimensionality;
switch (Opcode) {
case SPIRV::OpImageQuerySamples:
@@ -1976,6 +2051,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateVectorLoadStoreInst(Call.get(), MIRBuilder, GR);
case SPIRV::LoadStore:
return generateLoadStoreInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::IntelSubgroups:
+ return generateIntelSubgroupsInst(Call.get(), MIRBuilder, GR);
}
return false;
}
@@ -2119,6 +2196,7 @@ parseBuiltinTypeNameToTargetExtType(std::string TypeName,
for (unsigned i = HasTypeParameter ? 1 : 0; i < Parameters.size(); i++) {
unsigned IntParameter = 0;
bool ValidLiteral = !Parameters[i].getAsInteger(10, IntParameter);
+ (void)ValidLiteral;
assert(ValidLiteral &&
"Invalid format of SPIR-V builtin parameter literal!");
IntParameters.push_back(IntParameter);
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 8acd4691787e..4013dd22f4ab 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -54,6 +54,7 @@ def Enqueue : BuiltinGroup;
def AsyncCopy : BuiltinGroup;
def VectorLoadStore : BuiltinGroup;
def LoadStore : BuiltinGroup;
+def IntelSubgroups : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -625,7 +626,7 @@ def GroupBuiltins : GenericTable {
"IsBallotFindBit", "IsLogical", "NoGroupOperation", "HasBoolArg"];
}
-// Function to lookup native builtins by their name and set.
+// Function to lookup group builtins by their name and set.
def lookupGroupBuiltin : SearchIndex {
let Table = GroupBuiltins;
let Key = ["Name"];
@@ -871,6 +872,61 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo
defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>;
defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>;
+//===----------------------------------------------------------------------===//
+// Class defining a sub group builtin that should be translated into a
+// SPIR-V instruction using the SPV_INTEL_subgroups extension.
+//
+// name is the demangled name of the given builtin.
+// opcode specifies the SPIR-V operation code of the generated instruction.
+//===----------------------------------------------------------------------===//
+class IntelSubgroupsBuiltin<string name, Op operation> {
+ string Name = name;
+ Op Opcode = operation;
+ bit IsBlock = !or(!eq(operation, OpSubgroupBlockReadINTEL),
+ !eq(operation, OpSubgroupBlockWriteINTEL));
+ bit IsWrite = !eq(operation, OpSubgroupBlockWriteINTEL);
+}
+
+// Table gathering all the Intel sub group builtins.
+def IntelSubgroupsBuiltins : GenericTable {
+ let FilterClass = "IntelSubgroupsBuiltin";
+ let Fields = ["Name", "Opcode", "IsBlock", "IsWrite"];
+}
+
+// Function to lookup Intel sub group builtins by their name and set.
+def lookupIntelSubgroupsBuiltin : SearchIndex {
+ let Table = IntelSubgroupsBuiltins;
+ let Key = ["Name"];
+}
+
+// Multiclass used to define incoming builtin records for the SPV_INTEL_subgroups extension
+// and corresponding work/sub group builtin records.
+multiclass DemangledIntelSubgroupsBuiltin<string name, bits<8> minNumArgs, bits<8> maxNumArgs, Op operation> {
+ def : DemangledBuiltin<!strconcat("intel_sub_group_", name), OpenCL_std, IntelSubgroups, minNumArgs, maxNumArgs>;
+ def : IntelSubgroupsBuiltin<!strconcat("intel_sub_group_", name), operation>;
+}
+
+// cl_intel_subgroups
+defm : DemangledIntelSubgroupsBuiltin<"shuffle", 2, 2, OpSubgroupShuffleINTEL>;
+defm : DemangledIntelSubgroupsBuiltin<"shuffle_down", 3, 3, OpSubgroupShuffleDownINTEL>;
+defm : DemangledIntelSubgroupsBuiltin<"shuffle_up", 3, 3, OpSubgroupShuffleUpINTEL>;
+defm : DemangledIntelSubgroupsBuiltin<"shuffle_xor", 2, 2, OpSubgroupShuffleXorINTEL>;
+foreach i = ["", "2", "4", "8"] in {
+ // cl_intel_subgroups
+ defm : DemangledIntelSubgroupsBuiltin<!strconcat("block_read", i), 1, 2, OpSubgroupBlockReadINTEL>;
+ defm : DemangledIntelSubgroupsBuiltin<!strconcat("block_write", i), 2, 3, OpSubgroupBlockWriteINTEL>;
+ // cl_intel_subgroups_short
+ defm : DemangledIntelSubgroupsBuiltin<!strconcat("block_read_ui", i), 1, 2, OpSubgroupBlockReadINTEL>;
+ defm : DemangledIntelSubgroupsBuiltin<!strconcat("block_write_ui", i), 2, 3, OpSubgroupBlockWriteINTEL>;
+}
+// cl_intel_subgroups_char, cl_intel_subgroups_short, cl_intel_subgroups_long
+foreach i = ["", "2", "4", "8", "16"] in {
+ foreach j = ["c", "s", "l"] in {
+ defm : DemangledIntelSubgroupsBuiltin<!strconcat("block_read_u", j, i), 1, 2, OpSubgroupBlockReadINTEL>;
+ defm : DemangledIntelSubgroupsBuiltin<!strconcat("block_write_u", j, i), 2, 3, OpSubgroupBlockWriteINTEL>;
+ }
+}
+// OpSubgroupImageBlockReadINTEL and OpSubgroupImageBlockWriteINTEL are to be resolved later on (in code)
//===----------------------------------------------------------------------===//
// Class defining a get builtin record used for lowering builtin calls such as
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 97b25147ffb3..8ac498e1556b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -34,6 +34,12 @@ bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, ArrayRef<Register> VRegs,
FunctionLoweringInfo &FLI,
Register SwiftErrorVReg) const {
+ // Maybe run postponed production of types for function pointers
+ if (IndirectCalls.size() > 0) {
+ produceIndirectPtrTypes(MIRBuilder);
+ IndirectCalls.clear();
+ }
+
// Currently all return types should use a single register.
// TODO: handle the case of multiple registers.
if (VRegs.size() > 1)
@@ -292,7 +298,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
}
}
- // Generate a SPIR-V type for the function.
auto MRI = MIRBuilder.getMRI();
Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32));
MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
@@ -301,17 +306,17 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SPIRVType *RetTy = GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
FTy, RetTy, ArgTypeVRegs, MIRBuilder);
-
- // Build the OpTypeFunction declaring it.
uint32_t FuncControl = getFunctionControl(F);
- MIRBuilder.buildInstr(SPIRV::OpFunction)
- .addDef(FuncVReg)
- .addUse(GR->getSPIRVTypeID(RetTy))
- .addImm(FuncControl)
- .addUse(GR->getSPIRVTypeID(FuncTy));
+ // Add OpFunction instruction
+ MachineInstrBuilder MB = MIRBuilder.buildInstr(SPIRV::OpFunction)
+ .addDef(FuncVReg)
+ .addUse(GR->getSPIRVTypeID(RetTy))
+ .addImm(FuncControl)
+ .addUse(GR->getSPIRVTypeID(FuncTy));
+ GR->recordFunctionDefinition(&F, &MB.getInstr()->getOperand(0));
- // Add OpFunctionParameters.
+ // Add OpFunctionParameter instructions
int i = 0;
for (const auto &Arg : F.args()) {
assert(VRegs[i].size() == 1 && "Formal arg has multiple vregs");
@@ -343,9 +348,56 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
{static_cast<uint32_t>(LnkTy)}, F.getGlobalIdentifier());
}
+ // Handle function pointers decoration
+ const auto *ST =
+ static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
+ bool hasFunctionPointers =
+ ST->canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
+ if (hasFunctionPointers) {
+ if (F.hasFnAttribute("referenced-indirectly")) {
+ assert((F.getCallingConv() != CallingConv::SPIR_KERNEL) &&
+ "Unexpected 'referenced-indirectly' attribute of the kernel "
+ "function");
+ buildOpDecorate(FuncVReg, MIRBuilder,
+ SPIRV::Decoration::ReferencedIndirectlyINTEL, {});
+ }
+ }
+
return true;
}
+// Used to postpone producing of indirect function pointer types after all
+// indirect calls info is collected
+// TODO:
+// - add a topological sort of IndirectCalls to ensure the best types knowledge
+// - we may need to fix function formal parameter types if they are opaque
+// pointers used as function pointers in these indirect calls
+void SPIRVCallLowering::produceIndirectPtrTypes(
+ MachineIRBuilder &MIRBuilder) const {
+ // Create indirect call data types if any
+ MachineFunction &MF = MIRBuilder.getMF();
+ for (auto const &IC : IndirectCalls) {
+ SPIRVType *SpirvRetTy = GR->getOrCreateSPIRVType(IC.RetTy, MIRBuilder);
+ SmallVector<SPIRVType *, 4> SpirvArgTypes;
+ for (size_t i = 0; i < IC.ArgTys.size(); ++i) {
+ SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(IC.ArgTys[i], MIRBuilder);
+ SpirvArgTypes.push_back(SPIRVTy);
+ if (!GR->getSPIRVTypeForVReg(IC.ArgRegs[i]))
+ GR->assignSPIRVTypeToVReg(SPIRVTy, IC.ArgRegs[i], MF);
+ }
+ // SPIR-V function type:
+ FunctionType *FTy =
+ FunctionType::get(const_cast<Type *>(IC.RetTy), IC.ArgTys, false);
+ SPIRVType *SpirvFuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
+ FTy, SpirvRetTy, SpirvArgTypes, MIRBuilder);
+ // SPIR-V pointer to function type:
+ SPIRVType *IndirectFuncPtrTy = GR->getOrCreateSPIRVPointerType(
+ SpirvFuncTy, MIRBuilder, SPIRV::StorageClass::Function);
+    // Correct the Callee type
+ GR->assignSPIRVTypeToVReg(IndirectFuncPtrTy, IC.Callee, MF);
+ }
+}
+
bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
// Currently call returns should have single vregs.
@@ -356,45 +408,44 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
GR->setCurrentFunc(MF);
FunctionType *FTy = nullptr;
const Function *CF = nullptr;
+ std::string DemangledName;
+ const Type *OrigRetTy = Info.OrigRet.Ty;
// Emit a regular OpFunctionCall. If it's an externally declared function,
// be sure to emit its type and function declaration here. It will be hoisted
// globally later.
if (Info.Callee.isGlobal()) {
+ std::string FuncName = Info.Callee.getGlobal()->getName().str();
+ DemangledName = getOclOrSpirvBuiltinDemangledName(FuncName);
CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal());
// TODO: support constexpr casts and indirect calls.
if (CF == nullptr)
return false;
- FTy = getOriginalFunctionType(*CF);
+ if ((FTy = getOriginalFunctionType(*CF)) != nullptr)
+ OrigRetTy = FTy->getReturnType();
}
MachineRegisterInfo *MRI = MIRBuilder.getMRI();
Register ResVReg =
Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
- std::string FuncName = Info.Callee.getGlobal()->getName().str();
- std::string DemangledName = getOclOrSpirvBuiltinDemangledName(FuncName);
const auto *ST = static_cast<const SPIRVSubtarget *>(&MF.getSubtarget());
// TODO: check that it's OCL builtin, then apply OpenCL_std.
if (!DemangledName.empty() && CF && CF->isDeclaration() &&
ST->canUseExtInstSet(SPIRV::InstructionSet::OpenCL_std)) {
- const Type *OrigRetTy = Info.OrigRet.Ty;
- if (FTy)
- OrigRetTy = FTy->getReturnType();
SmallVector<Register, 8> ArgVRegs;
for (auto Arg : Info.OrigArgs) {
assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs");
ArgVRegs.push_back(Arg.Regs[0]);
SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder);
if (!GR->getSPIRVTypeForVReg(Arg.Regs[0]))
- GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF());
+ GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MF);
}
if (auto Res = SPIRV::lowerBuiltin(
DemangledName, SPIRV::InstructionSet::OpenCL_std, MIRBuilder,
ResVReg, OrigRetTy, ArgVRegs, GR))
return *Res;
}
- if (CF && CF->isDeclaration() &&
- !GR->find(CF, &MIRBuilder.getMF()).isValid()) {
+ if (CF && CF->isDeclaration() && !GR->find(CF, &MF).isValid()) {
// Emit the type info and forward function declaration to the first MBB
// to ensure VReg definition dependencies are valid across all MBBs.
MachineIRBuilder FirstBlockBuilder;
@@ -416,14 +467,40 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
lowerFormalArguments(FirstBlockBuilder, *CF, VRegArgs, FuncInfo);
}
+ unsigned CallOp;
+ if (Info.CB->isIndirectCall()) {
+ if (!ST->canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers))
+ report_fatal_error("An indirect call is encountered but SPIR-V without "
+ "extensions does not support it",
+ false);
+ // Set instruction operation according to SPV_INTEL_function_pointers
+ CallOp = SPIRV::OpFunctionPointerCallINTEL;
+ // Collect information about the indirect call to support possible
+ // specification of opaque ptr types of parent function's parameters
+ Register CalleeReg = Info.Callee.getReg();
+ if (CalleeReg.isValid()) {
+ SPIRVCallLowering::SPIRVIndirectCall IndirectCall;
+ IndirectCall.Callee = CalleeReg;
+ IndirectCall.RetTy = OrigRetTy;
+ for (const auto &Arg : Info.OrigArgs) {
+ assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs");
+ IndirectCall.ArgTys.push_back(Arg.Ty);
+ IndirectCall.ArgRegs.push_back(Arg.Regs[0]);
+ }
+ IndirectCalls.push_back(IndirectCall);
+ }
+ } else {
+ // Emit a regular OpFunctionCall
+ CallOp = SPIRV::OpFunctionCall;
+ }
+
// Make sure there's a valid return reg, even for functions returning void.
if (!ResVReg.isValid())
ResVReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::IDRegClass);
- SPIRVType *RetType =
- GR->assignTypeToVReg(FTy->getReturnType(), ResVReg, MIRBuilder);
+ SPIRVType *RetType = GR->assignTypeToVReg(OrigRetTy, ResVReg, MIRBuilder);
- // Emit the OpFunctionCall and its args.
- auto MIB = MIRBuilder.buildInstr(SPIRV::OpFunctionCall)
+ // Emit the call instruction and its args.
+ auto MIB = MIRBuilder.buildInstr(CallOp)
.addDef(ResVReg)
.addUse(GR->getSPIRVTypeID(RetType))
.add(Info.Callee);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
index c2d6ad82d507..48b3a5eb2671 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
@@ -26,6 +26,17 @@ private:
// Used to create and assign function, argument, and return type information.
SPIRVGlobalRegistry *GR;
+ // Used to postpone producing of indirect function pointer types
+ // after all indirect calls info is collected
+ struct SPIRVIndirectCall {
+ const Type *RetTy = nullptr;
+ SmallVector<Type *> ArgTys;
+ SmallVector<Register> ArgRegs;
+ Register Callee;
+ };
+ void produceIndirectPtrTypes(MachineIRBuilder &MIRBuilder) const;
+ mutable SmallVector<SPIRVIndirectCall> IndirectCalls;
+
public:
SPIRVCallLowering(const SPIRVTargetLowering &TLI, SPIRVGlobalRegistry *GR);
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
index cbe1a53fd756..d82fb2df4539 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
@@ -54,7 +54,14 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph(
MachineOperand &Op = MI->getOperand(i);
if (!Op.isReg())
continue;
- MachineOperand *RegOp = &MRI.getVRegDef(Op.getReg())->getOperand(0);
+ MachineInstr *VRegDef = MRI.getVRegDef(Op.getReg());
+ // References to a function via function pointers generate virtual
+ // registers without a definition. We are able to resolve this
+      // reference using Global Register info into an OpFunction instruction
+ // but do not expect to find it in Reg2Entry.
+ if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2)
+ continue;
+ MachineOperand *RegOp = &VRegDef->getOperand(0);
assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) ||
Reg2Entry.count(RegOp));
if (Reg2Entry.count(RegOp))
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index f3280928c25d..792a00786f0a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -38,6 +38,12 @@ class SPIRVGlobalRegistry {
DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType;
+ // map a Function to its definition (as a machine instruction operand)
+ DenseMap<const Function *, const MachineOperand *> FunctionToInstr;
+ // map function pointer (as a machine instruction operand) to the used
+ // Function
+ DenseMap<const MachineOperand *, const Function *> InstrToFunction;
+
// Look for an equivalent of the newType in the map. Return the equivalent
// if it's found, otherwise insert newType to the map and return the type.
const MachineInstr *checkSpecialInstr(const SPIRV::SpecialTypeDescriptor &TD,
@@ -101,6 +107,29 @@ public:
DT.buildDepsGraph(Graph, MMI);
}
+ // Map a machine operand that represents a use of a function via function
+ // pointer to a machine operand that represents the function definition.
+  // Return either the defining operand or nullptr, because we have no context
+  // for a good diagnostic message in case of unexpectedly missing references.
+ const MachineOperand *getFunctionDefinitionByUse(const MachineOperand *Use) {
+ auto ResF = InstrToFunction.find(Use);
+ if (ResF == InstrToFunction.end())
+ return nullptr;
+ auto ResReg = FunctionToInstr.find(ResF->second);
+ return ResReg == FunctionToInstr.end() ? nullptr : ResReg->second;
+ }
+ // map function pointer (as a machine instruction operand) to the used
+ // Function
+ void recordFunctionPointer(const MachineOperand *MO, const Function *F) {
+ InstrToFunction[MO] = F;
+ }
+  // map a Function to its definition (as a machine instruction operand)
+ void recordFunctionDefinition(const Function *F, const MachineOperand *MO) {
+ FunctionToInstr[F] = MO;
+ }
+ // Return true if any OpConstantFunctionPointerINTEL were generated
+ bool hasConstFunPtr() { return !InstrToFunction.empty(); }
+
// Get or create a SPIR-V type corresponding the given LLVM IR type,
// and map it to the given VReg by creating an ASSIGN_TYPE instruction.
SPIRVType *assignTypeToVReg(const Type *Type, Register VReg,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index 42317453a237..e3f76419f131 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -40,6 +40,7 @@ bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
case SPIRV::OpSpecConstantComposite:
case SPIRV::OpSpecConstantOp:
case SPIRV::OpUndef:
+ case SPIRV::OpConstantFunctionPointerINTEL:
return true;
default:
return false;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index da033ba32624..904fef1d6c82 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -761,3 +761,31 @@ def OpGroupNonUniformBitwiseXor: OpGroupNUGroup<"BitwiseXor", 361>;
def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>;
def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>;
def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>;
+
+// 3.49.7, Constant-Creation Instructions
+
+// - SPV_INTEL_function_pointers
+def OpConstantFunctionPointerINTEL: Op<5600, (outs ID:$res), (ins TYPE:$ty, ID:$fun), "$res = OpConstantFunctionPointerINTEL $ty $fun">;
+
+// 3.49.9. Function Instructions
+
+// - SPV_INTEL_function_pointers
+def OpFunctionPointerCallINTEL: Op<5601, (outs ID:$res), (ins TYPE:$ty, ID:$funPtr, variable_ops), "$res = OpFunctionPointerCallINTEL $ty $funPtr">;
+
+// 3.49.21. Group and Subgroup Instructions
+def OpSubgroupShuffleINTEL: Op<5571, (outs ID:$res), (ins TYPE:$type, ID:$data, ID:$invocationId),
+ "$res = OpSubgroupShuffleINTEL $type $data $invocationId">;
+def OpSubgroupShuffleDownINTEL: Op<5572, (outs ID:$res), (ins TYPE:$type, ID:$current, ID:$next, ID:$delta),
+ "$res = OpSubgroupShuffleDownINTEL $type $current $next $delta">;
+def OpSubgroupShuffleUpINTEL: Op<5573, (outs ID:$res), (ins TYPE:$type, ID:$previous, ID:$current, ID:$delta),
+ "$res = OpSubgroupShuffleUpINTEL $type $previous $current $delta">;
+def OpSubgroupShuffleXorINTEL: Op<5574, (outs ID:$res), (ins TYPE:$type, ID:$data, ID:$value),
+ "$res = OpSubgroupShuffleXorINTEL $type $data $value">;
+def OpSubgroupBlockReadINTEL: Op<5575, (outs ID:$res), (ins TYPE:$type, ID:$ptr),
+ "$res = OpSubgroupBlockReadINTEL $type $ptr">;
+def OpSubgroupBlockWriteINTEL: Op<5576, (outs), (ins ID:$ptr, ID:$data),
+ "OpSubgroupBlockWriteINTEL $ptr $data">;
+def OpSubgroupImageBlockReadINTEL: Op<5577, (outs ID:$res), (ins TYPE:$type, ID:$image, ID:$coordinate),
+ "$res = OpSubgroupImageBlockReadINTEL $type $image $coordinate">;
+def OpSubgroupImageBlockWriteINTEL: Op<5578, (outs), (ins ID:$image, ID:$coordinate, ID:$data),
+ "OpSubgroupImageBlockWriteINTEL $image $coordinate $data">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 8c1dfc5e626d..52eeb8a523e6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1534,6 +1534,12 @@ bool SPIRVInstructionSelector::selectGlobalValue(
GlobalIdent = GV->getGlobalIdentifier();
}
+ // Behaviour of functions as operands depends on availability of the
+ // corresponding extension (SPV_INTEL_function_pointers):
+ // - If there is an extension to operate with functions as operands:
+ // We create a proper constant operand and evaluate a correct type for a
+ // function pointer.
+ // - Without the required extension:
// We have functions as operands in tests with blocks of instruction e.g. in
// transcoding/global_block.ll. These operands are not used and should be
// substituted by zero constants. Their type is expected to be always
@@ -1545,6 +1551,27 @@ bool SPIRVInstructionSelector::selectGlobalValue(
if (!NewReg.isValid()) {
Register NewReg = ResVReg;
GR.add(ConstVal, GR.CurMF, NewReg);
+ const Function *GVFun =
+ STI.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)
+ ? dyn_cast<Function>(GV)
+ : nullptr;
+ if (GVFun) {
+ // References to a function via function pointers generate virtual
+ // registers without a definition. We will resolve it later, during
+ // module analysis stage.
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+ Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32));
+ MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
+ MachineInstrBuilder MB =
+ BuildMI(BB, I, I.getDebugLoc(),
+ TII.get(SPIRV::OpConstantFunctionPointerINTEL))
+ .addDef(NewReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(FuncVReg);
+ // mapping the function pointer to the used Function
+ GR.recordFunctionPointer(&MB.getInstr()->getOperand(2), GVFun);
+ return MB.constrainAllUses(TII, TRI, RBI);
+ }
return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
.addDef(NewReg)
.addUse(GR.getSPIRVTypeID(ResType))
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 370da046984f..a18aae1761c8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -291,6 +291,32 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
}
}
+// References to a function via function pointers generate virtual
+// registers without a definition. We are able to resolve this
+// reference using Global Register info into an OpFunction instruction
+// and replace dummy operands by the corresponding global register references.
+void SPIRVModuleAnalysis::collectFuncPtrs() {
+ for (auto &MI : MAI.MS[SPIRV::MB_TypeConstVars])
+ if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL)
+ collectFuncPtrs(MI);
+}
+
+void SPIRVModuleAnalysis::collectFuncPtrs(MachineInstr *MI) {
+ const MachineOperand *FunUse = &MI->getOperand(2);
+ if (const MachineOperand *FunDef = GR->getFunctionDefinitionByUse(FunUse)) {
+ const MachineInstr *FunDefMI = FunDef->getParent();
+ assert(FunDefMI->getOpcode() == SPIRV::OpFunction &&
+ "Constant function pointer must refer to function definition");
+ Register FunDefReg = FunDef->getReg();
+ Register GlobalFunDefReg =
+ MAI.getRegisterAlias(FunDefMI->getMF(), FunDefReg);
+ assert(GlobalFunDefReg.isValid() &&
+ "Function definition must refer to a global register");
+ Register FunPtrReg = FunUse->getReg();
+ MAI.setRegisterAlias(MI->getMF(), FunPtrReg, GlobalFunDefReg);
+ }
+}
+
using InstrSignature = SmallVector<size_t>;
using InstrTraces = std::set<InstrSignature>;
@@ -908,6 +934,29 @@ void addInstrRequirements(const MachineInstr &MI,
case SPIRV::OpGroupNonUniformBallotFindMSB:
Reqs.addCapability(SPIRV::Capability::GroupNonUniformBallot);
break;
+ case SPIRV::OpSubgroupShuffleINTEL:
+ case SPIRV::OpSubgroupShuffleDownINTEL:
+ case SPIRV::OpSubgroupShuffleUpINTEL:
+ case SPIRV::OpSubgroupShuffleXorINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_subgroups)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_subgroups);
+ Reqs.addCapability(SPIRV::Capability::SubgroupShuffleINTEL);
+ }
+ break;
+ case SPIRV::OpSubgroupBlockReadINTEL:
+ case SPIRV::OpSubgroupBlockWriteINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_subgroups)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_subgroups);
+ Reqs.addCapability(SPIRV::Capability::SubgroupBufferBlockIOINTEL);
+ }
+ break;
+ case SPIRV::OpSubgroupImageBlockReadINTEL:
+ case SPIRV::OpSubgroupImageBlockWriteINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_subgroups)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_subgroups);
+ Reqs.addCapability(SPIRV::Capability::SubgroupImageBlockIOINTEL);
+ }
+ break;
case SPIRV::OpAssumeTrueKHR:
case SPIRV::OpExpectKHR:
if (ST.canUseExtension(SPIRV::Extension::SPV_KHR_expect_assume)) {
@@ -915,6 +964,18 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR);
}
break;
+ case SPIRV::OpConstantFunctionPointerINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
+ Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL);
+ }
+ break;
+ case SPIRV::OpFunctionPointerCallINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
+ Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL);
+ }
+ break;
default:
break;
}
@@ -1073,6 +1134,10 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
// Number rest of registers from N+1 onwards.
numberRegistersGlobally(M);
+ // Update references to OpFunction instructions to use Global Registers
+ if (GR->hasConstFunPtr())
+ collectFuncPtrs();
+
// Collect OpName, OpEntryPoint, OpDecorate etc, process other instructions.
processOtherInstrs(M);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index d0b8027edd42..b05526b06e7d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -224,6 +224,8 @@ private:
void collectFuncNames(MachineInstr &MI, const Function *F);
void processOtherInstrs(const Module &M);
void numberRegistersGlobally(const Module &M);
+ void collectFuncPtrs();
+ void collectFuncPtrs(MachineInstr *MI);
const SPIRVSubtarget *ST;
SPIRVGlobalRegistry *GR;
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index cf6dfb127cde..effedc2f17d3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -37,6 +37,11 @@ cl::list<SPIRV::Extension::Extension> Extensions(
clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone",
"Adds OptNoneINTEL value for Function Control mask that "
"indicates a request to not optimize the function"),
+ clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups",
+ "Allows work items in a subgroup to share data without the "
+ "use of local memory and work group barriers, and to "
+ "utilize specialized hardware to load and store blocks of "
+ "data from images or buffers."),
clEnumValN(SPIRV::Extension::SPV_KHR_no_integer_wrap_decoration,
"SPV_KHR_no_integer_wrap_decoration",
"Adds decorations to indicate that a given instruction does "
@@ -48,7 +53,10 @@ cl::list<SPIRV::Extension::Extension> Extensions(
clEnumValN(SPIRV::Extension::SPV_KHR_bit_instructions,
"SPV_KHR_bit_instructions",
"This enables bit instructions to be used by SPIR-V modules "
- "without requiring the Shader capability")));
+ "without requiring the Shader capability"),
+ clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers,
+ "SPV_INTEL_function_pointers",
+ "Allows translation of function pointers")));
// Compare version numbers, but allow 0 to mean unspecified.
static bool isAtLeastVer(uint32_t Target, uint32_t VerToCompareTo) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index ac92ee4a0756..5d252275ac70 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -295,6 +295,7 @@ defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100>;
defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>;
defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>;
defm SPV_INTEL_optnone : ExtensionOperand<103>;
+defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -431,9 +432,9 @@ defm InputAttachmentArrayNonUniformIndexingEXT : CapabilityOperand<5310, 0, 0, [
defm UniformTexelBufferArrayNonUniformIndexingEXT : CapabilityOperand<5311, 0, 0, [], [SampledBuffer, ShaderNonUniformEXT]>;
defm StorageTexelBufferArrayNonUniformIndexingEXT : CapabilityOperand<5312, 0, 0, [], [ImageBuffer, ShaderNonUniformEXT]>;
defm RayTracingNV : CapabilityOperand<5340, 0, 0, [], [Shader]>;
-defm SubgroupShuffleINTEL : CapabilityOperand<5568, 0, 0, [], []>;
-defm SubgroupBufferBlockIOINTEL : CapabilityOperand<5569, 0, 0, [], []>;
-defm SubgroupImageBlockIOINTEL : CapabilityOperand<5570, 0, 0, [], []>;
+defm SubgroupShuffleINTEL : CapabilityOperand<5568, 0, 0, [SPV_INTEL_subgroups], []>;
+defm SubgroupBufferBlockIOINTEL : CapabilityOperand<5569, 0, 0, [SPV_INTEL_subgroups], []>;
+defm SubgroupImageBlockIOINTEL : CapabilityOperand<5570, 0, 0, [SPV_INTEL_subgroups], []>;
defm SubgroupImageMediaBlockIOINTEL : CapabilityOperand<5579, 0, 0, [], []>;
defm SubgroupAvcMotionEstimationINTEL : CapabilityOperand<5696, 0, 0, [], []>;
defm SubgroupAvcMotionEstimationIntraINTEL : CapabilityOperand<5697, 0, 0, [], []>;
@@ -452,6 +453,8 @@ defm ArbitraryPrecisionIntegersINTEL : CapabilityOperand<5844, 0, 0, [SPV_INTEL_
defm OptNoneINTEL : CapabilityOperand<6094, 0, 0, [SPV_INTEL_optnone], []>;
defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions], []>;
defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>;
+defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
+defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
@@ -688,6 +691,7 @@ defm HitAttributeNV : StorageClassOperand<5339, [RayTracingNV]>;
defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>;
defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>;
defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>;
+defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Dim enum values and at the same time
@@ -1179,6 +1183,8 @@ defm CountBuffer : DecorationOperand<5634, 0, 0, [], []>;
defm UserSemantic : DecorationOperand<5635, 0, 0, [], []>;
defm RestrictPointerEXT : DecorationOperand<5355, 0, 0, [], [PhysicalStorageBufferAddressesEXT]>;
defm AliasedPointerEXT : DecorationOperand<5356, 0, 0, [], [PhysicalStorageBufferAddressesEXT]>;
+defm ReferencedIndirectlyINTEL : DecorationOperand<5602, 0, 0, [], [IndirectReferencesINTEL]>;
+defm ArgumentAttributeINTEL : DecorationOperand<6409, 0, 0, [], [FunctionPointersINTEL]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define BuiltIn enum values and at the same time
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 7b1033956524..38a59e650f33 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -72,6 +72,20 @@ def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true",
//==== Features added predmoninantly for LEON subtarget support
include "LeonFeatures.td"
+//==== Register allocation tweaks needed by some low-level software
+foreach i = 1 ... 7 in
+ def FeatureReserveG#i : SubtargetFeature<"reserve-g"#i, "ReserveRegister["#i#" + SP::G0]", "true",
+ "Reserve G"#i#", making it unavailable as a GPR">;
+foreach i = 0 ... 5 in
+ def FeatureReserveO#i : SubtargetFeature<"reserve-o"#i, "ReserveRegister["#i#" + SP::O0]", "true",
+ "Reserve O"#i#", making it unavailable as a GPR">;
+foreach i = 0 ... 7 in
+ def FeatureReserveL#i : SubtargetFeature<"reserve-l"#i, "ReserveRegister["#i#" + SP::L0]", "true",
+ "Reserve L"#i#", making it unavailable as a GPR">;
+foreach i = 0 ... 5 in
+ def FeatureReserveI#i : SubtargetFeature<"reserve-i"#i, "ReserveRegister["#i#" + SP::I0]", "true",
+ "Reserve I"#i#", making it unavailable as a GPR">;
+
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 78bdf3ae9a84..bdefb0841a12 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -13,6 +13,7 @@
#include "SparcISelLowering.h"
#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "SparcMachineFunctionInfo.h"
#include "SparcRegisterInfo.h"
#include "SparcTargetMachine.h"
@@ -28,6 +29,7 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
@@ -729,6 +731,30 @@ SDValue SparcTargetLowering::LowerFormalArguments_64(
return Chain;
}
+// Check whether any of the argument registers are reserved
+static bool isAnyArgRegReserved(const SparcRegisterInfo *TRI,
+ const MachineFunction &MF) {
+ // The register window design means that outgoing parameters at O*
+ // will appear in the callee as I*.
+ // Be conservative and check both sides of the register names.
+ bool Outgoing =
+ llvm::any_of(SP::GPROutgoingArgRegClass, [TRI, &MF](MCPhysReg r) {
+ return TRI->isReservedReg(MF, r);
+ });
+ bool Incoming =
+ llvm::any_of(SP::GPRIncomingArgRegClass, [TRI, &MF](MCPhysReg r) {
+ return TRI->isReservedReg(MF, r);
+ });
+ return Outgoing || Incoming;
+}
+
+static void emitReservedArgRegCallError(const MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ F.getContext().diagnose(DiagnosticInfoUnsupported{
+ F, ("SPARC doesn't support"
+ " function calls if any of the argument registers is reserved.")});
+}
+
SDValue
SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -805,6 +831,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
+ MachineFunction &MF = DAG.getMachineFunction();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -1055,6 +1082,10 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
((hasReturnsTwice)
? TRI->getRTCallPreservedMask(CallConv)
: TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv));
+
+ if (isAnyArgRegReserved(TRI, MF))
+ emitReservedArgRegCallError(MF);
+
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -1125,6 +1156,13 @@ Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT,
.Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7)
.Default(0);
+ // If we're directly referencing register names
+ // (e.g in GCC C extension `register int r asm("g1");`),
+ // make sure that said register is in the reserve list.
+ const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (!TRI->isReservedReg(MF, Reg))
+ Reg = 0;
+
if (Reg)
return Reg;
@@ -1189,6 +1227,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -1372,6 +1411,10 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
: TRI->getCallPreservedMask(DAG.getMachineFunction(),
CLI.CallConv));
+
+ if (isAnyArgRegReserved(TRI, MF))
+ emitReservedArgRegCallError(MF);
+
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
index f97bf57627d1..71a27f77d2c6 100644
--- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -12,10 +12,8 @@
#include "SparcRegisterInfo.h"
#include "Sparc.h"
-#include "SparcMachineFunctionInfo.h"
#include "SparcSubtarget.h"
#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -98,9 +96,21 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (unsigned n = 0; n < 31; n++)
Reserved.set(SP::ASR1 + n);
+ for (TargetRegisterClass::iterator i = SP::IntRegsRegClass.begin();
+ i != SP::IntRegsRegClass.end(); ++i) {
+ if (MF.getSubtarget<SparcSubtarget>().isRegisterReserved(*i))
+ markSuperRegs(Reserved, *i);
+ }
+
+ assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
+bool SparcRegisterInfo::isReservedReg(const MachineFunction &MF,
+ MCRegister Reg) const {
+ return getReservedRegs(MF)[Reg];
+}
+
const TargetRegisterClass*
SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const {
diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/llvm/lib/Target/Sparc/SparcRegisterInfo.h
index 5b3c1a7ad07d..58c85f33635f 100644
--- a/llvm/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.h
@@ -30,6 +30,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const;
const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const override;
diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/llvm/lib/Target/Sparc/SparcRegisterInfo.td
index d5ba7464695c..d8319a8d41dd 100644
--- a/llvm/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -370,6 +370,10 @@ def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>;
// Floating point control register classes.
def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
+// GPR argument registers.
+def GPROutgoingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "O%u", 0, 5)>;
+def GPRIncomingArg : RegisterClass<"SP", [i32, i64], 32, (sequence "I%u", 0, 5)>;
+
let isAllocatable = 0 in {
// Ancillary state registers
// FIXME: TICK is special-cased here as it can be accessed
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index 6b09904ca5e8..5b65e34e0f8a 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -50,6 +50,7 @@ SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU,
const StringRef &FS, const TargetMachine &TM,
bool is64Bit)
: SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS),
+ ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()),
TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit),
InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
TLInfo(TM, *this), FrameLowering(*this) {}
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h
index cdb210f67482..fe4aca519530 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -13,12 +13,14 @@
#ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
#define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
+#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "SparcFrameLowering.h"
#include "SparcISelLowering.h"
#include "SparcInstrInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/TargetParser/Triple.h"
#include <string>
@@ -29,6 +31,10 @@ namespace llvm {
class StringRef;
class SparcSubtarget : public SparcGenSubtargetInfo {
+ // ReserveRegister[i] - Register #i is not available as a general purpose
+ // register.
+ BitVector ReserveRegister;
+
Triple TargetTriple;
virtual void anchor();
@@ -82,6 +88,10 @@ public:
return is64Bit() ? 2047 : 0;
}
+ bool isRegisterReserved(MCPhysReg PhysReg) const {
+ return ReserveRegister[PhysReg];
+ }
+
/// Given a actual stack size as determined by FrameInfo, this function
/// returns adjusted framesize which includes space for register window
/// spills and arguments.
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 9c46cee572fc..da7dcbb25a95 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -121,6 +121,13 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
return std::nullopt;
}
+static std::optional<APInt> extractConstantBits(const Constant *C,
+ unsigned NumBits) {
+ if (std::optional<APInt> Bits = extractConstantBits(C))
+ return Bits->zextOrTrunc(NumBits);
+ return std::nullopt;
+}
+
// Attempt to compute the splat width of bits data by normalizing the splat to
// remove undefs.
static std::optional<APInt> getSplatableConstant(const Constant *C,
@@ -217,16 +224,16 @@ static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy,
// Attempt to rebuild a normalized splat vector constant of the requested splat
// width, built up of potentially smaller scalar values.
-static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumElts*/,
- unsigned SplatBitWidth) {
+static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumBits*/,
+ unsigned /*NumElts*/, unsigned SplatBitWidth) {
+ // TODO: Truncate to NumBits once ConvertToBroadcastAVX512 support this.
std::optional<APInt> Splat = getSplatableConstant(C, SplatBitWidth);
if (!Splat)
return nullptr;
// Determine scalar size to use for the constant splat vector, clamping as we
// might have found a splat smaller than the original constant data.
- const Type *OriginalType = C->getType();
- Type *SclTy = OriginalType->getScalarType();
+ Type *SclTy = C->getType()->getScalarType();
unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
NumSclBits = std::min<unsigned>(NumSclBits, SplatBitWidth);
@@ -236,20 +243,19 @@ static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumElts*/,
: 64;
// Extract per-element bits.
- return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
+ return rebuildConstant(C->getContext(), SclTy, *Splat, NumSclBits);
}
-static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/,
+static Constant *rebuildZeroUpperCst(const Constant *C, unsigned NumBits,
+ unsigned /*NumElts*/,
unsigned ScalarBitWidth) {
- Type *Ty = C->getType();
- Type *SclTy = Ty->getScalarType();
- unsigned NumBits = Ty->getPrimitiveSizeInBits();
+ Type *SclTy = C->getType()->getScalarType();
unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
LLVMContext &Ctx = C->getContext();
if (NumBits > ScalarBitWidth) {
// Determine if the upper bits are all zero.
- if (std::optional<APInt> Bits = extractConstantBits(C)) {
+ if (std::optional<APInt> Bits = extractConstantBits(C, NumBits)) {
if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
// If the original constant was made of smaller elements, try to retain
// those types.
@@ -266,16 +272,15 @@ static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/,
return nullptr;
}
-static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts,
+static Constant *rebuildExtCst(const Constant *C, bool IsSExt,
+ unsigned NumBits, unsigned NumElts,
unsigned SrcEltBitWidth) {
- Type *Ty = C->getType();
- unsigned NumBits = Ty->getPrimitiveSizeInBits();
unsigned DstEltBitWidth = NumBits / NumElts;
assert((NumBits % NumElts) == 0 && (NumBits % SrcEltBitWidth) == 0 &&
(DstEltBitWidth % SrcEltBitWidth) == 0 &&
(DstEltBitWidth > SrcEltBitWidth) && "Illegal extension width");
- if (std::optional<APInt> Bits = extractConstantBits(C)) {
+ if (std::optional<APInt> Bits = extractConstantBits(C, NumBits)) {
assert((Bits->getBitWidth() / DstEltBitWidth) == NumElts &&
(Bits->getBitWidth() % DstEltBitWidth) == 0 &&
"Unexpected constant extension");
@@ -290,19 +295,20 @@ static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts,
TruncBits.insertBits(Elt.trunc(SrcEltBitWidth), I * SrcEltBitWidth);
}
+ Type *Ty = C->getType();
return rebuildConstant(Ty->getContext(), Ty->getScalarType(), TruncBits,
SrcEltBitWidth);
}
return nullptr;
}
-static Constant *rebuildSExtCst(const Constant *C, unsigned NumElts,
- unsigned SrcEltBitWidth) {
- return rebuildExtCst(C, true, NumElts, SrcEltBitWidth);
+static Constant *rebuildSExtCst(const Constant *C, unsigned NumBits,
+ unsigned NumElts, unsigned SrcEltBitWidth) {
+ return rebuildExtCst(C, true, NumBits, NumElts, SrcEltBitWidth);
}
-static Constant *rebuildZExtCst(const Constant *C, unsigned NumElts,
- unsigned SrcEltBitWidth) {
- return rebuildExtCst(C, false, NumElts, SrcEltBitWidth);
+static Constant *rebuildZExtCst(const Constant *C, unsigned NumBits,
+ unsigned NumElts, unsigned SrcEltBitWidth) {
+ return rebuildExtCst(C, false, NumBits, NumElts, SrcEltBitWidth);
}
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
@@ -319,30 +325,33 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
struct FixupEntry {
int Op;
int NumCstElts;
- int BitWidth;
- std::function<Constant *(const Constant *, unsigned, unsigned)>
+ int MemBitWidth;
+ std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
RebuildConstant;
};
- auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned OperandNo) {
+ auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
+ unsigned OperandNo) {
#ifdef EXPENSIVE_CHECKS
assert(llvm::is_sorted(Fixups,
[](const FixupEntry &A, const FixupEntry &B) {
- return (A.NumCstElts * A.BitWidth) <
- (B.NumCstElts * B.BitWidth);
+ return (A.NumCstElts * A.MemBitWidth) <
+ (B.NumCstElts * B.MemBitWidth);
}) &&
"Constant fixup table not sorted in ascending constant size");
#endif
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
"Unexpected number of operands!");
if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
+ RegBitWidth =
+ RegBitWidth ? RegBitWidth : C->getType()->getPrimitiveSizeInBits();
for (const FixupEntry &Fixup : Fixups) {
if (Fixup.Op) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
- if (Constant *NewCst =
- Fixup.RebuildConstant(C, Fixup.NumCstElts, Fixup.BitWidth)) {
+ if (Constant *NewCst = Fixup.RebuildConstant(
+ C, RegBitWidth, Fixup.NumCstElts, Fixup.MemBitWidth)) {
unsigned NewCPI =
- CP->getConstantPoolIndex(NewCst, Align(Fixup.BitWidth / 8));
+ CP->getConstantPoolIndex(NewCst, Align(Fixup.MemBitWidth / 8));
MI.setDesc(TII->get(Fixup.Op));
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
return true;
@@ -371,7 +380,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
// TODO: SSE3 MOVDDUP Handling
return FixupConstant({{X86::MOVSSrm, 1, 32, rebuildZeroUpperCst},
{X86::MOVSDrm, 1, 64, rebuildZeroUpperCst}},
- 1);
+ 128, 1);
case X86::VMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPDrm:
@@ -380,7 +389,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst},
{X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst},
{X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}},
- 1);
+ 128, 1);
case X86::VMOVAPDYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
@@ -388,7 +397,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst},
{X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst},
{X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}},
- 1);
+ 256, 1);
case X86::VMOVAPDZ128rm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPDZ128rm:
@@ -397,7 +406,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst},
{X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
{X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
- 1);
+ 128, 1);
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
@@ -406,7 +415,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst},
{X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
{X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
- 1);
+ 256, 1);
case X86::VMOVAPDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVUPDZrm:
@@ -415,7 +424,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst},
{X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst},
{X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}},
- 1);
+ 512, 1);
/* Integer Loads */
case X86::MOVDQArm:
case X86::MOVDQUrm: {
@@ -434,7 +443,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{HasSSE41 ? X86::PMOVZXWDrm : 0, 4, 16, rebuildZExtCst},
{HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst},
{HasSSE41 ? X86::PMOVZXDQrm : 0, 2, 32, rebuildZExtCst}};
- return FixupConstant(Fixups, 1);
+ return FixupConstant(Fixups, 128, 1);
}
case X86::VMOVDQArm:
case X86::VMOVDQUrm: {
@@ -459,7 +468,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXWDrm, 4, 16, rebuildZExtCst},
{X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst},
{X86::VPMOVZXDQrm, 2, 32, rebuildZExtCst}};
- return FixupConstant(Fixups, 1);
+ return FixupConstant(Fixups, 128, 1);
}
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm: {
@@ -484,7 +493,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{HasAVX2 ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst},
{HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst},
{HasAVX2 ? X86::VPMOVZXDQYrm : 0, 4, 32, rebuildZExtCst}};
- return FixupConstant(Fixups, 1);
+ return FixupConstant(Fixups, 256, 1);
}
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA64Z128rm:
@@ -509,7 +518,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXWDZ128rm, 4, 16, rebuildZExtCst},
{X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst},
{X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
- return FixupConstant(Fixups, 1);
+ return FixupConstant(Fixups, 128, 1);
}
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA64Z256rm:
@@ -533,7 +542,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXWDZ256rm, 8, 16, rebuildZExtCst},
{X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst},
{X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
- return FixupConstant(Fixups, 1);
+ return FixupConstant(Fixups, 256, 1);
}
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Zrm:
@@ -558,7 +567,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPMOVZXWDZrm, 16, 16, rebuildZExtCst},
{X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst},
{X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
- return FixupConstant(Fixups, 1);
+ return FixupConstant(Fixups, 512, 1);
}
}
@@ -586,7 +595,9 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst},
{(int)OpBcst64, 64, 64, rebuildSplatCst}};
- return FixupConstant(Fixups, OpNo);
+ // TODO: Add support for RegBitWidth, but currently rebuildSplatCst
+ // doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
+ return FixupConstant(Fixups, 0, OpNo);
}
return false;
};
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5b76c66c2e4..18f9871b2bd0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7135,6 +7135,7 @@ static bool isFoldableUseOfShuffle(SDNode *N) {
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
+ const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
@@ -7145,8 +7146,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
MVT VT = BVOp->getSimpleValueType(0);
unsigned NumElts = VT.getVectorNumElements();
- SDLoc dl(BVOp);
-
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
@@ -7492,14 +7491,13 @@ static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
-static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
+static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
-
- SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
ISD::isBuildVectorAllOnes(Op.getNode()))
return Op;
@@ -7618,7 +7616,7 @@ LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
- SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
@@ -7928,6 +7926,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+ const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
@@ -7938,7 +7937,6 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
return SDValue();
MVT VT = BV->getSimpleValueType(0);
- SDLoc DL(BV);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
@@ -8057,22 +8055,22 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
}
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
- SelectionDAG &DAG, unsigned HOpcode,
- SDValue V0, SDValue V1) {
+ const SDLoc &DL, SelectionDAG &DAG,
+ unsigned HOpcode, SDValue V0, SDValue V1) {
// If either input vector is not the same size as the build vector,
// extract/insert the low bits to the correct size.
// This is free (examples: zmm --> xmm, xmm --> ymm).
MVT VT = BV->getSimpleValueType(0);
unsigned Width = VT.getSizeInBits();
if (V0.getValueSizeInBits() > Width)
- V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
+ V0 = extractSubVector(V0, 0, DAG, DL, Width);
else if (V0.getValueSizeInBits() < Width)
- V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
+ V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
if (V1.getValueSizeInBits() > Width)
- V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
+ V1 = extractSubVector(V1, 0, DAG, DL, Width);
else if (V1.getValueSizeInBits() < Width)
- V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
+ V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
unsigned NumElts = VT.getVectorNumElements();
APInt DemandedElts = APInt::getAllOnes(NumElts);
@@ -8084,17 +8082,17 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
unsigned HalfNumElts = NumElts / 2;
if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
MVT HalfVT = VT.getHalfNumVectorElementsVT();
- V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
- V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
- SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
- return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
+ V0 = extractSubVector(V0, 0, DAG, DL, 128);
+ V1 = extractSubVector(V1, 0, DAG, DL, 128);
+ SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
+ return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
}
- return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
+ return DAG.getNode(HOpcode, DL, VT, V0, V1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
-static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
+static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
@@ -8114,7 +8112,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
unsigned HOpcode;
SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
+ return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
}
// Try harder to match 256-bit ops by using extract/concat.
@@ -8134,22 +8132,21 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
- SDLoc DL(BV);
SDValue InVec0, InVec1;
if (VT == MVT::v8i32 || VT == MVT::v16i16) {
SDValue InVec2, InVec3;
unsigned X86Opcode;
bool CanFold = true;
- if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
- isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
+ if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
- else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
+ else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
InVec1) &&
- isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
+ isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
@@ -8179,15 +8176,16 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
VT == MVT::v16i16) {
unsigned X86Opcode;
- if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
+ InVec1))
X86Opcode = X86ISD::HADD;
- else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
+ else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::HSUB;
- else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
+ else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHADD;
- else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
+ else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHSUB;
else
@@ -8218,10 +8216,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
/// NOTE: Its not in our interest to start make a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
-static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -8296,9 +8293,9 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
-static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
+static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
@@ -8322,7 +8319,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
- SDLoc &DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
@@ -8590,7 +8587,8 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
static SDValue
-LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
@@ -8629,7 +8627,6 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
return SDValue();
}
- SDLoc DL(V);
MVT VT = V.getSimpleValueType();
return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
@@ -8645,14 +8642,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
- return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
+ return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
if (VT.getVectorElementType() == MVT::bf16 &&
(Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
- if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
- return VectorConstant;
+ if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
+ return VectorCst;
unsigned EVTBits = EltVT.getSizeInBits();
APInt UndefMask = APInt::getZero(NumElems);
@@ -8747,13 +8744,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
}
- if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
return AddSub;
- if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
return HorizontalOp;
- if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
return BitOp;
unsigned NumZero = ZeroMask.popcount();
@@ -8901,8 +8898,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
- if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
- return V;
+ if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
+ return V;
// See if we can use a vector load to get all of the elements.
{
@@ -32112,6 +32109,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ // If we have at most 32 active bits, then perform as i32 CTPOP.
+ // TODO: Perform this in generic legalizer?
+ KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
+ unsigned LZ = Known.countMinLeadingZeros();
+ unsigned TZ = Known.countMinTrailingZeros();
+ if ((LZ + TZ) >= 32) {
+ SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
+ DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
+ Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
+ Results.push_back(Op);
+ return;
+ }
// Use a v2i64 if possible.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
@@ -48009,28 +48020,26 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
// given x, y and z are of type \p VT. We can do so, if operands are either
// truncates from VT types, the second operand is a vector of constants or can
// be recursively promoted.
-static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
- unsigned Depth) {
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
+ SelectionDAG &DAG, unsigned Depth) {
// Limit recursion to avoid excessive compile times.
if (Depth >= SelectionDAG::MaxRecursionDepth)
return SDValue();
- if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
- N->getOpcode() != ISD::OR)
+ if (!ISD::isBitwiseLogicOp(N.getOpcode()))
return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
+ if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
return SDValue();
- if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
+ if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
N0 = NN0;
else {
- // The Left side has to be a trunc.
+ // The left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
@@ -48041,22 +48050,22 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
N0 = N0.getOperand(0);
}
- if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+ if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
N1 = NN1;
else {
- // The right side has to be a 'trunc' or a constant vector.
+ // The right side has to be a 'trunc' or a (foldable) constant.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
- if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
- return SDValue();
-
if (RHSTrunc)
N1 = N1.getOperand(0);
+ else if (SDValue Cst =
+ DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
+ N1 = Cst;
else
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+ return SDValue();
}
- return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+ return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -48065,24 +48074,23 @@ static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
-static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- EVT VT = N->getValueType(0);
+ EVT VT = N.getValueType();
assert(VT.isVector() && "Expected vector type");
+ assert((N.getOpcode() == ISD::ANY_EXTEND ||
+ N.getOpcode() == ISD::ZERO_EXTEND ||
+ N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
- SDLoc DL(N);
- assert((N->getOpcode() == ISD::ANY_EXTEND ||
- N->getOpcode() == ISD::ZERO_EXTEND ||
- N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
-
- SDValue Narrow = N->getOperand(0);
+ SDValue Narrow = N.getOperand(0);
EVT NarrowVT = Narrow.getValueType();
// Generate the wide operation.
- SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+ SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
if (!Op)
return SDValue();
- switch (N->getOpcode()) {
+ switch (N.getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
@@ -52553,7 +52561,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
// Attempt to promote any comparison mask ops before moving the
// SIGN_EXTEND_INREG in the way.
- if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+ if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
@@ -52774,7 +52782,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return V;
if (VT.isVector()) {
- if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+ if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
return R;
if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
@@ -52988,7 +52996,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return V;
if (VT.isVector())
- if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
+ if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 0d30a3137772..0f21880f6df9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3423,6 +3423,18 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) {
return Imm;
}
+unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) {
+ if (Info.RegClass == X86::VR128RegClassID ||
+ Info.RegClass == X86::VR128XRegClassID)
+ return 128;
+ if (Info.RegClass == X86::VR256RegClassID ||
+ Info.RegClass == X86::VR256XRegClassID)
+ return 256;
+ if (Info.RegClass == X86::VR512RegClassID)
+ return 512;
+ llvm_unreachable("Unknown register class!");
+}
+
/// Return true if the Reg is X87 register.
static bool isX87Reg(unsigned Reg) {
return (Reg == X86::FPCW || Reg == X86::FPSW ||
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index ee0d2d059df8..996a24d9e8a9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -77,6 +77,9 @@ unsigned getSwappedVPCOMImm(unsigned Imm);
/// Get the VCMP immediate if the opcodes are swapped.
unsigned getSwappedVCMPImm(unsigned Imm);
+/// Get the width of the vector register operand.
+unsigned getVectorRegisterWidth(const MCOperandInfo &Info);
+
/// Check if the instruction is X87 instruction.
bool isX87Instruction(MachineInstr &MI);
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index b336ba3ea344..d3b7d97a83ca 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1388,18 +1388,6 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
return MBBI;
}
-static unsigned getRegisterWidth(const MCOperandInfo &Info) {
- if (Info.RegClass == X86::VR128RegClassID ||
- Info.RegClass == X86::VR128XRegClassID)
- return 128;
- if (Info.RegClass == X86::VR256RegClassID ||
- Info.RegClass == X86::VR256XRegClassID)
- return 256;
- if (Info.RegClass == X86::VR512RegClassID)
- return 512;
- llvm_unreachable("Unknown register class!");
-}
-
static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) {
if (X86II::isKMasked(MI->getDesc().TSFlags)) {
// Skip mask operand.
@@ -1648,7 +1636,7 @@ static void printZeroExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
CS << " = ";
SmallVector<int> Mask;
- unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
assert((Width % DstEltBits) == 0 && (DstEltBits % SrcEltBits) == 0 &&
"Illegal extension ratio");
DecodeZeroExtendMask(SrcEltBits, DstEltBits, Width / DstEltBits, false, Mask);
@@ -1753,7 +1741,7 @@ static void addConstantComments(const MachineInstr *MI,
case X86::VPSHUFBZrmkz: {
unsigned SrcIdx = getSrcIdx(MI, 1);
if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Width, Mask);
if (!Mask.empty())
@@ -1775,7 +1763,7 @@ static void addConstantComments(const MachineInstr *MI,
case X86::VPERMILPSZrmkz: {
unsigned SrcIdx = getSrcIdx(MI, 1);
if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, 32, Width, Mask);
if (!Mask.empty())
@@ -1796,7 +1784,7 @@ static void addConstantComments(const MachineInstr *MI,
case X86::VPERMILPDZrmkz: {
unsigned SrcIdx = getSrcIdx(MI, 1);
if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, 64, Width, Mask);
if (!Mask.empty())
@@ -1824,7 +1812,7 @@ static void addConstantComments(const MachineInstr *MI,
}
if (auto *C = X86::getConstantFromPool(*MI, 3)) {
- unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
if (!Mask.empty())
@@ -1835,7 +1823,7 @@ static void addConstantComments(const MachineInstr *MI,
case X86::VPPERMrrm: {
if (auto *C = X86::getConstantFromPool(*MI, 3)) {
- unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ unsigned Width = X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Width, Mask);
if (!Mask.empty())
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 2c660fad2ec7..3144327670c7 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -257,11 +257,11 @@ defm : X86WriteRes<WriteEMMS, [ICXPort05,ICXPort0156], 10, [9,1], 10>;
defm : ICXWriteResPair<WriteFAdd, [ICXPort01], 4, [1], 1, 5>; // Floating point add/sub.
defm : ICXWriteResPair<WriteFAddX, [ICXPort01], 4, [1], 1, 6>;
defm : ICXWriteResPair<WriteFAddY, [ICXPort01], 4, [1], 1, 7>;
-defm : ICXWriteResPair<WriteFAddZ, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFAddZ, [ICXPort0], 4, [1], 1, 7>;
defm : ICXWriteResPair<WriteFAdd64, [ICXPort01], 4, [1], 1, 5>; // Floating point double add/sub.
defm : ICXWriteResPair<WriteFAdd64X, [ICXPort01], 4, [1], 1, 6>;
defm : ICXWriteResPair<WriteFAdd64Y, [ICXPort01], 4, [1], 1, 7>;
-defm : ICXWriteResPair<WriteFAdd64Z, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFAdd64Z, [ICXPort0], 4, [1], 1, 7>;
defm : ICXWriteResPair<WriteFCmp, [ICXPort01], 4, [1], 1, 5>; // Floating point compare.
defm : ICXWriteResPair<WriteFCmpX, [ICXPort01], 4, [1], 1, 6>;
@@ -278,11 +278,11 @@ defm : ICXWriteResPair<WriteFComX, [ICXPort0], 2>; // Floating point compa
defm : ICXWriteResPair<WriteFMul, [ICXPort01], 4, [1], 1, 5>; // Floating point multiplication.
defm : ICXWriteResPair<WriteFMulX, [ICXPort01], 4, [1], 1, 6>;
defm : ICXWriteResPair<WriteFMulY, [ICXPort01], 4, [1], 1, 7>;
-defm : ICXWriteResPair<WriteFMulZ, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFMulZ, [ICXPort0], 4, [1], 1, 7>;
defm : ICXWriteResPair<WriteFMul64, [ICXPort01], 4, [1], 1, 5>; // Floating point double multiplication.
defm : ICXWriteResPair<WriteFMul64X, [ICXPort01], 4, [1], 1, 6>;
defm : ICXWriteResPair<WriteFMul64Y, [ICXPort01], 4, [1], 1, 7>;
-defm : ICXWriteResPair<WriteFMul64Z, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFMul64Z, [ICXPort0], 4, [1], 1, 7>;
defm : ICXWriteResPair<WriteFDiv, [ICXPort0,ICXFPDivider], 11, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
defm : ICXWriteResPair<WriteFDivX, [ICXPort0,ICXFPDivider], 11, [1,3], 1, 6>; // 10-14 cycles.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index cd40b1d3b093..be1a09483089 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1540,6 +1540,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return ExtractCost + 2; // worst case pshufhw + pshufd
}
}
+ // If the extract subvector is not optimal, treat it as single op shuffle.
+ Kind = TTI::SK_PermuteSingleSrc;
}
// Subvector insertions are cheap if the subvectors are aligned.
diff --git a/llvm/lib/Target/Xtensa/CMakeLists.txt b/llvm/lib/Target/Xtensa/CMakeLists.txt
index 7192f7392072..2064511e75b8 100644
--- a/llvm/lib/Target/Xtensa/CMakeLists.txt
+++ b/llvm/lib/Target/Xtensa/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_TARGET_DEFINITIONS Xtensa.td)
tablegen(LLVM XtensaGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM XtensaGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM XtensaGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM XtensaGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM XtensaGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM XtensaGenMCCodeEmitter.inc -gen-emitter)
@@ -13,6 +14,13 @@ tablegen(LLVM XtensaGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(XtensaCommonTableGen)
add_llvm_target(XtensaCodeGen
+ XtensaAsmPrinter.cpp
+ XtensaFrameLowering.cpp
+ XtensaInstrInfo.cpp
+ XtensaISelDAGToDAG.cpp
+ XtensaISelLowering.cpp
+ XtensaRegisterInfo.cpp
+ XtensaSubtarget.cpp
XtensaTargetMachine.cpp
LINK_COMPONENTS
@@ -20,6 +28,7 @@ add_llvm_target(XtensaCodeGen
CodeGen
Core
MC
+ SelectionDAG
Support
Target
XtensaDesc
diff --git a/llvm/lib/Target/Xtensa/Xtensa.h b/llvm/lib/Target/Xtensa/Xtensa.h
new file mode 100644
index 000000000000..da44e30f367f
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/Xtensa.h
@@ -0,0 +1,28 @@
+//===- Xtensa.h - Top-level interface for Xtensa representation -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Xtensa back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSA_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSA_H
+
+#include "MCTargetDesc/XtensaMCTargetDesc.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+class XtensaTargetMachine;
+class FunctionPass;
+
+FunctionPass *createXtensaISelDag(XtensaTargetMachine &TM,
+ CodeGenOptLevel OptLevel);
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_XTENSA_XTENSA_H
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
new file mode 100644
index 000000000000..87dbf2eb5166
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -0,0 +1,70 @@
+//===- XtensaAsmPrinter.cpp Xtensa LLVM Assembly Printer ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format Xtensa assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XtensaAsmPrinter.h"
+#include "TargetInfo/XtensaTargetInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/TargetRegistry.h"
+
+using namespace llvm;
+
+void XtensaAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ MCInst LoweredMI;
+ lowerToMCInst(MI, LoweredMI);
+ EmitToStreamer(*OutStreamer, LoweredMI);
+}
+
+MCOperand XtensaAsmPrinter::lowerOperand(const MachineOperand &MO,
+ unsigned Offset) const {
+ MachineOperand::MachineOperandType MOTy = MO.getType();
+
+ switch (MOTy) {
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ break;
+ return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::createImm(MO.getImm() + Offset);
+ case MachineOperand::MO_RegisterMask:
+ break;
+ default:
+ report_fatal_error("unknown operand type");
+ }
+
+ return MCOperand();
+}
+
+void XtensaAsmPrinter::lowerToMCInst(const MachineInstr *MI,
+ MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ MCOperand MCOp = lowerOperand(MO);
+
+ if (MCOp.isValid())
+ OutMI.addOperand(MCOp);
+ }
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXtensaAsmPrinter() {
+ RegisterAsmPrinter<XtensaAsmPrinter> A(getTheXtensaTarget());
+}
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
new file mode 100644
index 000000000000..dec2a1ee4954
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
@@ -0,0 +1,46 @@
+//===- XtensaAsmPrinter.h - Xtensa LLVM Assembly Printer --------*- C++-*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Xtensa Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAASMPRINTER_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSAASMPRINTER_H
+
+#include "XtensaTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineBasicBlock;
+class MachineInstr;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY XtensaAsmPrinter : public AsmPrinter {
+ const MCSubtargetInfo *STI;
+
+public:
+ explicit XtensaAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), STI(TM.getMCSubtargetInfo()) {}
+
+ StringRef getPassName() const override { return "Xtensa Assembly Printer"; }
+ void emitInstruction(const MachineInstr *MI) override;
+
+ // Lower MachineInstr MI to MCInst OutMI.
+ void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) const;
+
+ // Return an MCOperand for MO. Return an empty operand if MO is implicit.
+ MCOperand lowerOperand(const MachineOperand &MO, unsigned Offset = 0) const;
+};
+} // end namespace llvm
+
+#endif /* LLVM_LIB_TARGET_XTENSA_XTENSAASMPRINTER_H */
diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
new file mode 100644
index 000000000000..2092a2d947f8
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
@@ -0,0 +1,39 @@
+//===- XtensaFrameLowering.cpp - Xtensa Frame Information -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Xtensa implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XtensaFrameLowering.h"
+#include "XtensaInstrInfo.h"
+#include "XtensaSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+XtensaFrameLowering::XtensaFrameLowering()
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0,
+ Align(4)) {}
+
+bool XtensaFrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ MFI.hasVarSizedObjects();
+}
+
+void XtensaFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+
+void XtensaFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h
new file mode 100644
index 000000000000..19e52310a99d
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h
@@ -0,0 +1,32 @@
+//===- XtensaFrameLowering.h - Define frame lowering for Xtensa --*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------==//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAFRAMELOWERING_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSAFRAMELOWERING_H
+
+#include "llvm/CodeGen/TargetFrameLowering.h"
+
+namespace llvm {
+class XtensaTargetMachine;
+class XtensaSubtarget;
+
+class XtensaFrameLowering : public TargetFrameLowering {
+public:
+ XtensaFrameLowering();
+
+ bool hasFP(const MachineFunction &MF) const override;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &, MachineBasicBlock &) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+};
+
+} // namespace llvm
+
+#endif /* LLVM_LIB_TARGET_XTENSA_XTENSAFRAMELOWERING_H */
diff --git a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp
new file mode 100644
index 000000000000..300737275452
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp
@@ -0,0 +1,79 @@
+//===- XtensaISelDAGToDAG.cpp - A dag to dag inst selector for Xtensa -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Xtensa target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Xtensa.h"
+#include "XtensaTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xtensa-isel"
+
+namespace {
+
+class XtensaDAGToDAGISel : public SelectionDAGISel {
+public:
+ static char ID;
+
+ XtensaDAGToDAGISel(XtensaTargetMachine &TM, CodeGenOptLevel OptLevel)
+ : SelectionDAGISel(ID, TM, OptLevel) {}
+
+ StringRef getPassName() const override {
+ return "Xtensa DAG->DAG Pattern Instruction Selection";
+ }
+
+ void Select(SDNode *Node) override;
+
+ bool selectMemRegAddr(SDValue Addr, SDValue &Base, SDValue &Offset,
+ int Scale) {
+ report_fatal_error("MemReg address is not implemented yet");
+ }
+
+ bool selectMemRegAddrISH1(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ return selectMemRegAddr(Addr, Base, Offset, 1);
+ }
+
+ bool selectMemRegAddrISH2(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ return selectMemRegAddr(Addr, Base, Offset, 2);
+ }
+
+ bool selectMemRegAddrISH4(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ return selectMemRegAddr(Addr, Base, Offset, 4);
+ }
+
+// Include the pieces autogenerated from the target description.
+#include "XtensaGenDAGISel.inc"
+}; // namespace
+} // end anonymous namespace
+
+char XtensaDAGToDAGISel::ID = 0;
+
+FunctionPass *llvm::createXtensaISelDag(XtensaTargetMachine &TM,
+ CodeGenOptLevel OptLevel) {
+ return new XtensaDAGToDAGISel(TM, OptLevel);
+}
+
+void XtensaDAGToDAGISel::Select(SDNode *Node) {
+ SDLoc DL(Node);
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ Node->setNodeId(-1);
+ return;
+ }
+
+ SelectCode(Node);
+}
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
new file mode 100644
index 000000000000..276fab838d17
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -0,0 +1,58 @@
+//===- XtensaISelLowering.cpp - Xtensa DAG Lowering Implementation --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Xtensa uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XtensaISelLowering.h"
+#include "XtensaSubtarget.h"
+#include "XtensaTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xtensa-lower"
+
+XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
+ const XtensaSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &Xtensa::ARRegClass);
+
+ // Set up special registers.
+ setStackPointerRegisterToSaveRestore(Xtensa::SP);
+
+ setSchedulingPreference(Sched::RegPressure);
+
+ setMinFunctionAlignment(Align(4));
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties(STI.getRegisterInfo());
+}
+
+SDValue XtensaTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ report_fatal_error("Unexpected node to lower");
+ }
+}
+
+const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ return nullptr;
+}
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
new file mode 100644
index 000000000000..8b03712efc9b
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
@@ -0,0 +1,41 @@
+//===- XtensaISelLowering.h - Xtensa DAG Lowering Interface -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Xtensa uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAISELLOWERING_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSAISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+class XtensaSubtarget;
+
+class XtensaTargetLowering : public TargetLowering {
+public:
+ explicit XtensaTargetLowering(const TargetMachine &TM,
+ const XtensaSubtarget &STI);
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ const XtensaSubtarget &getSubtarget() const { return Subtarget; }
+
+private:
+ const XtensaSubtarget &Subtarget;
+};
+
+} // end namespace llvm
+
+#endif /* LLVM_LIB_TARGET_XTENSA_XTENSAISELLOWERING_H */
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
new file mode 100644
index 000000000000..41b794d64fdb
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -0,0 +1,27 @@
+//===- XtensaInstrInfo.cpp - Xtensa Instruction Information ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Xtensa implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XtensaInstrInfo.h"
+#include "XtensaTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "XtensaGenInstrInfo.inc"
+
+using namespace llvm;
+
+XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI)
+ : XtensaGenInstrInfo(), RI(STI), STI(STI) {}
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
new file mode 100644
index 000000000000..8c73c9bd7940
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
@@ -0,0 +1,45 @@
+//===-- XtensaInstrInfo.h - Xtensa Instruction Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Xtensa implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAINSTRINFO_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSAINSTRINFO_H
+
+#include "Xtensa.h"
+#include "XtensaRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_INSTRINFO_HEADER
+
+#include "XtensaGenInstrInfo.inc"
+
+namespace llvm {
+
+class XtensaTargetMachine;
+class XtensaSubtarget;
+class XtensaInstrInfo : public XtensaGenInstrInfo {
+ const XtensaRegisterInfo RI;
+ const XtensaSubtarget &STI;
+
+public:
+ XtensaInstrInfo(const XtensaSubtarget &STI);
+
+ // Return the XtensaRegisterInfo, which this class owns.
+ const XtensaRegisterInfo &getRegisterInfo() const { return RI; }
+
+ const XtensaSubtarget &getSubtarget() const { return STI; }
+};
+} // end namespace llvm
+
+#endif /* LLVM_LIB_TARGET_XTENSA_XTENSAINSTRINFO_H */
diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp
new file mode 100644
index 000000000000..f749cc51f96a
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp
@@ -0,0 +1,69 @@
+//===- XtensaRegisterInfo.cpp - Xtensa Register Information ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Xtensa implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XtensaRegisterInfo.h"
+#include "XtensaInstrInfo.h"
+#include "XtensaSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "xtensa-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "XtensaGenRegisterInfo.inc"
+
+using namespace llvm;
+
+XtensaRegisterInfo::XtensaRegisterInfo(const XtensaSubtarget &STI)
+ : XtensaGenRegisterInfo(Xtensa::A0), Subtarget(STI) {}
+
+const uint16_t *
+XtensaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ // Calling convention is not implemented yet
+ return nullptr;
+}
+
+const uint32_t *
+XtensaRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const {
+ // Calling convention is not implemented yet
+ return nullptr;
+}
+
+BitVector XtensaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+
+ Reserved.set(Xtensa::A0);
+ if (TFI->hasFP(MF)) {
+ // Reserve frame pointer.
+ Reserved.set(getFrameRegister(MF));
+ }
+
+ // Reserve stack pointer.
+ Reserved.set(Xtensa::SP);
+ return Reserved;
+}
+
+bool XtensaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ report_fatal_error("Eliminate frame index not supported yet");
+}
+
+Register XtensaRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ return TFI->hasFP(MF) ? Xtensa::A15 : Xtensa::SP;
+}
diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.h b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.h
new file mode 100644
index 000000000000..a4eda87f4c8b
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.h
@@ -0,0 +1,56 @@
+//===-- XtensaRegisterInfo.h - Xtensa Register Information Impl -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Xtensa implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAREGISTERINFO_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSAREGISTERINFO_H
+
+#include "Xtensa.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "XtensaGenRegisterInfo.inc"
+
+namespace llvm {
+class TargetRegisterClass;
+class XtensaInstrInfo;
+class XtensaSubtarget;
+
+struct XtensaRegisterInfo : public XtensaGenRegisterInfo {
+public:
+ const XtensaSubtarget &Subtarget;
+
+ XtensaRegisterInfo(const XtensaSubtarget &STI);
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+ return true;
+ }
+
+ const uint16_t *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ Register getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_XTENSA_XTENSAREGISTERINFO_H
diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.cpp b/llvm/lib/Target/Xtensa/XtensaSubtarget.cpp
new file mode 100644
index 000000000000..10ab92d2a2d3
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.cpp
@@ -0,0 +1,44 @@
+//===- XtensaSubtarget.cpp - Xtensa Subtarget Information -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Xtensa specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XtensaSubtarget.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "xtensa-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "XtensaGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+XtensaSubtarget &
+XtensaSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+ StringRef CPUName = CPU;
+ if (CPUName.empty()) {
+    // Set default CPU name.
+ CPUName = "generic";
+ }
+
+ HasDensity = false;
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, CPUName, FS);
+ return *this;
+}
+
+XtensaSubtarget::XtensaSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM)
+ : XtensaGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS), TargetTriple(TT),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+ TSInfo(), FrameLowering() {}
diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
new file mode 100644
index 000000000000..948dcbc5278e
--- /dev/null
+++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
@@ -0,0 +1,72 @@
+//===-- XtensaSubtarget.h - Define Subtarget for the Xtensa ----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Xtensa specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XTENSA_XTENSASUBTARGET_H
+#define LLVM_LIB_TARGET_XTENSA_XTENSASUBTARGET_H
+
+#include "XtensaFrameLowering.h"
+#include "XtensaISelLowering.h"
+#include "XtensaInstrInfo.h"
+#include "XtensaRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "XtensaGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class XtensaSubtarget : public XtensaGenSubtargetInfo {
+private:
+ const Triple &TargetTriple;
+ XtensaInstrInfo InstrInfo;
+ XtensaTargetLowering TLInfo;
+ SelectionDAGTargetInfo TSInfo;
+ XtensaFrameLowering FrameLowering;
+
+ // Enabled Xtensa Density extension
+ bool HasDensity;
+
+ XtensaSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+public:
+ XtensaSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const Triple &getTargetTriple() const { return TargetTriple; }
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const XtensaInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const XtensaRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const XtensaTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ bool hasDensity() const { return HasDensity; }
+
+ // Automatically generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+};
+} // end namespace llvm
+
+#endif /* LLVM_LIB_TARGET_XTENSA_XTENSASUBTARGET_H */
diff --git a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
index c891ecd9c0c3..49c7faf84df1 100644
--- a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
@@ -64,6 +64,45 @@ XtensaTargetMachine::XtensaTargetMachine(const Target &T, const Triple &TT,
CodeGenOptLevel OL, bool JIT)
: XtensaTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, true) {}
+const XtensaSubtarget *
+XtensaTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ auto CPU = CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ auto FS = FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<XtensaSubtarget>(TargetTriple, CPU, FS, *this);
+ }
+ return I.get();
+}
+
+namespace {
+/// Xtensa Code Generator Pass Configuration Options.
+class XtensaPassConfig : public TargetPassConfig {
+public:
+ XtensaPassConfig(XtensaTargetMachine &TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ XtensaTargetMachine &getXtensaTargetMachine() const {
+ return getTM<XtensaTargetMachine>();
+ }
+
+ bool addInstSelector() override;
+};
+} // end anonymous namespace
+
+bool XtensaPassConfig::addInstSelector() {
+ addPass(createXtensaISelDag(getXtensaTargetMachine(), getOptLevel()));
+ return false;
+}
+
TargetPassConfig *XtensaTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new TargetPassConfig(*this, PM);
+ return new XtensaPassConfig(*this, PM);
}
diff --git a/llvm/lib/Target/Xtensa/XtensaTargetMachine.h b/llvm/lib/Target/Xtensa/XtensaTargetMachine.h
index dd76f45b3bb7..f371f22ed3d0 100644
--- a/llvm/lib/Target/Xtensa/XtensaTargetMachine.h
+++ b/llvm/lib/Target/Xtensa/XtensaTargetMachine.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_XTENSA_XTENSATARGETMACHINE_H
#define LLVM_LIB_TARGET_XTENSA_XTENSATARGETMACHINE_H
+#include "XtensaSubtarget.h"
#include "llvm/Target/TargetMachine.h"
#include <optional>
@@ -36,10 +37,16 @@ public:
std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
bool JIT);
+ const XtensaSubtarget *getSubtargetImpl(const Function &F) const override;
+
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+protected:
+ mutable StringMap<std::unique_ptr<XtensaSubtarget>> SubtargetMap;
};
} // end namespace llvm
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index 67f937ebc33f..fac701946e28 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -94,6 +94,64 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
llvm_unreachable("Unhandled architecture");
}
+unsigned ARM::parseArchMinorVersion(StringRef Arch) {
+ Arch = getCanonicalArchName(Arch);
+ switch (parseArch(Arch)) {
+ case ArchKind::ARMV4:
+ case ArchKind::ARMV4T:
+ case ArchKind::ARMV5T:
+ case ArchKind::ARMV5TE:
+ case ArchKind::IWMMXT:
+ case ArchKind::IWMMXT2:
+ case ArchKind::XSCALE:
+ case ArchKind::ARMV5TEJ:
+ case ArchKind::ARMV6:
+ case ArchKind::ARMV6K:
+ case ArchKind::ARMV6T2:
+ case ArchKind::ARMV6KZ:
+ case ArchKind::ARMV6M:
+ case ArchKind::ARMV7A:
+ case ArchKind::ARMV7VE:
+ case ArchKind::ARMV7R:
+ case ArchKind::ARMV7M:
+ case ArchKind::ARMV7S:
+ case ArchKind::ARMV7EM:
+ case ArchKind::ARMV7K:
+ case ArchKind::ARMV8A:
+ case ArchKind::ARMV8R:
+ case ArchKind::ARMV8MBaseline:
+ case ArchKind::ARMV8MMainline:
+ case ArchKind::ARMV9A:
+ case ArchKind::INVALID:
+ return 0;
+ case ArchKind::ARMV8_1A:
+ case ArchKind::ARMV8_1MMainline:
+ case ArchKind::ARMV9_1A:
+ return 1;
+ case ArchKind::ARMV8_2A:
+ case ArchKind::ARMV9_2A:
+ return 2;
+ case ArchKind::ARMV8_3A:
+ case ArchKind::ARMV9_3A:
+ return 3;
+ case ArchKind::ARMV8_4A:
+ case ArchKind::ARMV9_4A:
+ return 4;
+ case ArchKind::ARMV8_5A:
+ case ArchKind::ARMV9_5A:
+ return 5;
+ case ArchKind::ARMV8_6A:
+ return 6;
+ case ArchKind::ARMV8_7A:
+ return 7;
+ case ArchKind::ARMV8_8A:
+ return 8;
+ case ArchKind::ARMV8_9A:
+ return 9;
+ }
+ llvm_unreachable("Unhandled architecture");
+}
+
static ARM::ProfileKind getProfileKind(ARM::ArchKind AK) {
switch (AK) {
case ARM::ArchKind::ARMV6M:
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index f1197c296553..4466d50458e1 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -321,6 +321,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
return StringSwitch<const char *>(Part)
.Case("0xac3", "ampere1")
.Case("0xac4", "ampere1a")
+ .Case("0xac5", "ampere1b")
.Default("generic");
}
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 20f324604aa5..684d698521e5 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -126,6 +126,11 @@ constexpr GPUInfo AMDGCNGPUs[] = {
{{"gfx1151"}, {"gfx1151"}, GK_GFX1151, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
+
+ {{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
+ {{"gfx10.1-generic"}, {"gfx10.1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
+ {{"gfx10.3-generic"}, {"gfx10.3-generic"}, GK_GFX10_3_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
+ {{"gfx11-generic"}, {"gfx11-generic"}, GK_GFX11_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
// clang-format on
};
@@ -144,6 +149,22 @@ const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) {
} // namespace
+StringRef llvm::AMDGPU::getArchFamilyNameAMDGCN(GPUKind AK) {
+ switch (AK) {
+ case AMDGPU::GK_GFX9_GENERIC:
+ return "gfx9";
+ case AMDGPU::GK_GFX10_1_GENERIC:
+ case AMDGPU::GK_GFX10_3_GENERIC:
+ return "gfx10";
+ case AMDGPU::GK_GFX11_GENERIC:
+ return "gfx11";
+ default: {
+ StringRef ArchName = getArchNameAMDGCN(AK);
+ return ArchName.empty() ? "" : ArchName.drop_back(2);
+ }
+ }
+}
+
StringRef llvm::AMDGPU::getArchNameAMDGCN(GPUKind AK) {
if (const auto *Entry = getArchEntry(AK, AMDGCNGPUs))
return Entry->CanonicalName;
@@ -253,6 +274,24 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
case GK_GFX1151: return {11, 5, 1};
case GK_GFX1200: return {12, 0, 0};
case GK_GFX1201: return {12, 0, 1};
+
+ // Generic targets return the lowest common denominator
+ // within their family. That is, the ISA that is the most
+ // restricted in terms of features.
+ //
+ // gfx9-generic is tricky because there is no lowest
+ // common denominator, so we return gfx900 which has mad-mix
+ // but this family doesn't have it.
+ //
+ // This API should never be used to check for a particular
+ // feature anyway.
+ //
+ // TODO: Split up this API depending on its caller so
+ // generic target handling is more obvious and less risky.
+ case GK_GFX9_GENERIC: return {9, 0, 0};
+ case GK_GFX10_1_GENERIC: return {10, 1, 0};
+ case GK_GFX10_3_GENERIC: return {10, 3, 0};
+ case GK_GFX11_GENERIC: return {11, 0, 3};
default: return {0, 0, 0};
}
// clang-format on
@@ -302,6 +341,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
case GK_GFX1102:
case GK_GFX1101:
case GK_GFX1100:
+ case GK_GFX11_GENERIC:
Features["ci-insts"] = true;
Features["dot5-insts"] = true;
Features["dot7-insts"] = true;
@@ -327,6 +367,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
case GK_GFX1032:
case GK_GFX1031:
case GK_GFX1030:
+ case GK_GFX10_3_GENERIC:
Features["ci-insts"] = true;
Features["dot1-insts"] = true;
Features["dot2-insts"] = true;
@@ -357,6 +398,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
[[fallthrough]];
case GK_GFX1013:
case GK_GFX1010:
+ case GK_GFX10_1_GENERIC:
Features["dl-insts"] = true;
Features["ci-insts"] = true;
Features["16-bit-insts"] = true;
@@ -424,6 +466,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
case GK_GFX904:
case GK_GFX902:
case GK_GFX900:
+ case GK_GFX9_GENERIC:
Features["gfx9-insts"] = true;
[[fallthrough]];
case GK_GFX810:
@@ -510,6 +553,9 @@ static bool isWave32Capable(StringRef GPU, const Triple &T) {
case GK_GFX1011:
case GK_GFX1013:
case GK_GFX1010:
+ case GK_GFX11_GENERIC:
+ case GK_GFX10_3_GENERIC:
+ case GK_GFX10_1_GENERIC:
IsWave32Capable = true;
break;
default:
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 3494ae52bf16..96dbd5ca673b 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -188,7 +188,7 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) {
case spirv:
case spirv32:
- case spirv64: return "spirv";
+ case spirv64: return "spv";
case kalimba: return "kalimba";
case lanai: return "lanai";
diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
index 25b9499d1895..0694d8f28df6 100644
--- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
+++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
@@ -17,6 +17,7 @@
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Support/Endian.h"
#include "llvm/TargetParser/Triple.h"
+#include "llvm/TextAPI/InterfaceFile.h"
#include "llvm/TextAPI/RecordsSlice.h"
#include "llvm/TextAPI/TextAPIError.h"
#include <iomanip>
diff --git a/llvm/lib/TextAPI/InterfaceFile.cpp b/llvm/lib/TextAPI/InterfaceFile.cpp
index d712ed386825..9979df92674c 100644
--- a/llvm/lib/TextAPI/InterfaceFile.cpp
+++ b/llvm/lib/TextAPI/InterfaceFile.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/TextAPI/InterfaceFile.h"
+#include "llvm/TextAPI/RecordsSlice.h"
#include "llvm/TextAPI/TextAPIError.h"
#include <iomanip>
#include <sstream>
@@ -351,6 +352,34 @@ InterfaceFile::extract(Architecture Arch) const {
return std::move(IF);
}
+void InterfaceFile::setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA,
+ const Target &Targ) {
+ if (getFileType() != BA.File)
+ setFileType(BA.File);
+ if (getInstallName().empty())
+ setInstallName(BA.InstallName);
+ if (BA.AppExtensionSafe && !isApplicationExtensionSafe())
+ setApplicationExtensionSafe();
+ if (BA.TwoLevelNamespace && !isTwoLevelNamespace())
+ setTwoLevelNamespace();
+ if (BA.OSLibNotForSharedCache && !isOSLibNotForSharedCache())
+ setOSLibNotForSharedCache();
+ if (getCurrentVersion().empty())
+ setCurrentVersion(BA.CurrentVersion);
+ if (getCompatibilityVersion().empty())
+ setCompatibilityVersion(BA.CompatVersion);
+ if (getSwiftABIVersion() == 0)
+ setSwiftABIVersion(BA.SwiftABI);
+ if (getPath().empty())
+ setPath(BA.Path);
+ if (!BA.ParentUmbrella.empty())
+ addParentUmbrella(Targ, BA.ParentUmbrella);
+ for (const auto &Client : BA.AllowableClients)
+ addAllowableClient(Client, Targ);
+ for (const auto &Lib : BA.RexportedLibraries)
+ addReexportedLibrary(Lib, Targ);
+}
+
static bool isYAMLTextStub(const FileType &Kind) {
return (Kind >= FileType::TBD_V1) && (Kind < FileType::TBD_V5);
}
diff --git a/llvm/lib/TextAPI/RecordsSlice.cpp b/llvm/lib/TextAPI/RecordsSlice.cpp
index f07853e57924..fb961a91cc65 100644
--- a/llvm/lib/TextAPI/RecordsSlice.cpp
+++ b/llvm/lib/TextAPI/RecordsSlice.cpp
@@ -12,6 +12,7 @@
#include "llvm/TextAPI/RecordsSlice.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/TextAPI/InterfaceFile.h"
#include "llvm/TextAPI/Record.h"
#include "llvm/TextAPI/Symbol.h"
#include <utility>
@@ -325,28 +326,7 @@ createInterfaceFile(const Records &Slices, StringRef InstallName) {
continue;
const Target &Targ = S->getTarget();
File->addTarget(Targ);
- if (File->getFileType() == FileType::Invalid)
- File->setFileType(BA.File);
- if (BA.AppExtensionSafe && !File->isApplicationExtensionSafe())
- File->setApplicationExtensionSafe();
- if (BA.TwoLevelNamespace && !File->isTwoLevelNamespace())
- File->setTwoLevelNamespace();
- if (BA.OSLibNotForSharedCache && !File->isOSLibNotForSharedCache())
- File->setOSLibNotForSharedCache();
- if (File->getCurrentVersion().empty())
- File->setCurrentVersion(BA.CurrentVersion);
- if (File->getCompatibilityVersion().empty())
- File->setCompatibilityVersion(BA.CompatVersion);
- if (File->getSwiftABIVersion() == 0)
- File->setSwiftABIVersion(BA.SwiftABI);
- if (File->getPath().empty())
- File->setPath(BA.Path);
- if (!BA.ParentUmbrella.empty())
- File->addParentUmbrella(Targ, BA.ParentUmbrella);
- for (const auto &Client : BA.AllowableClients)
- File->addAllowableClient(Client, Targ);
- for (const auto &Lib : BA.RexportedLibraries)
- File->addReexportedLibrary(Lib, Targ);
+ File->setFromBinaryAttrs(BA, Targ);
}
return File;
diff --git a/llvm/lib/TextAPI/TextAPIContext.h b/llvm/lib/TextAPI/TextAPIContext.h
index 217d1f5400ee..3151db7b6b80 100644
--- a/llvm/lib/TextAPI/TextAPIContext.h
+++ b/llvm/lib/TextAPI/TextAPIContext.h
@@ -13,13 +13,12 @@
#ifndef LLVM_TEXTAPI_MACHO_CONTEXT_H
#define LLVM_TEXTAPI_MACHO_CONTEXT_H
+#include "llvm/TextAPI/FileTypes.h"
#include <string>
namespace llvm {
namespace MachO {
-enum FileType : unsigned;
-
struct TextAPIContext {
std::string ErrorMessage;
std::string Path;
diff --git a/llvm/lib/TextAPI/TextStubCommon.h b/llvm/lib/TextAPI/TextStubCommon.h
index 360910c48d4f..155d38a30ba0 100644
--- a/llvm/lib/TextAPI/TextStubCommon.h
+++ b/llvm/lib/TextAPI/TextStubCommon.h
@@ -13,6 +13,7 @@
#ifndef LLVM_TEXTAPI_TEXT_STUB_COMMON_H
#define LLVM_TEXTAPI_TEXT_STUB_COMMON_H
+#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/TextAPI/Architecture.h"
diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index 5d7ec0fb0309..c3015d895230 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -392,9 +392,34 @@ int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) {
return 1;
}
- return writeImportLibrary(Def->OutputFile, OutputPath, Def->Exports,
- LibMachine,
- /*MinGW=*/false)
+ std::vector<COFFShortExport> NativeExports;
+ std::string OutputFile = Def->OutputFile;
+
+ if (isArm64EC(LibMachine) && Args.hasArg(OPT_nativedeffile)) {
+ std::unique_ptr<MemoryBuffer> NativeMB =
+ openFile(Args.getLastArg(OPT_nativedeffile)->getValue());
+ if (!NativeMB)
+ return 1;
+
+ if (!NativeMB->getBufferSize()) {
+ llvm::errs() << "native definition file empty\n";
+ return 1;
+ }
+
+ Expected<COFFModuleDefinition> NativeDef =
+ parseCOFFModuleDefinition(*NativeMB, COFF::IMAGE_FILE_MACHINE_ARM64);
+
+ if (!NativeDef) {
+ llvm::errs() << "error parsing native definition\n"
+ << errorToErrorCode(NativeDef.takeError()).message();
+ return 1;
+ }
+ NativeExports = std::move(NativeDef->Exports);
+ OutputFile = std::move(NativeDef->OutputFile);
+ }
+
+ return writeImportLibrary(OutputFile, OutputPath, Def->Exports, LibMachine,
+ /*MinGW=*/false, NativeExports)
? 1
: 0;
}
diff --git a/llvm/lib/ToolDrivers/llvm-lib/Options.td b/llvm/lib/ToolDrivers/llvm-lib/Options.td
index 22ac1fb842e4..a3d901d77054 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/Options.td
+++ b/llvm/lib/ToolDrivers/llvm-lib/Options.td
@@ -23,6 +23,7 @@ def libpath: P<"libpath", "Object file search path">;
def lst : F<"list">, HelpText<"List contents of .lib file on stdout">;
def out : P<"out", "Path to file to write output">;
def deffile : P<"def", "def file to use to generate import library">;
+def nativedeffile : P<"defArm64Native", "def file to use to generate native ARM64 symbols in ARM64EC import library">;
def llvmlibthin : F<"llvmlibthin">,
HelpText<"Make .lib point to .obj files instead of copying their contents">;
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 733f290b1bc9..633fcb3314c4 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -470,6 +470,9 @@ class LowerTypeTestsModule {
Function *WeakInitializerFn = nullptr;
+ GlobalVariable *GlobalAnnotation;
+ DenseSet<Value *> FunctionAnnotations;
+
bool shouldExportConstantsAsAbsoluteSymbols();
uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
TypeIdLowering importTypeId(StringRef TypeId);
@@ -531,6 +534,10 @@ class LowerTypeTestsModule {
/// replace each use, which is a direct function call.
void replaceDirectCalls(Value *Old, Value *New);
+ bool isFunctionAnnotation(Value *V) const {
+ return FunctionAnnotations.contains(V);
+ }
+
public:
LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM,
ModuleSummaryIndex *ExportSummary,
@@ -1377,8 +1384,11 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
// (all?) targets. Switch to a runtime initializer.
SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
findGlobalVariableUsersOf(F, GlobalVarUsers);
- for (auto *GV : GlobalVarUsers)
+ for (auto *GV : GlobalVarUsers) {
+ if (GV == GlobalAnnotation)
+ continue;
moveInitializerToModuleConstructor(GV);
+ }
// Can not RAUW F with an expression that uses F. Replace with a temporary
// placeholder first.
@@ -1837,6 +1847,16 @@ LowerTypeTestsModule::LowerTypeTestsModule(
}
OS = TargetTriple.getOS();
ObjectFormat = TargetTriple.getObjectFormat();
+
+ // Function annotation describes or applies to function itself, and
+ // shouldn't be associated with jump table thunk generated for CFI.
+ GlobalAnnotation = M.getGlobalVariable("llvm.global.annotations");
+ if (GlobalAnnotation && GlobalAnnotation->hasInitializer()) {
+ const ConstantArray *CA =
+ cast<ConstantArray>(GlobalAnnotation->getInitializer());
+ for (Value *Op : CA->operands())
+ FunctionAnnotations.insert(Op);
+ }
}
bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) {
@@ -1896,10 +1916,14 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
if (isa<BlockAddress, NoCFIValue>(U.getUser()))
continue;
- // Skip direct calls to externally defined or non-dso_local functions
+ // Skip direct calls to externally defined or non-dso_local functions.
if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical))
continue;
+ // Skip function annotation.
+ if (isFunctionAnnotation(U.getUser()))
+ continue;
+
// Must handle Constants specially, we cannot call replaceUsesOfWith on a
// constant because they are uniqued.
if (auto *C = dyn_cast<Constant>(U.getUser())) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index aa3b9da924aa..4465eb8992fb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2217,47 +2217,47 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I,
}
}
-// Try to simplify X | Y by replacing occurrences of Y in X with 0.
-// Similarly, simplify X & Y by replacing occurrences of Y in X with -1.
+// Try to simplify V by replacing occurrences of Op with RepOp, but only look
+// through bitwise operations. In particular, for X | Y we try to replace Y with
+// 0 inside X and for X & Y we try to replace Y with -1 inside X.
// Return the simplified result of X if successful, and nullptr otherwise.
-static Value *simplifyAndOrWithOpReplaced(Value *X, Value *Y, bool IsAnd,
+// If SimplifyOnly is true, no new instructions will be created.
+static Value *simplifyAndOrWithOpReplaced(Value *V, Value *Op, Value *RepOp,
+ bool SimplifyOnly,
InstCombinerImpl &IC,
unsigned Depth = 0) {
- if (isa<Constant>(X) || X == Y)
+ if (Op == RepOp)
return nullptr;
- Value *RHS;
- if (match(X, m_c_And(m_Specific(Y), m_Value(RHS)))) {
- return IsAnd ? RHS : Constant::getNullValue(X->getType());
- } else if (match(X, m_c_Or(m_Specific(Y), m_Value(RHS)))) {
- return IsAnd ? Constant::getAllOnesValue(X->getType()) : RHS;
- } else if (match(X, m_c_Xor(m_Specific(Y), m_Value(RHS)))) {
- if (IsAnd) {
- if (X->hasOneUse())
- return IC.Builder.CreateNot(RHS);
+ if (V == Op)
+ return RepOp;
- if (Value *NotRHS =
- IC.getFreelyInverted(RHS, RHS->hasOneUse(), &IC.Builder))
- return NotRHS;
- } else
- return RHS;
- }
+ auto *I = dyn_cast<BinaryOperator>(V);
+ if (!I || !I->isBitwiseLogicOp() || Depth >= 3)
+ return nullptr;
- // Replace uses of Y in X recursively.
- Value *Op0, *Op1;
- if (Depth < 2 && match(X, m_BitwiseLogic(m_Value(Op0), m_Value(Op1)))) {
- // TODO: Relax the one-use constraint to clean up existing hard-coded
- // simplifications.
- if (!X->hasOneUse())
- return nullptr;
- Value *NewOp0 = simplifyAndOrWithOpReplaced(Op0, Y, IsAnd, IC, Depth + 1);
- Value *NewOp1 = simplifyAndOrWithOpReplaced(Op1, Y, IsAnd, IC, Depth + 1);
- if (!NewOp0 && !NewOp1)
- return nullptr;
- return IC.Builder.CreateBinOp(cast<BinaryOperator>(X)->getOpcode(),
- NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
- }
- return nullptr;
+ if (!I->hasOneUse())
+ SimplifyOnly = true;
+
+ Value *NewOp0 = simplifyAndOrWithOpReplaced(I->getOperand(0), Op, RepOp,
+ SimplifyOnly, IC, Depth + 1);
+ Value *NewOp1 = simplifyAndOrWithOpReplaced(I->getOperand(1), Op, RepOp,
+ SimplifyOnly, IC, Depth + 1);
+ if (!NewOp0 && !NewOp1)
+ return nullptr;
+
+ if (!NewOp0)
+ NewOp0 = I->getOperand(0);
+ if (!NewOp1)
+ NewOp1 = I->getOperand(1);
+
+ if (Value *Res = simplifyBinOp(I->getOpcode(), NewOp0, NewOp1,
+ IC.getSimplifyQuery().getWithInstruction(I)))
+ return Res;
+
+ if (SimplifyOnly)
+ return nullptr;
+ return IC.Builder.CreateBinOp(I->getOpcode(), NewOp0, NewOp1);
}
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
@@ -2781,9 +2781,13 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Instruction *Res = foldBitwiseLogicWithIntrinsics(I, Builder))
return Res;
- if (Value *V = simplifyAndOrWithOpReplaced(Op0, Op1, /*IsAnd*/ true, *this))
+ if (Value *V =
+ simplifyAndOrWithOpReplaced(Op0, Op1, Constant::getAllOnesValue(Ty),
+ /*SimplifyOnly*/ false, *this))
return BinaryOperator::CreateAnd(V, Op1);
- if (Value *V = simplifyAndOrWithOpReplaced(Op1, Op0, /*IsAnd*/ true, *this))
+ if (Value *V =
+ simplifyAndOrWithOpReplaced(Op1, Op0, Constant::getAllOnesValue(Ty),
+ /*SimplifyOnly*/ false, *this))
return BinaryOperator::CreateAnd(Op0, V);
return nullptr;
@@ -3602,14 +3606,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
return BinaryOperator::CreateOr(Op1, C);
- // ((A & B) ^ C) | B -> C | B
- if (match(Op0, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op1)), m_Value(C))))
- return BinaryOperator::CreateOr(C, Op1);
-
- // B | ((A & B) ^ C) -> B | C
- if (match(Op1, m_c_Xor(m_c_And(m_Value(A), m_Specific(Op0)), m_Value(C))))
- return BinaryOperator::CreateOr(Op0, C);
-
if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this))
return DeMorgan;
@@ -3664,19 +3660,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
}
}
- // A | ~(A | B) -> A | ~B
- // A | ~(A ^ B) -> A | ~B
- if (match(Op1, m_Not(m_Value(A))))
- if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
- if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
- Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
- B->getOpcode() == Instruction::Xor)) {
- Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
- B->getOperand(0);
- Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
-
if (SwappedForXor)
std::swap(Op0, Op1);
@@ -3965,9 +3948,13 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *Res = foldBitwiseLogicWithIntrinsics(I, Builder))
return Res;
- if (Value *V = simplifyAndOrWithOpReplaced(Op0, Op1, /*IsAnd*/ false, *this))
+ if (Value *V =
+ simplifyAndOrWithOpReplaced(Op0, Op1, Constant::getNullValue(Ty),
+ /*SimplifyOnly*/ false, *this))
return BinaryOperator::CreateOr(V, Op1);
- if (Value *V = simplifyAndOrWithOpReplaced(Op1, Op0, /*IsAnd*/ false, *this))
+ if (Value *V =
+ simplifyAndOrWithOpReplaced(Op1, Op0, Constant::getNullValue(Ty),
+ /*SimplifyOnly*/ false, *this))
return BinaryOperator::CreateOr(Op0, V);
return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index ed5d44757fbe..5266808c5aba 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -172,20 +172,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
// If the memcpy has metadata describing the members, see if we can get the
// TBAA tag describing our copy.
- AAMDNodes AACopyMD = MI->getAAMetadata();
-
- if (MDNode *M = AACopyMD.TBAAStruct) {
- AACopyMD.TBAAStruct = nullptr;
- if (M->getNumOperands() == 3 && M->getOperand(0) &&
- mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
- mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
- M->getOperand(1) &&
- mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
- mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
- Size &&
- M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
- AACopyMD.TBAA = cast<MDNode>(M->getOperand(2));
- }
+ AAMDNodes AACopyMD = MI->getAAMetadata().adjustForAccess(Size);
Value *Src = MI->getArgOperand(1);
Value *Dest = MI->getArgOperand(0);
@@ -915,7 +902,8 @@ Instruction *InstCombinerImpl::foldIntrinsicIsFPClass(IntrinsicInst &II) {
const FPClassTest OrderedMask = Mask & ~fcNan;
const FPClassTest OrderedInvertedMask = ~OrderedMask & ~fcNan;
- const bool IsStrict = II.isStrictFP();
+ const bool IsStrict =
+ II.getFunction()->getAttributes().hasFnAttr(Attribute::StrictFP);
Value *FNegSrc;
if (match(Src0, m_FNeg(m_Value(FNegSrc)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7aac13fd474b..280c4d77b6df 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -813,14 +813,30 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}
}
- // Only lower this if the icmp is the only user of the GEP or if we expect
- // the result to fold to a constant!
- if ((GEPsInBounds || CmpInst::isEquality(Cond)) &&
- (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
- (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse())) {
+ if (GEPsInBounds || CmpInst::isEquality(Cond)) {
+ auto EmitGEPOffsetAndRewrite = [&](GEPOperator *GEP) {
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ auto *Inst = dyn_cast<Instruction>(GEP);
+ if (Inst)
+ Builder.SetInsertPoint(Inst);
+
+ Value *Offset = EmitGEPOffset(GEP);
+ // If a non-trivial GEP has other uses, rewrite it to avoid duplicating
+ // the offset arithmetic.
+ if (Inst && !GEP->hasOneUse() && !GEP->hasAllConstantIndices() &&
+ !GEP->getSourceElementType()->isIntegerTy(8)) {
+ replaceInstUsesWith(*Inst,
+ Builder.CreateGEP(Builder.getInt8Ty(),
+ GEP->getPointerOperand(),
+ Offset, "", GEPsInBounds));
+ eraseInstFromFunction(*Inst);
+ }
+ return Offset;
+ };
+
// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
- Value *L = EmitGEPOffset(GEPLHS);
- Value *R = EmitGEPOffset(GEPRHS);
+ Value *L = EmitGEPOffsetAndRewrite(GEPLHS);
+ Value *R = EmitGEPOffsetAndRewrite(GEPRHS);
return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
}
}
@@ -3229,16 +3245,16 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
if (Cmp.isEquality() && match(Op1, m_Zero()))
return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
- // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
- // the FP extend/truncate because that cast does not change the sign-bit.
- // This is true for all standard IEEE-754 types and the X86 80-bit type.
- // The sign-bit is always the most significant bit in those types.
const APInt *C;
bool TrueIfSigned;
- if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
- isSignBitCheck(Pred, *C, TrueIfSigned)) {
- if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
- match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
+ if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse()) {
+ // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
+ // the FP extend/truncate because that cast does not change the sign-bit.
+ // This is true for all standard IEEE-754 types and the X86 80-bit type.
+ // The sign-bit is always the most significant bit in those types.
+ if (isSignBitCheck(Pred, *C, TrueIfSigned) &&
+ (match(BCSrcOp, m_FPExt(m_Value(X))) ||
+ match(BCSrcOp, m_FPTrunc(m_Value(X))))) {
// (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
// (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1
Type *XType = X->getType();
@@ -3257,6 +3273,20 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
ConstantInt::getAllOnesValue(NewType));
}
}
+
+ // icmp eq/ne (bitcast X to int), special fp -> llvm.is.fpclass(X, class)
+ Type *FPType = SrcType->getScalarType();
+ if (!Cmp.getParent()->getParent()->hasFnAttribute(
+ Attribute::NoImplicitFloat) &&
+ Cmp.isEquality() && FPType->isIEEELikeFPTy()) {
+ FPClassTest Mask = APFloat(FPType->getFltSemantics(), *C).classify();
+ if (Mask & (fcInf | fcZero)) {
+ if (Pred == ICmpInst::ICMP_NE)
+ Mask = ~Mask;
+ return replaceInstUsesWith(Cmp,
+ Builder.createIsFPClass(BCSrcOp, Mask));
+ }
+ }
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 97459a8fbd65..7f6618fc5b73 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -566,6 +566,15 @@ public:
APInt &PoisonElts, unsigned Depth = 0,
bool AllowMultipleUsers = false) override;
+ /// Attempts to replace V with a simpler value based on the demanded
+ /// floating-point classes.
+ Value *SimplifyDemandedUseFPClass(Value *V, FPClassTest DemandedMask,
+ KnownFPClass &Known, unsigned Depth,
+ Instruction *CxtI);
+ bool SimplifyDemandedFPClass(Instruction *I, unsigned Op,
+ FPClassTest DemandedMask, KnownFPClass &Known,
+ unsigned Depth = 0);
+
/// Canonicalize the position of binops relative to shufflevector.
Instruction *foldVectorBinop(BinaryOperator &Inst);
Instruction *foldVectorSelect(SelectInst &Sel);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index f9cee9dfcfad..0bd4b6d1a835 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1299,9 +1299,6 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth,
}
// log2(Cond ? X : Y) -> Cond ? log2(X) : log2(Y)
- // FIXME: missed optimization: if one of the hands of select is/contains
- // undef, just directly pick the other one.
- // FIXME: can both hands contain undef?
// FIXME: Require one use?
if (SelectInst *SI = dyn_cast<SelectInst>(Op))
if (Value *LogX = takeLog2(Builder, SI->getOperand(1), Depth,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 3fbe98fae0b6..eafd2889ec50 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1206,6 +1206,12 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
return BinaryOperator::CreateAnd(Mask, X);
}
+ // Transform (-1 >> y) << y to -1 << y
+ if (match(Op0, m_LShr(m_AllOnes(), m_Specific(Op1)))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ return BinaryOperator::CreateShl(AllOnes, Op1);
+ }
+
Constant *C1;
if (match(Op1, m_Constant(C1))) {
Constant *C2;
@@ -1493,6 +1499,12 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
return BinaryOperator::CreateAnd(Mask, X);
}
+ // Transform (-1 << y) >> y to -1 >> y
+ if (match(Op0, m_Shl(m_AllOnes(), m_Specific(Op1)))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ return BinaryOperator::CreateLShr(AllOnes, Op1);
+ }
+
if (Instruction *Overflow = foldLShrOverflowBit(I))
return Overflow;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 79873a9b4cbb..be6ee9d96d26 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1877,3 +1877,139 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
return MadeChange ? I : nullptr;
}
+
+/// For floating-point classes that resolve to a single bit pattern, return that
+/// value.
+static Constant *getFPClassConstant(Type *Ty, FPClassTest Mask) {
+ switch (Mask) {
+ case fcPosZero:
+ return ConstantFP::getZero(Ty);
+ case fcNegZero:
+ return ConstantFP::getZero(Ty, true);
+ case fcPosInf:
+ return ConstantFP::getInfinity(Ty);
+ case fcNegInf:
+ return ConstantFP::getInfinity(Ty, true);
+ case fcNone:
+ return PoisonValue::get(Ty);
+ default:
+ return nullptr;
+ }
+}
+
+Value *InstCombinerImpl::SimplifyDemandedUseFPClass(
+ Value *V, const FPClassTest DemandedMask, KnownFPClass &Known,
+ unsigned Depth, Instruction *CxtI) {
+ assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
+ Type *VTy = V->getType();
+
+ assert(Known == KnownFPClass() && "expected uninitialized state");
+
+ if (DemandedMask == fcNone)
+ return isa<UndefValue>(V) ? nullptr : PoisonValue::get(VTy);
+
+ if (Depth == MaxAnalysisRecursionDepth)
+ return nullptr;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ // Handle constants and arguments
+ Known = computeKnownFPClass(V, fcAllFlags, CxtI, Depth + 1);
+ Value *FoldedToConst =
+ getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+ return FoldedToConst == V ? nullptr : FoldedToConst;
+ }
+
+ if (!I->hasOneUse())
+ return nullptr;
+
+ // TODO: Should account for nofpclass/FastMathFlags on current instruction
+ switch (I->getOpcode()) {
+ case Instruction::FNeg: {
+ if (SimplifyDemandedFPClass(I, 0, llvm::fneg(DemandedMask), Known,
+ Depth + 1))
+ return I;
+ Known.fneg();
+ break;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(I);
+ switch (CI->getIntrinsicID()) {
+ case Intrinsic::fabs:
+ if (SimplifyDemandedFPClass(I, 0, llvm::inverse_fabs(DemandedMask), Known,
+ Depth + 1))
+ return I;
+ Known.fabs();
+ break;
+ case Intrinsic::arithmetic_fence:
+ if (SimplifyDemandedFPClass(I, 0, DemandedMask, Known, Depth + 1))
+ return I;
+ break;
+ case Intrinsic::copysign: {
+ // Flip on more potentially demanded classes.
+ const FPClassTest DemandedMaskAnySign = llvm::unknown_sign(DemandedMask);
+ if (SimplifyDemandedFPClass(I, 0, DemandedMaskAnySign, Known, Depth + 1))
+ return I;
+
+ if ((DemandedMask & fcPositive) == fcNone) {
+ // Roundabout way of replacing with fneg(fabs)
+ I->setOperand(1, ConstantFP::get(VTy, -1.0));
+ return I;
+ }
+
+ if ((DemandedMask & fcNegative) == fcNone) {
+ // Roundabout way of replacing with fabs
+ I->setOperand(1, ConstantFP::getZero(VTy));
+ return I;
+ }
+
+ KnownFPClass KnownSign =
+ computeKnownFPClass(I->getOperand(1), fcAllFlags, CxtI, Depth + 1);
+ Known.copysign(KnownSign);
+ break;
+ }
+ default:
+ Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+ break;
+ }
+
+ break;
+ }
+ case Instruction::Select: {
+ KnownFPClass KnownLHS, KnownRHS;
+ if (SimplifyDemandedFPClass(I, 2, DemandedMask, KnownRHS, Depth + 1) ||
+ SimplifyDemandedFPClass(I, 1, DemandedMask, KnownLHS, Depth + 1))
+ return I;
+
+ if (KnownLHS.isKnownNever(DemandedMask))
+ return I->getOperand(2);
+ if (KnownRHS.isKnownNever(DemandedMask))
+ return I->getOperand(1);
+
+ // TODO: Recognize clamping patterns
+ Known = KnownLHS | KnownRHS;
+ break;
+ }
+ default:
+ Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+ break;
+ }
+
+ return getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+}
+
+bool InstCombinerImpl::SimplifyDemandedFPClass(Instruction *I, unsigned OpNo,
+ FPClassTest DemandedMask,
+ KnownFPClass &Known,
+ unsigned Depth) {
+ Use &U = I->getOperandUse(OpNo);
+ Value *NewVal =
+ SimplifyDemandedUseFPClass(U.get(), DemandedMask, Known, Depth, I);
+ if (!NewVal)
+ return false;
+ if (Instruction *OpInst = dyn_cast<Instruction>(U))
+ salvageDebugInfo(*OpInst);
+
+ replaceUse(U, NewVal);
+ return true;
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9e8bcbc8e156..7450f39c1e76 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3105,8 +3105,22 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI, Value *Op) {
}
Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
- // Nothing for now.
- return nullptr;
+ Value *RetVal = RI.getReturnValue();
+ if (!RetVal || !AttributeFuncs::isNoFPClassCompatibleType(RetVal->getType()))
+ return nullptr;
+
+ Function *F = RI.getFunction();
+ FPClassTest ReturnClass = F->getAttributes().getRetNoFPClass();
+ if (ReturnClass == fcNone)
+ return nullptr;
+
+ KnownFPClass KnownClass;
+ Value *Simplified =
+ SimplifyDemandedUseFPClass(RetVal, ~ReturnClass, KnownClass, 0, &RI);
+ if (!Simplified)
+ return nullptr;
+
+ return ReturnInst::Create(RI.getContext(), Simplified);
}
// WARNING: keep in sync with SimplifyCFGOpt::simplifyUnreachable()!
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 424f1d433606..ee9aa73ff034 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -13,6 +13,7 @@ add_llvm_component_library(LLVMInstrumentation
InstrOrderFile.cpp
InstrProfiling.cpp
KCFI.cpp
+ PGOForceFunctionAttrs.cpp
PGOInstrumentation.cpp
PGOMemOPSizeOpt.cpp
PoisonChecking.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index c2a46320b14a..393afc915205 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1371,6 +1371,14 @@ static bool isLifetimeIntrinsic(Value *V) {
return II && II->isLifetimeStartOrEnd();
}
+static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
+ return dyn_cast<DbgAssignIntrinsic>(DVI);
+}
+
+static DPValue *DynCastToDbgAssign(DPValue *DPV) {
+ return DPV->isDbgAssign() ? DPV : nullptr;
+}
+
bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
Value *StackTag, Value *UARTag,
const DominatorTree &DT,
@@ -1437,6 +1445,11 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
if (DPtr->getVariableLocationOp(LocNo) == AI)
DPtr->setExpression(DIExpression::appendOpsToArg(
DPtr->getExpression(), NewOps, LocNo));
+ if (auto *DAI = DynCastToDbgAssign(DPtr)) {
+ if (DAI->getAddress() == AI)
+ DAI->setAddressExpression(DIExpression::prependOpcodes(
+ DAI->getAddressExpression(), NewOps));
+ }
};
llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp b/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp
new file mode 100644
index 000000000000..c0ebdd7ed886
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+static bool shouldRunOnFunction(Function &F, ProfileSummaryInfo &PSI,
+ FunctionAnalysisManager &FAM) {
+ if (F.isDeclaration())
+ return false;
+ // Respect existing attributes.
+ if (F.hasOptNone() || F.hasOptSize() || F.hasMinSize())
+ return false;
+ if (F.hasFnAttribute(Attribute::Cold))
+ return true;
+ if (!PSI.hasProfileSummary())
+ return false;
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ return PSI.isFunctionColdInCallGraph(&F, BFI);
+}
+
+PreservedAnalyses PGOForceFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (ColdType == PGOOptions::ColdFuncOpt::Default)
+ return PreservedAnalyses::all();
+ ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ bool MadeChange = false;
+ for (Function &F : M) {
+ if (!shouldRunOnFunction(F, PSI, FAM))
+ continue;
+ MadeChange = true;
+ switch (ColdType) {
+ case PGOOptions::ColdFuncOpt::Default:
+ llvm_unreachable("bailed out for default above");
+ break;
+ case PGOOptions::ColdFuncOpt::OptSize:
+ F.addFnAttr(Attribute::OptimizeForSize);
+ break;
+ case PGOOptions::ColdFuncOpt::MinSize:
+ F.addFnAttr(Attribute::MinSize);
+ break;
+ case PGOOptions::ColdFuncOpt::OptNone:
+ F.addFnAttr(Attribute::OptimizeNone);
+ F.addFnAttr(Attribute::NoInline);
+ break;
+ }
+ }
+ return MadeChange ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 2dd27037a17d..ba09ebf8b04c 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_component_library(LLVMScalarOpts
InferAlignment.cpp
InstSimplifyPass.cpp
JumpThreading.cpp
+ JumpTableToSwitch.cpp
LICM.cpp
LoopAccessAnalysisPrinter.cpp
LoopBoundSplit.cpp
@@ -40,7 +41,6 @@ add_llvm_component_library(LLVMScalarOpts
LoopLoadElimination.cpp
LoopPassManager.cpp
LoopPredication.cpp
- LoopRerollPass.cpp
LoopRotation.cpp
LoopSimplifyCFG.cpp
LoopStrengthReduce.cpp
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 24e1677bf643..9235850de92f 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -371,7 +371,6 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
{ // Scope for SwitchInstProfUpdateWrapper. It must not live during
// ConstantFoldTerminator() as the underlying SwitchInst can be changed.
SwitchInstProfUpdateWrapper SI(*I);
- unsigned ReachableCaseCount = 0;
for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
@@ -408,33 +407,6 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
// Increment the case iterator since we didn't delete it.
++CI;
- ++ReachableCaseCount;
- }
-
- BasicBlock *DefaultDest = SI->getDefaultDest();
- if (ReachableCaseCount > 1 &&
- !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())) {
- ConstantRange CR = LVI->getConstantRangeAtUse(I->getOperandUse(0),
- /*UndefAllowed*/ false);
- // The default dest is unreachable if all cases are covered.
- if (!CR.isSizeLargerThan(ReachableCaseCount)) {
- BasicBlock *NewUnreachableBB =
- BasicBlock::Create(BB->getContext(), "default.unreachable",
- BB->getParent(), DefaultDest);
- new UnreachableInst(BB->getContext(), NewUnreachableBB);
-
- DefaultDest->removePredecessor(BB);
- SI->setDefaultDest(NewUnreachableBB);
-
- if (SuccessorsCount[DefaultDest] == 1)
- DTU.applyUpdatesPermissive(
- {{DominatorTree::Delete, BB, DefaultDest}});
- DTU.applyUpdatesPermissive(
- {{DominatorTree::Insert, BB, NewUnreachableBB}});
-
- ++NumDeadCases;
- Changed = true;
- }
}
}
@@ -1255,12 +1227,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!Changed) {
PA = PreservedAnalyses::all();
} else {
-#if defined(EXPENSIVE_CHECKS)
- assert(DT->verify(DominatorTree::VerificationLevel::Full));
-#else
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-#endif // EXPENSIVE_CHECKS
-
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LazyValueAnalysis>();
}
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
new file mode 100644
index 000000000000..f9712dbbf68a
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -0,0 +1,190 @@
+//===- JumpTableToSwitch.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+ JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
+ cl::desc("Only split jump tables with size less or "
+ "equal than JumpTableSizeThreshold."),
+ cl::init(10));
+
+// TODO: Consider adding a cost model for profitability analysis of this
+// transformation. Currently we replace a jump table with a switch if all the
+// functions in the jump table are smaller than the provided threshold.
+static cl::opt<unsigned> FunctionSizeThreshold(
+ "jump-table-to-switch-function-size-threshold", cl::Hidden,
+ cl::desc("Only split jump tables containing functions whose sizes are less "
+ "or equal than this threshold."),
+ cl::init(50));
+
+#define DEBUG_TYPE "jump-table-to-switch"
+
+namespace {
+struct JumpTableTy {
+ Value *Index;
+ SmallVector<Function *, 10> Funcs;
+};
+} // anonymous namespace
+
+static std::optional<JumpTableTy> parseJumpTable(GetElementPtrInst *GEP,
+ PointerType *PtrTy) {
+ Constant *Ptr = dyn_cast<Constant>(GEP->getPointerOperand());
+ if (!Ptr)
+ return std::nullopt;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr);
+ if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+ return std::nullopt;
+
+ Function &F = *GEP->getParent()->getParent();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const unsigned BitWidth =
+ DL.getIndexSizeInBits(GEP->getPointerAddressSpace());
+ MapVector<Value *, APInt> VariableOffsets;
+ APInt ConstantOffset(BitWidth, 0);
+ if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset))
+ return std::nullopt;
+ if (VariableOffsets.size() != 1)
+ return std::nullopt;
+ // TODO: consider supporting more general patterns
+ if (!ConstantOffset.isZero())
+ return std::nullopt;
+ APInt StrideBytes = VariableOffsets.front().second;
+ const uint64_t JumpTableSizeBytes = DL.getTypeAllocSize(GV->getValueType());
+ if (JumpTableSizeBytes % StrideBytes.getZExtValue() != 0)
+ return std::nullopt;
+ const uint64_t N = JumpTableSizeBytes / StrideBytes.getZExtValue();
+ if (N > JumpTableSizeThreshold)
+ return std::nullopt;
+
+ JumpTableTy JumpTable;
+ JumpTable.Index = VariableOffsets.front().first;
+ JumpTable.Funcs.reserve(N);
+ for (uint64_t Index = 0; Index < N; ++Index) {
+ // ConstantOffset is zero.
+ APInt Offset = Index * StrideBytes;
+ Constant *C =
+ ConstantFoldLoadFromConst(GV->getInitializer(), PtrTy, Offset, DL);
+ auto *Func = dyn_cast_or_null<Function>(C);
+ if (!Func || Func->isDeclaration() ||
+ Func->getInstructionCount() > FunctionSizeThreshold)
+ return std::nullopt;
+ JumpTable.Funcs.push_back(Func);
+ }
+ return JumpTable;
+}
+
+static BasicBlock *expandToSwitch(CallBase *CB, const JumpTableTy &JT,
+ DomTreeUpdater &DTU,
+ OptimizationRemarkEmitter &ORE) {
+ const bool IsVoid = CB->getType() == Type::getVoidTy(CB->getContext());
+
+ SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
+ BasicBlock *BB = CB->getParent();
+ BasicBlock *Tail = SplitBlock(BB, CB, &DTU, nullptr, nullptr,
+ BB->getName() + Twine(".tail"));
+ DTUpdates.push_back({DominatorTree::Delete, BB, Tail});
+ BB->getTerminator()->eraseFromParent();
+
+ Function &F = *BB->getParent();
+ BasicBlock *BBUnreachable = BasicBlock::Create(
+ F.getContext(), "default.switch.case.unreachable", &F, Tail);
+ IRBuilder<> BuilderUnreachable(BBUnreachable);
+ BuilderUnreachable.CreateUnreachable();
+
+ IRBuilder<> Builder(BB);
+ SwitchInst *Switch = Builder.CreateSwitch(JT.Index, BBUnreachable);
+ DTUpdates.push_back({DominatorTree::Insert, BB, BBUnreachable});
+
+ IRBuilder<> BuilderTail(CB);
+ PHINode *PHI =
+ IsVoid ? nullptr : BuilderTail.CreatePHI(CB->getType(), JT.Funcs.size());
+
+ for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) {
+ BasicBlock *B = BasicBlock::Create(Func->getContext(),
+ "call." + Twine(Index), &F, Tail);
+ DTUpdates.push_back({DominatorTree::Insert, BB, B});
+ DTUpdates.push_back({DominatorTree::Insert, B, Tail});
+
+ CallBase *Call = cast<CallBase>(CB->clone());
+ Call->setCalledFunction(Func);
+ Call->insertInto(B, B->end());
+ Switch->addCase(
+ cast<ConstantInt>(ConstantInt::get(JT.Index->getType(), Index)), B);
+ BranchInst::Create(Tail, B);
+ if (PHI)
+ PHI->addIncoming(Call, B);
+ }
+ DTU.applyUpdates(DTUpdates);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "ReplacedJumpTableWithSwitch", CB)
+ << "expanded indirect call into switch";
+ });
+ if (PHI)
+ CB->replaceAllUsesWith(PHI);
+ CB->eraseFromParent();
+ return Tail;
+}
+
+PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ OptimizationRemarkEmitter &ORE =
+ AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ DominatorTree *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+ DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
+ bool Changed = false;
+ for (BasicBlock &BB : make_early_inc_range(F)) {
+ BasicBlock *CurrentBB = &BB;
+ while (CurrentBB) {
+ BasicBlock *SplittedOutTail = nullptr;
+ for (Instruction &I : make_early_inc_range(*CurrentBB)) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!Call || Call->getCalledFunction() || Call->isMustTailCall())
+ continue;
+ auto *L = dyn_cast<LoadInst>(Call->getCalledOperand());
+ // Skip atomic or volatile loads.
+ if (!L || !L->isSimple())
+ continue;
+ auto *GEP = dyn_cast<GetElementPtrInst>(L->getPointerOperand());
+ if (!GEP)
+ continue;
+ auto *PtrTy = dyn_cast<PointerType>(L->getType());
+ assert(PtrTy && "call operand must be a pointer");
+ std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy);
+ if (!JumpTable)
+ continue;
+ SplittedOutTail = expandToSwitch(Call, *JumpTable, DTU, ORE);
+ Changed = true;
+ break;
+ }
+ CurrentBB = SplittedOutTail ? SplittedOutTail : nullptr;
+ }
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ if (DT)
+ PA.preserve<DominatorTreeAnalysis>();
+ if (PDT)
+ PA.preserve<PostDominatorTreeAnalysis>();
+ return PA;
+}
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index f3e40a5cb809..9ec9c31e48e0 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -110,6 +110,9 @@ STATISTIC(NumAddSubHoisted, "Number of add/subtract expressions reassociated "
"and hoisted out of the loop");
STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
"reassociated and hoisted out of the loop");
+STATISTIC(NumIntAssociationsHoisted,
+ "Number of invariant int expressions "
+ "reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -135,6 +138,12 @@ static cl::opt<unsigned> FPAssociationUpperLimit(
"Set upper limit for the number of transformations performed "
"during a single round of hoisting the reassociated expressions."));
+cl::opt<unsigned> IntAssociationUpperLimit(
+ "licm-max-num-int-reassociations", cl::init(5U), cl::Hidden,
+ cl::desc(
+ "Set upper limit for the number of transformations performed "
+ "during a single round of hoisting the reassociated expressions."));
+
// Experimental option to allow imprecision in LICM in pathological cases, in
// exchange for faster compile. This is to be removed if MemorySSA starts to
// address the same issue. LICM calls MemorySSAWalker's
@@ -2661,21 +2670,29 @@ static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo,
return false;
}
+static bool isReassociableOp(Instruction *I, unsigned IntOpcode,
+ unsigned FPOpcode) {
+ if (I->getOpcode() == IntOpcode)
+ return true;
+ if (I->getOpcode() == FPOpcode && I->hasAllowReassoc() &&
+ I->hasNoSignedZeros())
+ return true;
+ return false;
+}
+
/// Try to reassociate expressions like ((A1 * B1) + (A2 * B2) + ...) * C where
/// A1, A2, ... and C are loop invariants into expressions like
/// ((A1 * C * B1) + (A2 * C * B2) + ...) and hoist the (A1 * C), (A2 * C), ...
/// invariant expressions. This functions returns true only if any hoisting has
/// actually occured.
-static bool hoistFPAssociation(Instruction &I, Loop &L,
- ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater &MSSAU, AssumptionCache *AC,
- DominatorTree *DT) {
- using namespace PatternMatch;
- Value *VariantOp = nullptr, *InvariantOp = nullptr;
-
- if (!match(&I, m_FMul(m_Value(VariantOp), m_Value(InvariantOp))) ||
- !I.hasAllowReassoc() || !I.hasNoSignedZeros())
+static bool hoistMulAddAssociation(Instruction &I, Loop &L,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!isReassociableOp(&I, Instruction::Mul, Instruction::FMul))
return false;
+ Value *VariantOp = I.getOperand(0);
+ Value *InvariantOp = I.getOperand(1);
if (L.isLoopInvariant(VariantOp))
std::swap(VariantOp, InvariantOp);
if (L.isLoopInvariant(VariantOp) || !L.isLoopInvariant(InvariantOp))
@@ -2689,15 +2706,17 @@ static bool hoistFPAssociation(Instruction &I, Loop &L,
Worklist.push_back(VariantBinOp);
while (!Worklist.empty()) {
BinaryOperator *BO = Worklist.pop_back_val();
- if (!BO->hasOneUse() || !BO->hasAllowReassoc() || !BO->hasNoSignedZeros())
+ if (!BO->hasOneUse())
return false;
- BinaryOperator *Op0, *Op1;
- if (match(BO, m_FAdd(m_BinOp(Op0), m_BinOp(Op1)))) {
- Worklist.push_back(Op0);
- Worklist.push_back(Op1);
+ if (isReassociableOp(BO, Instruction::Add, Instruction::FAdd) &&
+ isa<BinaryOperator>(BO->getOperand(0)) &&
+ isa<BinaryOperator>(BO->getOperand(1))) {
+ Worklist.push_back(cast<BinaryOperator>(BO->getOperand(0)));
+ Worklist.push_back(cast<BinaryOperator>(BO->getOperand(1)));
continue;
}
- if (BO->getOpcode() != Instruction::FMul || L.isLoopInvariant(BO))
+ if (!isReassociableOp(BO, Instruction::Mul, Instruction::FMul) ||
+ L.isLoopInvariant(BO))
return false;
Use &U0 = BO->getOperandUse(0);
Use &U1 = BO->getOperandUse(1);
@@ -2707,7 +2726,10 @@ static bool hoistFPAssociation(Instruction &I, Loop &L,
Changes.push_back(&U1);
else
return false;
- if (Changes.size() > FPAssociationUpperLimit)
+ unsigned Limit = I.getType()->isIntOrIntVectorTy()
+ ? IntAssociationUpperLimit
+ : FPAssociationUpperLimit;
+ if (Changes.size() > Limit)
return false;
}
if (Changes.empty())
@@ -2720,7 +2742,12 @@ static bool hoistFPAssociation(Instruction &I, Loop &L,
for (auto *U : Changes) {
assert(L.isLoopInvariant(U->get()));
Instruction *Ins = cast<Instruction>(U->getUser());
- U->set(Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul"));
+ Value *Mul;
+ if (I.getType()->isIntOrIntVectorTy())
+ Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul");
+ else
+ Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul");
+ U->set(Mul);
}
I.replaceAllUsesWith(VariantOp);
eraseInstruction(I, SafetyInfo, MSSAU);
@@ -2754,9 +2781,13 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
- if (hoistFPAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+ bool IsInt = I.getType()->isIntOrIntVectorTy();
+ if (hoistMulAddAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
++NumHoisted;
- ++NumFPAssociationsHoisted;
+ if (IsInt)
+ ++NumIntAssociationsHoisted;
+ else
+ ++NumFPAssociationsHoisted;
return true;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
deleted file mode 100644
index 7f62526a4f6d..000000000000
--- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ /dev/null
@@ -1,1679 +0,0 @@
-//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements a simple loop reroller.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar/LoopReroll.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-reroll"
-
-STATISTIC(NumRerolledLoops, "Number of rerolled loops");
-
-static cl::opt<unsigned>
-NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
- cl::Hidden,
- cl::desc("The maximum number of failures to tolerate"
- " during fuzzy matching. (default: 400)"));
-
-// This loop re-rolling transformation aims to transform loops like this:
-//
-// int foo(int a);
-// void bar(int *x) {
-// for (int i = 0; i < 500; i += 3) {
-// foo(i);
-// foo(i+1);
-// foo(i+2);
-// }
-// }
-//
-// into a loop like this:
-//
-// void bar(int *x) {
-// for (int i = 0; i < 500; ++i)
-// foo(i);
-// }
-//
-// It does this by looking for loops that, besides the latch code, are composed
-// of isomorphic DAGs of instructions, with each DAG rooted at some increment
-// to the induction variable, and where each DAG is isomorphic to the DAG
-// rooted at the induction variable (excepting the sub-DAGs which root the
-// other induction-variable increments). In other words, we're looking for loop
-// bodies of the form:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// where each f(i) is a set of instructions that, collectively, are a function
-// only of i (and other loop-invariant values).
-//
-// As a special case, we can also reroll loops like this:
-//
-// int foo(int);
-// void bar(int *x) {
-// for (int i = 0; i < 500; ++i) {
-// x[3*i] = foo(0);
-// x[3*i+1] = foo(0);
-// x[3*i+2] = foo(0);
-// }
-// }
-//
-// into this:
-//
-// void bar(int *x) {
-// for (int i = 0; i < 1500; ++i)
-// x[i] = foo(0);
-// }
-//
-// in which case, we're looking for inputs like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// %scaled.iv = mul %iv, scale
-// f(%scaled.iv)
-// %scaled.iv.1 = add %scaled.iv, 1
-// f(%scaled.iv.1)
-// %scaled.iv.2 = add %scaled.iv, 2
-// f(%scaled.iv.2)
-// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
-// f(%scaled.iv.scale_m_1)
-// ...
-// %iv.next = add %iv, 1
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-
-namespace {
-
- enum IterationLimits {
- /// The maximum number of iterations that we'll try and reroll.
- IL_MaxRerollIterations = 32,
- /// The bitvector index used by loop induction variables and other
- /// instructions that belong to all iterations.
- IL_All,
- IL_End
- };
-
- class LoopReroll {
- public:
- LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE,
- TargetLibraryInfo *TLI, DominatorTree *DT, bool PreserveLCSSA)
- : AA(AA), LI(LI), SE(SE), TLI(TLI), DT(DT),
- PreserveLCSSA(PreserveLCSSA) {}
- bool runOnLoop(Loop *L);
-
- protected:
- AliasAnalysis *AA;
- LoopInfo *LI;
- ScalarEvolution *SE;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
- bool PreserveLCSSA;
-
- using SmallInstructionVector = SmallVector<Instruction *, 16>;
- using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
- using TinyInstructionVector = SmallVector<Instruction *, 1>;
-
- // Map between induction variable and its increment
- DenseMap<Instruction *, int64_t> IVToIncMap;
-
- // For loop with multiple induction variables, remember the ones used only to
- // control the loop.
- TinyInstructionVector LoopControlIVs;
-
- // A chain of isomorphic instructions, identified by a single-use PHI
- // representing a reduction. Only the last value may be used outside the
- // loop.
- struct SimpleLoopReduction {
- SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
- assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
- add(L);
- }
-
- bool valid() const {
- return Valid;
- }
-
- Instruction *getPHI() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.front();
- }
-
- Instruction *getReducedValue() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.back();
- }
-
- Instruction *get(size_t i) const {
- assert(Valid && "Using invalid reduction");
- return Instructions[i+1];
- }
-
- Instruction *operator [] (size_t i) const { return get(i); }
-
- // The size, ignoring the initial PHI.
- size_t size() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.size()-1;
- }
-
- using iterator = SmallInstructionVector::iterator;
- using const_iterator = SmallInstructionVector::const_iterator;
-
- iterator begin() {
- assert(Valid && "Using invalid reduction");
- return std::next(Instructions.begin());
- }
-
- const_iterator begin() const {
- assert(Valid && "Using invalid reduction");
- return std::next(Instructions.begin());
- }
-
- iterator end() { return Instructions.end(); }
- const_iterator end() const { return Instructions.end(); }
-
- protected:
- bool Valid = false;
- SmallInstructionVector Instructions;
-
- void add(Loop *L);
- };
-
- // The set of all reductions, and state tracking of possible reductions
- // during loop instruction processing.
- struct ReductionTracker {
- using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
-
- // Add a new possible reduction.
- void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
-
- // Setup to track possible reductions corresponding to the provided
- // rerolling scale. Only reductions with a number of non-PHI instructions
- // that is divisible by the scale are considered. Three instructions sets
- // are filled in:
- // - A set of all possible instructions in eligible reductions.
- // - A set of all PHIs in eligible reductions
- // - A set of all reduced values (last instructions) in eligible
- // reductions.
- void restrictToScale(uint64_t Scale,
- SmallInstructionSet &PossibleRedSet,
- SmallInstructionSet &PossibleRedPHISet,
- SmallInstructionSet &PossibleRedLastSet) {
- PossibleRedIdx.clear();
- PossibleRedIter.clear();
- Reds.clear();
-
- for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
- if (PossibleReds[i].size() % Scale == 0) {
- PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
- PossibleRedPHISet.insert(PossibleReds[i].getPHI());
-
- PossibleRedSet.insert(PossibleReds[i].getPHI());
- PossibleRedIdx[PossibleReds[i].getPHI()] = i;
- for (Instruction *J : PossibleReds[i]) {
- PossibleRedSet.insert(J);
- PossibleRedIdx[J] = i;
- }
- }
- }
-
- // The functions below are used while processing the loop instructions.
-
- // Are the two instructions both from reductions, and furthermore, from
- // the same reduction?
- bool isPairInSame(Instruction *J1, Instruction *J2) {
- DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
- if (J1I != PossibleRedIdx.end()) {
- DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
- if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
- return true;
- }
-
- return false;
- }
-
- // The two provided instructions, the first from the base iteration, and
- // the second from iteration i, form a matched pair. If these are part of
- // a reduction, record that fact.
- void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
- if (PossibleRedIdx.count(J1)) {
- assert(PossibleRedIdx.count(J2) &&
- "Recording reduction vs. non-reduction instruction?");
-
- PossibleRedIter[J1] = 0;
- PossibleRedIter[J2] = i;
-
- int Idx = PossibleRedIdx[J1];
- assert(Idx == PossibleRedIdx[J2] &&
- "Recording pair from different reductions?");
- Reds.insert(Idx);
- }
- }
-
- // The functions below can be called after we've finished processing all
- // instructions in the loop, and we know which reductions were selected.
-
- bool validateSelected();
- void replaceSelected();
-
- protected:
- // The vector of all possible reductions (for any scale).
- SmallReductionVector PossibleReds;
-
- DenseMap<Instruction *, int> PossibleRedIdx;
- DenseMap<Instruction *, int> PossibleRedIter;
- DenseSet<int> Reds;
- };
-
- // A DAGRootSet models an induction variable being used in a rerollable
- // loop. For example,
- //
- // x[i*3+0] = y1
- // x[i*3+1] = y2
- // x[i*3+2] = y3
- //
- // Base instruction -> i*3
- // +---+----+
- // / | \
- // ST[y1] +1 +2 <-- Roots
- // | |
- // ST[y2] ST[y3]
- //
- // There may be multiple DAGRoots, for example:
- //
- // x[i*2+0] = ... (1)
- // x[i*2+1] = ... (1)
- // x[i*2+4] = ... (2)
- // x[i*2+5] = ... (2)
- // x[(i+1234)*2+5678] = ... (3)
- // x[(i+1234)*2+5679] = ... (3)
- //
- // The loop will be rerolled by adding a new loop induction variable,
- // one for the Base instruction in each DAGRootSet.
- //
- struct DAGRootSet {
- Instruction *BaseInst;
- SmallInstructionVector Roots;
-
- // The instructions between IV and BaseInst (but not including BaseInst).
- SmallInstructionSet SubsumedInsts;
- };
-
- // The set of all DAG roots, and state tracking of all roots
- // for a particular induction variable.
- struct DAGRootTracker {
- DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
- ScalarEvolution *SE, AliasAnalysis *AA,
- TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
- bool PreserveLCSSA,
- DenseMap<Instruction *, int64_t> &IncrMap,
- TinyInstructionVector LoopCtrlIVs)
- : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
- PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
- LoopControlIVs(LoopCtrlIVs) {}
-
- /// Stage 1: Find all the DAG roots for the induction variable.
- bool findRoots();
-
- /// Stage 2: Validate if the found roots are valid.
- bool validate(ReductionTracker &Reductions);
-
- /// Stage 3: Assuming validate() returned true, perform the
- /// replacement.
- /// @param BackedgeTakenCount The backedge-taken count of L.
- void replace(const SCEV *BackedgeTakenCount);
-
- protected:
- using UsesTy = MapVector<Instruction *, BitVector>;
-
- void findRootsRecursive(Instruction *IVU,
- SmallInstructionSet SubsumedInsts);
- bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
- bool collectPossibleRoots(Instruction *Base,
- std::map<int64_t,Instruction*> &Roots);
- bool validateRootSet(DAGRootSet &DRS);
-
- bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
- void collectInLoopUserSet(const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
- void collectInLoopUserSet(Instruction *Root,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
-
- UsesTy::iterator nextInstr(int Val, UsesTy &In,
- const SmallInstructionSet &Exclude,
- UsesTy::iterator *StartI=nullptr);
- bool isBaseInst(Instruction *I);
- bool isRootInst(Instruction *I);
- bool instrDependsOn(Instruction *I,
- UsesTy::iterator Start,
- UsesTy::iterator End);
- void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
-
- LoopReroll *Parent;
-
- // Members of Parent, replicated here for brevity.
- Loop *L;
- ScalarEvolution *SE;
- AliasAnalysis *AA;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
- LoopInfo *LI;
- bool PreserveLCSSA;
-
- // The loop induction variable.
- Instruction *IV;
-
- // Loop step amount.
- int64_t Inc;
-
- // Loop reroll count; if Inc == 1, this records the scaling applied
- // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
- // If Inc is not 1, Scale = Inc.
- uint64_t Scale;
-
- // The roots themselves.
- SmallVector<DAGRootSet,16> RootSets;
-
- // All increment instructions for IV.
- SmallInstructionVector LoopIncs;
-
- // Map of all instructions in the loop (in order) to the iterations
- // they are used in (or specially, IL_All for instructions
- // used in the loop increment mechanism).
- UsesTy Uses;
-
- // Map between induction variable and its increment
- DenseMap<Instruction *, int64_t> &IVToIncMap;
-
- TinyInstructionVector LoopControlIVs;
- };
-
- // Check if it is a compare-like instruction whose user is a branch
- bool isCompareUsedByBranch(Instruction *I) {
- auto *TI = I->getParent()->getTerminator();
- if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
- return false;
- return I->hasOneUse() && TI->getOperand(0) == I;
- };
-
- bool isLoopControlIV(Loop *L, Instruction *IV);
- void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
- void collectPossibleReductions(Loop *L,
- ReductionTracker &Reductions);
- bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
- };
-
-} // end anonymous namespace
-
-// Returns true if the provided instruction is used outside the given loop.
-// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
-// non-loop blocks to be outside the loop.
-static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
- for (User *U : I->users()) {
- if (!L->contains(cast<Instruction>(U)))
- return true;
- }
- return false;
-}
-
-// Check if an IV is only used to control the loop. There are two cases:
-// 1. It only has one use which is loop increment, and the increment is only
-// used by comparison and the PHI (could have sext with nsw in between), and the
-// comparison is only used by branch.
-// 2. It is used by loop increment and the comparison, the loop increment is
-// only used by the PHI, and the comparison is used only by the branch.
-bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
- unsigned IVUses = IV->getNumUses();
- if (IVUses != 2 && IVUses != 1)
- return false;
-
- for (auto *User : IV->users()) {
- int32_t IncOrCmpUses = User->getNumUses();
- bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
-
- // User can only have one or two uses.
- if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
- return false;
-
- // Case 1
- if (IVUses == 1) {
- // The only user must be the loop increment.
- // The loop increment must have two uses.
- if (IsCompInst || IncOrCmpUses != 2)
- return false;
- }
-
- // Case 2
- if (IVUses == 2 && IncOrCmpUses != 1)
- return false;
-
- // The users of the IV must be a binary operation or a comparison
- if (auto *BO = dyn_cast<BinaryOperator>(User)) {
- if (BO->getOpcode() == Instruction::Add) {
- // Loop Increment
- // User of Loop Increment should be either PHI or CMP
- for (auto *UU : User->users()) {
- if (PHINode *PN = dyn_cast<PHINode>(UU)) {
- if (PN != IV)
- return false;
- }
- // Must be a CMP or an ext (of a value with nsw) then CMP
- else {
- auto *UUser = cast<Instruction>(UU);
- // Skip SExt if we are extending an nsw value
- // TODO: Allow ZExt too
- if (BO->hasNoSignedWrap() && UUser->hasOneUse() &&
- isa<SExtInst>(UUser))
- UUser = cast<Instruction>(*(UUser->user_begin()));
- if (!isCompareUsedByBranch(UUser))
- return false;
- }
- }
- } else
- return false;
- // Compare : can only have one use, and must be branch
- } else if (!IsCompInst)
- return false;
- }
- return true;
-}
-
-// Collect the list of loop induction variables with respect to which it might
-// be possible to reroll the loop.
-void LoopReroll::collectPossibleIVs(Loop *L,
- SmallInstructionVector &PossibleIVs) {
- for (Instruction &IV : L->getHeader()->phis()) {
- if (!IV.getType()->isIntegerTy() && !IV.getType()->isPointerTy())
- continue;
-
- if (const SCEVAddRecExpr *PHISCEV =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&IV))) {
- if (PHISCEV->getLoop() != L)
- continue;
- if (!PHISCEV->isAffine())
- continue;
- const auto *IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
- if (IncSCEV) {
- IVToIncMap[&IV] = IncSCEV->getValue()->getSExtValue();
- LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << IV << " = " << *PHISCEV
- << "\n");
-
- if (isLoopControlIV(L, &IV)) {
- LoopControlIVs.push_back(&IV);
- LLVM_DEBUG(dbgs() << "LRR: Loop control only IV: " << IV
- << " = " << *PHISCEV << "\n");
- } else
- PossibleIVs.push_back(&IV);
- }
- }
- }
-}
-
-// Add the remainder of the reduction-variable chain to the instruction vector
-// (the initial PHINode has already been added). If successful, the object is
-// marked as valid.
-void LoopReroll::SimpleLoopReduction::add(Loop *L) {
- assert(!Valid && "Cannot add to an already-valid chain");
-
- // The reduction variable must be a chain of single-use instructions
- // (including the PHI), except for the last value (which is used by the PHI
- // and also outside the loop).
- Instruction *C = Instructions.front();
- if (C->user_empty())
- return;
-
- do {
- C = cast<Instruction>(*C->user_begin());
- if (C->hasOneUse()) {
- if (!C->isBinaryOp())
- return;
-
- if (!(isa<PHINode>(Instructions.back()) ||
- C->isSameOperationAs(Instructions.back())))
- return;
-
- Instructions.push_back(C);
- }
- } while (C->hasOneUse());
-
- if (Instructions.size() < 2 ||
- !C->isSameOperationAs(Instructions.back()) ||
- C->use_empty())
- return;
-
- // C is now the (potential) last instruction in the reduction chain.
- for (User *U : C->users()) {
- // The only in-loop user can be the initial PHI.
- if (L->contains(cast<Instruction>(U)))
- if (cast<Instruction>(U) != Instructions.front())
- return;
- }
-
- Instructions.push_back(C);
- Valid = true;
-}
-
-// Collect the vector of possible reduction variables.
-void LoopReroll::collectPossibleReductions(Loop *L,
- ReductionTracker &Reductions) {
- BasicBlock *Header = L->getHeader();
- for (BasicBlock::iterator I = Header->begin(),
- IE = Header->getFirstInsertionPt(); I != IE; ++I) {
- if (!isa<PHINode>(I))
- continue;
- if (!I->getType()->isSingleValueType())
- continue;
-
- SimpleLoopReduction SLR(&*I, L);
- if (!SLR.valid())
- continue;
-
- LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
- << SLR.size() << " chained instructions)\n");
- Reductions.addSLR(SLR);
- }
-}
-
-// Collect the set of all users of the provided root instruction. This set of
-// users contains not only the direct users of the root instruction, but also
-// all users of those users, and so on. There are two exceptions:
-//
-// 1. Instructions in the set of excluded instructions are never added to the
-// use set (even if they are users). This is used, for example, to exclude
-// including root increments in the use set of the primary IV.
-//
-// 2. Instructions in the set of final instructions are added to the use set
-// if they are users, but their users are not added. This is used, for
-// example, to prevent a reduction update from forcing all later reduction
-// updates into the use set.
-void LoopReroll::DAGRootTracker::collectInLoopUserSet(
- Instruction *Root, const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users) {
- SmallInstructionVector Queue(1, Root);
- while (!Queue.empty()) {
- Instruction *I = Queue.pop_back_val();
- if (!Users.insert(I).second)
- continue;
-
- if (!Final.count(I))
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- if (PHINode *PN = dyn_cast<PHINode>(User)) {
- // Ignore "wrap-around" uses to PHIs of this loop's header.
- if (PN->getIncomingBlock(U) == L->getHeader())
- continue;
- }
-
- if (L->contains(User) && !Exclude.count(User)) {
- Queue.push_back(User);
- }
- }
-
- // We also want to collect single-user "feeder" values.
- for (Use &U : I->operands()) {
- if (Instruction *Op = dyn_cast<Instruction>(U))
- if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
- !Final.count(Op))
- Queue.push_back(Op);
- }
- }
-}
-
-// Collect all of the users of all of the provided root instructions (combined
-// into a single set).
-void LoopReroll::DAGRootTracker::collectInLoopUserSet(
- const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users) {
- for (Instruction *Root : Roots)
- collectInLoopUserSet(Root, Exclude, Final, Users);
-}
-
-static bool isUnorderedLoadStore(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isUnordered();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return !MI->isVolatile();
- return false;
-}
-
-/// Return true if IVU is a "simple" arithmetic operation.
-/// This is used for narrowing the search space for DAGRoots; only arithmetic
-/// and GEPs can be part of a DAGRoot.
-static bool isSimpleArithmeticOp(User *IVU) {
- if (Instruction *I = dyn_cast<Instruction>(IVU)) {
- switch (I->getOpcode()) {
- default: return false;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::Shl:
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::GetElementPtr:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- return true;
- }
- }
- return false;
-}
-
-static bool isLoopIncrement(User *U, Instruction *IV) {
- BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
-
- if ((BO && BO->getOpcode() != Instruction::Add) ||
- (!BO && !isa<GetElementPtrInst>(U)))
- return false;
-
- for (auto *UU : U->users()) {
- PHINode *PN = dyn_cast<PHINode>(UU);
- if (PN && PN == IV)
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::
-collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
- SmallInstructionVector BaseUsers;
-
- for (auto *I : Base->users()) {
- ConstantInt *CI = nullptr;
-
- if (isLoopIncrement(I, IV)) {
- LoopIncs.push_back(cast<Instruction>(I));
- continue;
- }
-
- // The root nodes must be either GEPs, ORs or ADDs.
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- if (BO->getOpcode() == Instruction::Add ||
- BO->getOpcode() == Instruction::Or)
- CI = dyn_cast<ConstantInt>(BO->getOperand(1));
- } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
- CI = dyn_cast<ConstantInt>(LastOperand);
- }
-
- if (!CI) {
- if (Instruction *II = dyn_cast<Instruction>(I)) {
- BaseUsers.push_back(II);
- continue;
- } else {
- LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
- << "\n");
- return false;
- }
- }
-
- int64_t V = std::abs(CI->getValue().getSExtValue());
- if (Roots.find(V) != Roots.end())
- // No duplicates, please.
- return false;
-
- Roots[V] = cast<Instruction>(I);
- }
-
- // Make sure we have at least two roots.
- if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
- return false;
-
- // If we found non-loop-inc, non-root users of Base, assume they are
- // for the zeroth root index. This is because "add %a, 0" gets optimized
- // away.
- if (BaseUsers.size()) {
- if (Roots.find(0) != Roots.end()) {
- LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
- return false;
- }
- Roots[0] = Base;
- }
-
- // Calculate the number of users of the base, or lowest indexed, iteration.
- unsigned NumBaseUses = BaseUsers.size();
- if (NumBaseUses == 0)
- NumBaseUses = Roots.begin()->second->getNumUses();
-
- // Check that every node has the same number of users.
- for (auto &KV : Roots) {
- if (KV.first == 0)
- continue;
- if (!KV.second->hasNUses(NumBaseUses)) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
- << "#Base=" << NumBaseUses
- << ", #Root=" << KV.second->getNumUses() << "\n");
- return false;
- }
- }
-
- return true;
-}
-
-void LoopReroll::DAGRootTracker::
-findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
- // Does the user look like it could be part of a root set?
- // All its users must be simple arithmetic ops.
- if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
- return;
-
- if (I != IV && findRootsBase(I, SubsumedInsts))
- return;
-
- SubsumedInsts.insert(I);
-
- for (User *V : I->users()) {
- Instruction *I = cast<Instruction>(V);
- if (is_contained(LoopIncs, I))
- continue;
-
- if (!isSimpleArithmeticOp(I))
- continue;
-
- // The recursive call makes a copy of SubsumedInsts.
- findRootsRecursive(I, SubsumedInsts);
- }
-}
-
-bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
- if (DRS.Roots.empty())
- return false;
-
- // If the value of the base instruction is used outside the loop, we cannot
-  // reroll the loop. Checking for other root instructions is unnecessary
-  // because they don't match any base instructions if their values are used
- if (hasUsesOutsideLoop(DRS.BaseInst, L))
- return false;
-
- // Consider a DAGRootSet with N-1 roots (so N different values including
- // BaseInst).
- // Define d = Roots[0] - BaseInst, which should be the same as
- // Roots[I] - Roots[I-1] for all I in [1..N).
- // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
- // loop iteration J.
- //
- // Now, For the loop iterations to be consecutive:
- // D = d * N
- const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- if (!ADR)
- return false;
-
- // Check that the first root is evenly spaced.
- unsigned N = DRS.Roots.size() + 1;
- const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
- if (isa<SCEVCouldNotCompute>(StepSCEV) || StepSCEV->getType()->isPointerTy())
- return false;
- const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
- if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
- return false;
-
-  // Check that the remaining roots are evenly spaced.
- for (unsigned i = 1; i < N - 1; ++i) {
- const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
- SE->getSCEV(DRS.Roots[i-1]));
- if (NewStepSCEV != StepSCEV)
- return false;
- }
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::
-findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
- // The base of a RootSet must be an AddRec, so it can be erased.
- const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
- if (!IVU_ADR || IVU_ADR->getLoop() != L)
- return false;
-
- std::map<int64_t, Instruction*> V;
- if (!collectPossibleRoots(IVU, V))
- return false;
-
- // If we didn't get a root for index zero, then IVU must be
- // subsumed.
- if (V.find(0) == V.end())
- SubsumedInsts.insert(IVU);
-
- // Partition the vector into monotonically increasing indexes.
- DAGRootSet DRS;
- DRS.BaseInst = nullptr;
-
- SmallVector<DAGRootSet, 16> PotentialRootSets;
-
- for (auto &KV : V) {
- if (!DRS.BaseInst) {
- DRS.BaseInst = KV.second;
- DRS.SubsumedInsts = SubsumedInsts;
- } else if (DRS.Roots.empty()) {
- DRS.Roots.push_back(KV.second);
- } else if (V.find(KV.first - 1) != V.end()) {
- DRS.Roots.push_back(KV.second);
- } else {
- // Linear sequence terminated.
- if (!validateRootSet(DRS))
- return false;
-
- // Construct a new DAGRootSet with the next sequence.
- PotentialRootSets.push_back(DRS);
- DRS.BaseInst = KV.second;
- DRS.Roots.clear();
- }
- }
-
- if (!validateRootSet(DRS))
- return false;
-
- PotentialRootSets.push_back(DRS);
-
- RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::findRoots() {
- Inc = IVToIncMap[IV];
-
- assert(RootSets.empty() && "Unclean state!");
- if (std::abs(Inc) == 1) {
- for (auto *IVU : IV->users()) {
- if (isLoopIncrement(IVU, IV))
- LoopIncs.push_back(cast<Instruction>(IVU));
- }
- findRootsRecursive(IV, SmallInstructionSet());
- LoopIncs.push_back(IV);
- } else {
- if (!findRootsBase(IV, SmallInstructionSet()))
- return false;
- }
-
- // Ensure all sets have the same size.
- if (RootSets.empty()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
- return false;
- }
- for (auto &V : RootSets) {
- if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
- LLVM_DEBUG(
- dbgs()
- << "LRR: Aborting because not all root sets have the same size\n");
- return false;
- }
- }
-
- Scale = RootSets[0].Roots.size() + 1;
-
- if (Scale > IL_MaxRerollIterations) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
- << "#Found=" << Scale
- << ", #Max=" << IL_MaxRerollIterations << "\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
- << "\n");
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
- // Populate the MapVector with all instructions in the block, in order first,
- // so we can iterate over the contents later in perfect order.
- for (auto &I : *L->getHeader()) {
- Uses[&I].resize(IL_End);
- }
-
- SmallInstructionSet Exclude;
- for (auto &DRS : RootSets) {
- Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
- Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
- Exclude.insert(DRS.BaseInst);
- }
- Exclude.insert(LoopIncs.begin(), LoopIncs.end());
-
- for (auto &DRS : RootSets) {
- DenseSet<Instruction*> VBase;
- collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
- for (auto *I : VBase) {
- Uses[I].set(0);
- }
-
- unsigned Idx = 1;
- for (auto *Root : DRS.Roots) {
- DenseSet<Instruction*> V;
- collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
-
- // While we're here, check the use sets are the same size.
- if (V.size() != VBase.size()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
- return false;
- }
-
- for (auto *I : V) {
- Uses[I].set(Idx);
- }
- ++Idx;
- }
-
- // Make sure our subsumed instructions are remembered too.
- for (auto *I : DRS.SubsumedInsts) {
- Uses[I].set(IL_All);
- }
- }
-
- // Make sure the loop increments are also accounted for.
-
- Exclude.clear();
- for (auto &DRS : RootSets) {
- Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
- Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
- Exclude.insert(DRS.BaseInst);
- }
-
- DenseSet<Instruction*> V;
- collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
- for (auto *I : V) {
- if (I->mayHaveSideEffects()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - "
- << "An instruction which does not belong to any root "
- << "sets must not have side effects: " << *I);
- return false;
- }
- Uses[I].set(IL_All);
- }
-
- return true;
-}
-
-/// Get the next instruction in "In" that is a member of set Val.
-/// Start searching from StartI, and do not return anything in Exclude.
-/// If StartI is not given, start from In.begin().
-LoopReroll::DAGRootTracker::UsesTy::iterator
-LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
- const SmallInstructionSet &Exclude,
- UsesTy::iterator *StartI) {
- UsesTy::iterator I = StartI ? *StartI : In.begin();
- while (I != In.end() && (I->second.test(Val) == 0 ||
- Exclude.contains(I->first)))
- ++I;
- return I;
-}
-
-bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
- for (auto &DRS : RootSets) {
- if (DRS.BaseInst == I)
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
- for (auto &DRS : RootSets) {
- if (is_contained(DRS.Roots, I))
- return true;
- }
- return false;
-}
-
-/// Return true if instruction I depends on any instruction between
-/// Start and End.
-bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
- UsesTy::iterator Start,
- UsesTy::iterator End) {
- for (auto *U : I->users()) {
- for (auto It = Start; It != End; ++It)
- if (U == It->first)
- return true;
- }
- return false;
-}
-
-static bool isIgnorableInst(const Instruction *I) {
- if (isa<DbgInfoIntrinsic>(I))
- return true;
- const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
- if (!II)
- return false;
- switch (II->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::annotation:
- case Intrinsic::ptr_annotation:
- case Intrinsic::var_annotation:
- // TODO: the following intrinsics may also be allowed:
- // lifetime_start, lifetime_end, invariant_start, invariant_end
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
- // We now need to check for equivalence of the use graph of each root with
- // that of the primary induction variable (excluding the roots). Our goal
- // here is not to solve the full graph isomorphism problem, but rather to
- // catch common cases without a lot of work. As a result, we will assume
- // that the relative order of the instructions in each unrolled iteration
- // is the same (although we will not make an assumption about how the
- // different iterations are intermixed). Note that while the order must be
- // the same, the instructions may not be in the same basic block.
-
- // An array of just the possible reductions for this scale factor. When we
- // collect the set of all users of some root instructions, these reduction
- // instructions are treated as 'final' (their uses are not considered).
- // This is important because we don't want the root use set to search down
- // the reduction chain.
- SmallInstructionSet PossibleRedSet;
- SmallInstructionSet PossibleRedLastSet;
- SmallInstructionSet PossibleRedPHISet;
- Reductions.restrictToScale(Scale, PossibleRedSet,
- PossibleRedPHISet, PossibleRedLastSet);
-
- // Populate "Uses" with where each instruction is used.
- if (!collectUsedInstructions(PossibleRedSet))
- return false;
-
- // Make sure we mark the reduction PHIs as used in all iterations.
- for (auto *I : PossibleRedPHISet) {
- Uses[I].set(IL_All);
- }
-
- // Make sure we mark loop-control-only PHIs as used in all iterations. See
- // comment above LoopReroll::isLoopControlIV for more information.
- BasicBlock *Header = L->getHeader();
- for (Instruction *LoopControlIV : LoopControlIVs) {
- for (auto *U : LoopControlIV->users()) {
- Instruction *IVUser = dyn_cast<Instruction>(U);
- // IVUser could be loop increment or compare
- Uses[IVUser].set(IL_All);
- for (auto *UU : IVUser->users()) {
- Instruction *UUser = dyn_cast<Instruction>(UU);
- // UUser could be compare, PHI or branch
- Uses[UUser].set(IL_All);
- // Skip SExt
- if (isa<SExtInst>(UUser)) {
- UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
- Uses[UUser].set(IL_All);
- }
- // Is UUser a compare instruction?
- if (UU->hasOneUse()) {
- Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
- if (BI == cast<BranchInst>(Header->getTerminator()))
- Uses[BI].set(IL_All);
- }
- }
- }
- }
-
- // Make sure all instructions in the loop are in one and only one
- // set.
- for (auto &KV : Uses) {
- if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
- LLVM_DEBUG(
- dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
- << *KV.first << " (#uses=" << KV.second.count() << ")\n");
- return false;
- }
- }
-
- LLVM_DEBUG(for (auto &KV
- : Uses) {
- dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
- });
-
- BatchAAResults BatchAA(*AA);
- for (unsigned Iter = 1; Iter < Scale; ++Iter) {
- // In addition to regular aliasing information, we need to look for
- // instructions from later (future) iterations that have side effects
- // preventing us from reordering them past other instructions with side
- // effects.
- bool FutureSideEffects = false;
- AliasSetTracker AST(BatchAA);
- // The map between instructions in f(%iv.(i+1)) and f(%iv).
- DenseMap<Value *, Value *> BaseMap;
-
- // Compare iteration Iter to the base.
- SmallInstructionSet Visited;
- auto BaseIt = nextInstr(0, Uses, Visited);
- auto RootIt = nextInstr(Iter, Uses, Visited);
- auto LastRootIt = Uses.begin();
-
- while (BaseIt != Uses.end() && RootIt != Uses.end()) {
- Instruction *BaseInst = BaseIt->first;
- Instruction *RootInst = RootIt->first;
-
- // Skip over the IV or root instructions; only match their users.
- bool Continue = false;
- if (isBaseInst(BaseInst)) {
- Visited.insert(BaseInst);
- BaseIt = nextInstr(0, Uses, Visited);
- Continue = true;
- }
- if (isRootInst(RootInst)) {
- LastRootIt = RootIt;
- Visited.insert(RootInst);
- RootIt = nextInstr(Iter, Uses, Visited);
- Continue = true;
- }
- if (Continue) continue;
-
- if (!BaseInst->isSameOperationAs(RootInst)) {
- // Last chance saloon. We don't try and solve the full isomorphism
- // problem, but try and at least catch the case where two instructions
- // *of different types* are round the wrong way. We won't be able to
- // efficiently tell, given two ADD instructions, which way around we
- // should match them, but given an ADD and a SUB, we can at least infer
- // which one is which.
- //
- // This should allow us to deal with a greater subset of the isomorphism
- // problem. It does however change a linear algorithm into a quadratic
- // one, so limit the number of probes we do.
- auto TryIt = RootIt;
- unsigned N = NumToleratedFailedMatches;
- while (TryIt != Uses.end() &&
- !BaseInst->isSameOperationAs(TryIt->first) &&
- N--) {
- ++TryIt;
- TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
- }
-
- if (TryIt == Uses.end() || TryIt == RootIt ||
- instrDependsOn(TryIt->first, RootIt, TryIt)) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
- << *BaseInst << " vs. " << *RootInst << "\n");
- return false;
- }
-
- RootIt = TryIt;
- RootInst = TryIt->first;
- }
-
- // All instructions between the last root and this root
- // may belong to some other iteration. If they belong to a
- // future iteration, then they're dangerous to alias with.
- //
- // Note that because we allow a limited amount of flexibility in the order
- // that we visit nodes, LastRootIt might be *before* RootIt, in which
- // case we've already checked this set of instructions so we shouldn't
- // do anything.
- for (; LastRootIt < RootIt; ++LastRootIt) {
- Instruction *I = LastRootIt->first;
- if (LastRootIt->second.find_first() < (int)Iter)
- continue;
- if (I->mayWriteToMemory())
- AST.add(I);
- // Note: This is specifically guarded by a check on isa<PHINode>,
- // which while a valid (somewhat arbitrary) micro-optimization, is
- // needed because otherwise isSafeToSpeculativelyExecute returns
- // false on PHI nodes.
- if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
- !isSafeToSpeculativelyExecute(I))
- // Intervening instructions cause side effects.
- FutureSideEffects = true;
- }
-
- // Make sure that this instruction, which is in the use set of this
- // root instruction, does not also belong to the base set or the set of
- // some other root instruction.
- if (RootIt->second.count() > 1) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (prev. case overlap)\n");
- return false;
- }
-
- // Make sure that we don't alias with any instruction in the alias set
- // tracker. If we do, then we depend on a future iteration, and we
- // can't reroll.
- if (RootInst->mayReadFromMemory()) {
- for (auto &K : AST) {
- if (isModOrRefSet(K.aliasesUnknownInst(RootInst, BatchAA))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
- << *BaseInst << " vs. " << *RootInst
- << " (depends on future store)\n");
- return false;
- }
- }
- }
-
- // If we've past an instruction from a future iteration that may have
- // side effects, and this instruction might also, then we can't reorder
- // them, and this matching fails. As an exception, we allow the alias
- // set tracker to handle regular (unordered) load/store dependencies.
- if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
- !isSafeToSpeculativelyExecute(BaseInst)) ||
- (!isUnorderedLoadStore(RootInst) &&
- !isSafeToSpeculativelyExecute(RootInst)))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst
- << " (side effects prevent reordering)\n");
- return false;
- }
-
- // For instructions that are part of a reduction, if the operation is
- // associative, then don't bother matching the operands (because we
- // already know that the instructions are isomorphic, and the order
- // within the iteration does not matter). For non-associative reductions,
- // we do need to match the operands, because we need to reject
- // out-of-order instructions within an iteration!
- // For example (assume floating-point addition), we need to reject this:
- // x += a[i]; x += b[i];
- // x += a[i+1]; x += b[i+1];
- // x += b[i+2]; x += a[i+2];
- bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
-
- if (!(InReduction && BaseInst->isAssociative())) {
- bool Swapped = false, SomeOpMatched = false;
- for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
- Value *Op2 = RootInst->getOperand(j);
-
- // If this is part of a reduction (and the operation is not
- // associatve), then we match all operands, but not those that are
- // part of the reduction.
- if (InReduction)
- if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
- if (Reductions.isPairInSame(RootInst, Op2I))
- continue;
-
- DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
- if (BMI != BaseMap.end()) {
- Op2 = BMI->second;
- } else {
- for (auto &DRS : RootSets) {
- if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
- Op2 = DRS.BaseInst;
- break;
- }
- }
- }
-
- if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
- // If we've not already decided to swap the matched operands, and
- // we've not already matched our first operand (note that we could
- // have skipped matching the first operand because it is part of a
- // reduction above), and the instruction is commutative, then try
- // the swapped match.
- if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
- BaseInst->getOperand(!j) == Op2) {
- Swapped = true;
- } else {
- LLVM_DEBUG(dbgs()
- << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (operand " << j << ")\n");
- return false;
- }
- }
-
- SomeOpMatched = true;
- }
- }
-
- if ((!PossibleRedLastSet.count(BaseInst) &&
- hasUsesOutsideLoop(BaseInst, L)) ||
- (!PossibleRedLastSet.count(RootInst) &&
- hasUsesOutsideLoop(RootInst, L))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (uses outside loop)\n");
- return false;
- }
-
- Reductions.recordPair(BaseInst, RootInst, Iter);
- BaseMap.insert(std::make_pair(RootInst, BaseInst));
-
- LastRootIt = RootIt;
- Visited.insert(BaseInst);
- Visited.insert(RootInst);
- BaseIt = nextInstr(0, Uses, Visited);
- RootIt = nextInstr(Iter, Uses, Visited);
- }
- assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
- "Mismatched set sizes!");
- }
-
- LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
- << "\n");
-
- return true;
-}
-
-void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
- BasicBlock *Header = L->getHeader();
-
- // Compute the start and increment for each BaseInst before we start erasing
- // instructions.
- SmallVector<const SCEV *, 8> StartExprs;
- SmallVector<const SCEV *, 8> IncrExprs;
- for (auto &DRS : RootSets) {
- const SCEVAddRecExpr *IVSCEV =
- cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- StartExprs.push_back(IVSCEV->getStart());
- IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
- }
-
- // Remove instructions associated with non-base iterations.
- for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) {
- unsigned I = Uses[&Inst].find_first();
- if (I > 0 && I < IL_All) {
- LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n");
- Inst.eraseFromParent();
- }
- }
-
- // Rewrite each BaseInst using SCEV.
- for (size_t i = 0, e = RootSets.size(); i != e; ++i)
- // Insert the new induction variable.
- replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
-
- { // Limit the lifetime of SCEVExpander.
- BranchInst *BI = cast<BranchInst>(Header->getTerminator());
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "reroll");
- auto Zero = SE->getZero(BackedgeTakenCount->getType());
- auto One = SE->getOne(BackedgeTakenCount->getType());
- auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
- Value *NewIV =
- Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
- Header->getFirstNonPHIOrDbg());
- // FIXME: This arithmetic can overflow.
- auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
- auto ScaledTripCount = SE->getMulExpr(
- TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
- auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
- Value *TakenCount =
- Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
- Header->getFirstNonPHIOrDbg());
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
- BI->setCondition(Cond);
-
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
- }
-
- SimplifyInstructionsInBlock(Header, TLI);
- DeleteDeadPHIs(Header, TLI);
-}
-
-void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
- const SCEV *Start,
- const SCEV *IncrExpr) {
- BasicBlock *Header = L->getHeader();
- Instruction *Inst = DRS.BaseInst;
-
- const SCEV *NewIVSCEV =
- SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
-
- { // Limit the lifetime of SCEVExpander.
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "reroll");
- Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
- Header->getFirstNonPHIOrDbg());
-
- for (auto &KV : Uses)
- if (KV.second.find_first() == 0)
- KV.first->replaceUsesOfWith(Inst, NewIV);
- }
-}
-
-// Validate the selected reductions. All iterations must have an isomorphic
-// part of the reduction chain and, for non-associative reductions, the chain
-// entries must appear in order.
-bool LoopReroll::ReductionTracker::validateSelected() {
- // For a non-associative reduction, the chain entries must appear in order.
- for (int i : Reds) {
- int PrevIter = 0, BaseCount = 0, Count = 0;
- for (Instruction *J : PossibleReds[i]) {
- // Note that all instructions in the chain must have been found because
- // all instructions in the function must have been assigned to some
- // iteration.
- int Iter = PossibleRedIter[J];
- if (Iter != PrevIter && Iter != PrevIter + 1 &&
- !PossibleReds[i].getReducedValue()->isAssociative()) {
- LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
- << J << "\n");
- return false;
- }
-
- if (Iter != PrevIter) {
- if (Count != BaseCount) {
- LLVM_DEBUG(dbgs()
- << "LRR: Iteration " << PrevIter << " reduction use count "
- << Count << " is not equal to the base use count "
- << BaseCount << "\n");
- return false;
- }
-
- Count = 0;
- }
-
- ++Count;
- if (Iter == 0)
- ++BaseCount;
-
- PrevIter = Iter;
- }
- }
-
- return true;
-}
-
-// For all selected reductions, remove all parts except those in the first
-// iteration (and the PHI). Replace outside uses of the reduced value with uses
-// of the first-iteration reduced value (in other words, reroll the selected
-// reductions).
-void LoopReroll::ReductionTracker::replaceSelected() {
- // Fixup reductions to refer to the last instruction associated with the
- // first iteration (not the last).
- for (int i : Reds) {
- int j = 0;
- for (int e = PossibleReds[i].size(); j != e; ++j)
- if (PossibleRedIter[PossibleReds[i][j]] != 0) {
- --j;
- break;
- }
-
- // Replace users with the new end-of-chain value.
- SmallInstructionVector Users;
- for (User *U : PossibleReds[i].getReducedValue()->users()) {
- Users.push_back(cast<Instruction>(U));
- }
-
- for (Instruction *User : Users)
- User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
- PossibleReds[i][j]);
- }
-}
-
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
-// be intermixed with eachother. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-// the next unmatched instruction in f(%iv.(i+1)).
-// - Ensure that both matched instructions don't have any external users
-// (with the exception of last-in-chain reduction instructions).
-// - Track the (aliasing) write set, and other side effects, of all
-// instructions that belong to future iterations that come before the matched
-// instructions. If the matched instructions read from that write set, then
-// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-// if any of these future instructions had side effects (could not be
-// speculatively executed), and so do the matched instructions, when we
-// cannot reorder those side-effect-producing instructions, and rerolling
-// fails.
-//
-// Finally, we make sure that all loop instructions are either loop increment
-// roots, belong to simple latch code, parts of validated reductions, part of
-// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
-// have been validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *BackedgeTakenCount,
- ReductionTracker &Reductions) {
- DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
- IVToIncMap, LoopControlIVs);
-
- if (!DAGRoots.findRoots())
- return false;
- LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
- << "\n");
-
- if (!DAGRoots.validate(Reductions))
- return false;
- if (!Reductions.validateSelected())
- return false;
- // At this point, we've validated the rerolling, and we're committed to
- // making changes!
-
- Reductions.replaceSelected();
- DAGRoots.replace(BackedgeTakenCount);
-
- ++NumRerolledLoops;
- return true;
-}
-
-bool LoopReroll::runOnLoop(Loop *L) {
- BasicBlock *Header = L->getHeader();
- LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
- << Header->getName() << " (" << L->getNumBlocks()
- << " block(s))\n");
-
- // For now, we'll handle only single BB loops.
- if (L->getNumBlocks() > 1)
- return false;
-
- if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return false;
-
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
- LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
- LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
- << "\n");
-
- // First, we need to find the induction variable with respect to which we can
- // reroll (there may be several possible options).
- SmallInstructionVector PossibleIVs;
- IVToIncMap.clear();
- LoopControlIVs.clear();
- collectPossibleIVs(L, PossibleIVs);
-
- if (PossibleIVs.empty()) {
- LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
- return false;
- }
-
- ReductionTracker Reductions;
- collectPossibleReductions(L, Reductions);
- bool Changed = false;
-
- // For each possible IV, collect the associated possible set of 'root' nodes
- // (i+1, i+2, etc.).
- for (Instruction *PossibleIV : PossibleIVs)
- if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
- Changed = true;
- break;
- }
- LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
-
- // Trip count of L has changed so SE must be re-evaluated.
- if (Changed)
- SE->forgetLoop(L);
-
- return Changed;
-}
-
-PreservedAnalyses LoopRerollPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- return LoopReroll(&AR.AA, &AR.LI, &AR.SE, &AR.TLI, &AR.DT, true).runOnLoop(&L)
- ? getLoopPassPreservedAnalyses()
- : PreservedAnalyses::all();
-}
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index b528762b5456..67c011b747ac 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -192,6 +192,109 @@ Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
return VecStart;
}
+namespace {
+struct ShapeInfo {
+ unsigned NumRows;
+ unsigned NumColumns;
+
+ bool IsColumnMajor;
+
+ ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
+ : NumRows(NumRows), NumColumns(NumColumns),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+
+ ShapeInfo(Value *NumRows, Value *NumColumns)
+ : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
+ cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+
+ bool operator==(const ShapeInfo &other) {
+ return NumRows == other.NumRows && NumColumns == other.NumColumns;
+ }
+ bool operator!=(const ShapeInfo &other) { return !(*this == other); }
+
+ /// Returns true if shape-information is defined, meaning both dimensions
+ /// are != 0.
+ operator bool() const {
+ assert(NumRows == 0 || NumColumns != 0);
+ return NumRows != 0;
+ }
+
+ unsigned getStride() const {
+ if (IsColumnMajor)
+ return NumRows;
+ return NumColumns;
+ }
+
+ unsigned getNumVectors() const {
+ if (IsColumnMajor)
+ return NumColumns;
+ return NumRows;
+ }
+
+ /// Returns the transposed shape.
+ ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); }
+};
+} // namespace
+
+static bool isUniformShape(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+
+ switch (I->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul: // Scalar multiply.
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::Sub:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Return the ShapeInfo for the result of \p I, it it can be determined.
+static std::optional<ShapeInfo>
+computeShapeInfoForInst(Instruction *I,
+ const ValueMap<Value *, ShapeInfo> &ShapeMap) {
+ Value *M;
+ Value *N;
+ Value *K;
+ if (match(I, m_Intrinsic<Intrinsic::matrix_multiply>(
+ m_Value(), m_Value(), m_Value(M), m_Value(N), m_Value(K))))
+ return ShapeInfo(M, K);
+ if (match(I, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(), m_Value(M),
+ m_Value(N)))) {
+ // Flip dimensions.
+ return ShapeInfo(N, M);
+ }
+ if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+ m_Value(), m_Value(), m_Value(), m_Value(), m_Value(M),
+ m_Value(N))))
+ return ShapeInfo(N, M);
+ if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_load>(
+ m_Value(), m_Value(), m_Value(), m_Value(M), m_Value(N))))
+ return ShapeInfo(M, N);
+ Value *MatrixA;
+ if (match(I, m_Store(m_Value(MatrixA), m_Value()))) {
+ auto OpShape = ShapeMap.find(MatrixA);
+ if (OpShape != ShapeMap.end())
+ return OpShape->second;
+ }
+
+ if (isUniformShape(I)) {
+ // Find the first operand that has a known shape and use that.
+ for (auto &Op : I->operands()) {
+ auto OpShape = ShapeMap.find(Op.get());
+ if (OpShape != ShapeMap.end())
+ return OpShape->second;
+ }
+ }
+ return std::nullopt;
+}
+
/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
///
/// Currently, the lowering for each matrix intrinsic is done as follows:
@@ -383,48 +486,6 @@ class LowerMatrixIntrinsics {
}
};
- struct ShapeInfo {
- unsigned NumRows;
- unsigned NumColumns;
-
- bool IsColumnMajor;
-
- ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
- : NumRows(NumRows), NumColumns(NumColumns),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
-
- ShapeInfo(Value *NumRows, Value *NumColumns)
- : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
- cast<ConstantInt>(NumColumns)->getZExtValue()) {}
-
- bool operator==(const ShapeInfo &other) {
- return NumRows == other.NumRows && NumColumns == other.NumColumns;
- }
- bool operator!=(const ShapeInfo &other) { return !(*this == other); }
-
- /// Returns true if shape-information is defined, meaning both dimensions
- /// are != 0.
- operator bool() const {
- assert(NumRows == 0 || NumColumns != 0);
- return NumRows != 0;
- }
-
- unsigned getStride() const {
- if (IsColumnMajor)
- return NumRows;
- return NumColumns;
- }
-
- unsigned getNumVectors() const {
- if (IsColumnMajor)
- return NumColumns;
- return NumRows;
- }
-
- /// Returns the transposed shape.
- ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); }
- };
-
/// Maps instructions to their shape information. The shape information
/// describes the shape to be used while lowering. This matches the shape of
/// the result value of the instruction, with the only exceptions being store
@@ -554,25 +615,6 @@ public:
return true;
}
- bool isUniformShape(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
-
- switch (I->getOpcode()) {
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul: // Scalar multiply.
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::Sub:
- return true;
- default:
- return false;
- }
- }
-
/// Returns true if shape information can be used for \p V. The supported
/// instructions must match the instructions that can be lowered by this pass.
bool supportsShapeInfo(Value *V) {
@@ -610,43 +652,8 @@ public:
// New entry, set the value and insert operands
bool Propagate = false;
-
- Value *MatrixA;
- Value *MatrixB;
- Value *M;
- Value *N;
- Value *K;
- if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
- m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
- m_Value(N), m_Value(K)))) {
- Propagate = setShapeInfo(Inst, {M, K});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
- m_Value(MatrixA), m_Value(M), m_Value(N)))) {
- // Flip dimensions.
- Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
- m_Value(MatrixA), m_Value(), m_Value(),
- m_Value(), m_Value(M), m_Value(N)))) {
- Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
- m_Value(), m_Value(), m_Value(), m_Value(M),
- m_Value(N)))) {
- Propagate = setShapeInfo(Inst, {M, N});
- } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
- auto OpShape = ShapeMap.find(MatrixA);
- if (OpShape != ShapeMap.end())
- setShapeInfo(Inst, OpShape->second);
- continue;
- } else if (isUniformShape(Inst)) {
- // Find the first operand that has a known shape and use that.
- for (auto &Op : Inst->operands()) {
- auto OpShape = ShapeMap.find(Op.get());
- if (OpShape != ShapeMap.end()) {
- Propagate |= setShapeInfo(Inst, OpShape->second);
- break;
- }
- }
- }
+ if (auto SI = computeShapeInfoForInst(Inst, ShapeMap))
+ Propagate = setShapeInfo(Inst, *SI);
if (Propagate) {
NewWorkList.push_back(Inst);
@@ -891,20 +898,28 @@ public:
updateShapeAndReplaceAllUsesWith(I, NewInst);
CleanupBinOp(I, A, B);
}
- // A^t + B ^t -> (A + B)^t
+ // A^t + B ^t -> (A + B)^t. Pick rows and columns from first transpose. If
+ // the shape of the second transpose is different, there's a shape conflict
+ // which gets resolved by picking the shape of the first operand.
else if (match(&I, m_FAdd(m_Value(A), m_Value(B))) &&
match(A, m_Intrinsic<Intrinsic::matrix_transpose>(
m_Value(AT), m_ConstantInt(R), m_ConstantInt(C))) &&
match(B, m_Intrinsic<Intrinsic::matrix_transpose>(
- m_Value(BT), m_ConstantInt(R), m_ConstantInt(C)))) {
+ m_Value(BT), m_ConstantInt(), m_ConstantInt()))) {
IRBuilder<> Builder(&I);
- Value *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd"));
- setShapeInfo(Add, {C, R});
+ auto *Add = cast<Instruction>(Builder.CreateFAdd(AT, BT, "mfadd"));
+ setShapeInfo(Add, {R, C});
MatrixBuilder MBuilder(Builder);
Instruction *NewInst = MBuilder.CreateMatrixTranspose(
- Add, C->getZExtValue(), R->getZExtValue(), "mfadd_t");
+ Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
updateShapeAndReplaceAllUsesWith(I, NewInst);
+ assert(computeShapeInfoForInst(NewInst, ShapeMap) ==
+ computeShapeInfoForInst(&I, ShapeMap) &&
+ "Shape of new instruction doesn't match original shape.");
CleanupBinOp(I, A, B);
+ assert(computeShapeInfoForInst(Add, ShapeMap).value_or(ShapeMap[Add]) ==
+ ShapeMap[Add] &&
+ "Shape of updated addition doesn't match cached shape.");
}
}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index bdbaf4f55c96..138dc38b5c14 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2956,7 +2956,12 @@ private:
assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
"Non-byte-multiple bit width");
// Move the insertion point just past the load so that we can refer to it.
- IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+ BasicBlock::iterator LIIt = std::next(LI.getIterator());
+ // Ensure the insertion point comes before any debug-info immediately
+ // after the load, so that variable values referring to the load are
+ // dominated by it.
+ LIIt.setHeadBit(true);
+ IRB.SetInsertPoint(LI.getParent(), LIIt);
// Create a placeholder value with the same type as LI to use as the
// basis for the new value. This allows us to replace the uses of LI with
// the computed value, and then replace the placeholder with LI, leaving
@@ -3937,30 +3942,62 @@ private:
return false;
}
- // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
+ // Fold gep (select cond, ptr1, ptr2), idx
+ // => select cond, gep(ptr1, idx), gep(ptr2, idx)
+ // and gep ptr, (select cond, idx1, idx2)
+ // => select cond, gep(ptr, idx1), gep(ptr, idx2)
bool foldGEPSelect(GetElementPtrInst &GEPI) {
- if (!GEPI.hasAllConstantIndices())
- return false;
+ // Check whether the GEP has exactly one select operand and all indices
+ // will become constant after the transform.
+ SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand());
+ for (Value *Op : GEPI.indices()) {
+ if (auto *SI = dyn_cast<SelectInst>(Op)) {
+ if (Sel)
+ return false;
+
+ Sel = SI;
+ if (!isa<ConstantInt>(Sel->getTrueValue()) ||
+ !isa<ConstantInt>(Sel->getFalseValue()))
+ return false;
+ continue;
+ }
- SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
+ if (!isa<ConstantInt>(Op))
+ return false;
+ }
+
+ if (!Sel)
+ return false;
LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
<< "\n original: " << *Sel
<< "\n " << GEPI);
+ auto GetNewOps = [&](Value *SelOp) {
+ SmallVector<Value *> NewOps;
+ for (Value *Op : GEPI.operands())
+ if (Op == Sel)
+ NewOps.push_back(SelOp);
+ else
+ NewOps.push_back(Op);
+ return NewOps;
+ };
+
+ Value *True = Sel->getTrueValue();
+ Value *False = Sel->getFalseValue();
+ SmallVector<Value *> TrueOps = GetNewOps(True);
+ SmallVector<Value *> FalseOps = GetNewOps(False);
+
IRB.SetInsertPoint(&GEPI);
- SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
Type *Ty = GEPI.getSourceElementType();
- Value *True = Sel->getTrueValue();
- Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep",
- IsInBounds);
-
- Value *False = Sel->getFalseValue();
+ Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
+ True->getName() + ".sroa.gep", IsInBounds);
- Value *NFalse = IRB.CreateGEP(Ty, False, Index,
- False->getName() + ".sroa.gep", IsInBounds);
+ Value *NFalse =
+ IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
+ False->getName() + ".sroa.gep", IsInBounds);
Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
Sel->getName() + ".sroa.sel");
@@ -4034,8 +4071,7 @@ private:
}
bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (isa<SelectInst>(GEPI.getPointerOperand()) &&
- foldGEPSelect(GEPI))
+ if (foldGEPSelect(GEPI))
return true;
if (isa<PHINode>(GEPI.getPointerOperand()) &&
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index c75de8687879..88934a34d2e2 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <optional>
@@ -155,6 +156,26 @@ void llvm::createMemCpyLoopKnownSize(
"Bytes copied should match size in the call!");
}
+// \returns \p Len udiv \p OpSize, checking for optimization opportunities.
+static Value *getRuntimeLoopCount(const DataLayout &DL, IRBuilderBase &B,
+ Value *Len, Value *OpSize,
+ unsigned OpSizeVal) {
+ // For powers of 2, we can lshr by log2 instead of using udiv.
+ if (isPowerOf2_32(OpSizeVal))
+ return B.CreateLShr(Len, Log2_32(OpSizeVal));
+ return B.CreateUDiv(Len, OpSize);
+}
+
+// \returns \p Len urem \p OpSize, checking for optimization opportunities.
+static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
+ Value *Len, Value *OpSize,
+ unsigned OpSizeVal) {
+ // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
+ if (isPowerOf2_32(OpSizeVal))
+ return B.CreateAnd(Len, OpSizeVal - 1);
+ return B.CreateURem(Len, OpSize);
+}
+
void llvm::createMemCpyLoopUnknownSize(
Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
@@ -194,9 +215,11 @@ void llvm::createMemCpyLoopUnknownSize(
Type *Int8Type = Type::getInt8Ty(Ctx);
bool LoopOpIsInt8 = LoopOpType == Int8Type;
ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
- Value *RuntimeLoopCount = LoopOpIsInt8 ?
- CopyLen :
- PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+ Value *RuntimeLoopCount = LoopOpIsInt8
+ ? CopyLen
+ : getRuntimeLoopCount(DL, PLBuilder, CopyLen,
+ CILoopOpSize, LoopOpSize);
+
BasicBlock *LoopBB =
BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
IRBuilder<> LoopBuilder(LoopBB);
@@ -239,8 +262,8 @@ void llvm::createMemCpyLoopUnknownSize(
assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
"Store size is expected to match type size");
- // Add in the
- Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+ Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+ CILoopOpSize, LoopOpSize);
Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
// Loop body for the residual copy.
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 648a5276a3d6..1ffa003db7d5 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -110,6 +110,24 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) {
}
void StackInfoBuilder::visit(Instruction &Inst) {
+ // Visit non-intrinsic debug-info records attached to Inst.
+ for (auto &DPV : Inst.getDbgValueRange()) {
+ auto AddIfInteresting = [&](Value *V) {
+ if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
+ if (!isInterestingAlloca(*AI))
+ return;
+ AllocaInfo &AInfo = Info.AllocasToInstrument[AI];
+ auto &DPVVec = AInfo.DbgVariableRecords;
+ if (DPVVec.empty() || DPVVec.back() != &DPV)
+ DPVVec.push_back(&DPV);
+ }
+ };
+
+ for_each(DPV.location_ops(), AddIfInteresting);
+ if (DPV.isDbgAssign())
+ AddIfInteresting(DPV.getAddress());
+ }
+
if (CallInst *CI = dyn_cast<CallInst>(&Inst)) {
if (CI->canReturnTwice()) {
Info.CallsReturnTwice = true;
@@ -138,30 +156,19 @@ void StackInfoBuilder::visit(Instruction &Inst) {
return;
}
if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst)) {
- for (Value *V : DVI->location_ops()) {
+ auto AddIfInteresting = [&](Value *V) {
if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
if (!isInterestingAlloca(*AI))
- continue;
+ return;
AllocaInfo &AInfo = Info.AllocasToInstrument[AI];
auto &DVIVec = AInfo.DbgVariableIntrinsics;
if (DVIVec.empty() || DVIVec.back() != DVI)
DVIVec.push_back(DVI);
}
- }
- }
-
- // Check for non-intrinsic debug-info records.
- for (auto &DPV : Inst.getDbgValueRange()) {
- for (Value *V : DPV.location_ops()) {
- if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
- if (!isInterestingAlloca(*AI))
- continue;
- AllocaInfo &AInfo = Info.AllocasToInstrument[AI];
- auto &DPVVec = AInfo.DbgVariableRecords;
- if (DPVVec.empty() || DPVVec.back() != &DPV)
- DPVVec.push_back(&DPV);
- }
- }
+ };
+ for_each(DVI->location_ops(), AddIfInteresting);
+ if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
+ AddIfInteresting(DAI->getAddress());
}
Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst);
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 1b142f14d811..5aa6df49e7f6 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1985,7 +1985,28 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
// increment to the new (widened) increment.
auto *OrigInc =
cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock));
+
WideInc->setDebugLoc(OrigInc->getDebugLoc());
+ // We are replacing a narrow IV increment with a wider IV increment. If
+ // the original (narrow) increment did not wrap, the wider increment one
+ // should not wrap either. Set the flags to be the union of both wide
+ // increment and original increment; this ensures we preserve flags SCEV
+ // could infer for the wider increment. Limit this only to cases where
+ // both increments directly increment the corresponding PHI nodes and have
+ // the same opcode. It is not safe to re-use the flags from the original
+ // increment, if it is more complex and SCEV expansion may have yielded a
+ // more simplified wider increment.
+ bool MatchingOps =
+ match(OrigInc, m_c_BinOp(m_Specific(OrigPhi), m_Value())) &&
+ match(WideInc, m_c_BinOp(m_Specific(WidePhi), m_Value())) &&
+ OrigInc->getOpcode() == WideInc->getOpcode();
+ if (MatchingOps && isa<OverflowingBinaryOperator>(OrigInc) &&
+ isa<OverflowingBinaryOperator>(WideInc)) {
+ WideInc->setHasNoUnsignedWrap(WideInc->hasNoUnsignedWrap() ||
+ OrigInc->hasNoUnsignedWrap());
+ WideInc->setHasNoSignedWrap(WideInc->hasNoSignedWrap() ||
+ OrigInc->hasNoSignedWrap());
+ }
}
}
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index a8ae3ee3b251..93a4c829df06 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -552,6 +552,7 @@ void Mapper::remapDPValue(DPValue &V) {
V.setKillAddress();
else if (NewAddr)
V.setAddress(NewAddr);
+ V.setAssignId(cast<DIAssignID>(mapMetadata(V.getAssignID())));
}
// Find Value operands and remap those.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c0b7298f7800..c54d065cac63 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -175,6 +175,15 @@ static cl::opt<int> RootLookAheadMaxDepth(
"slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for searching best rooting option"));
+static cl::opt<unsigned> MinProfitableStridedLoads(
+ "slp-min-strided-loads", cl::init(2), cl::Hidden,
+ cl::desc("The minimum number of loads, which should be considered strided, "
+ "if the stride is > 1 or is runtime value"));
+
+static cl::opt<unsigned> MaxProfitableLoadStride(
+ "slp-max-stride", cl::init(8), cl::Hidden,
+ cl::desc("The maximum stride, considered to be profitable."));
+
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
@@ -2575,7 +2584,7 @@ private:
enum EntryState {
Vectorize,
ScatterVectorize,
- PossibleStridedVectorize,
+ StridedVectorize,
NeedToGather
};
EntryState State;
@@ -2753,8 +2762,8 @@ private:
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
- case PossibleStridedVectorize:
- dbgs() << "PossibleStridedVectorize\n";
+ case StridedVectorize:
+ dbgs() << "StridedVectorize\n";
break;
case NeedToGather:
dbgs() << "NeedToGather\n";
@@ -3680,7 +3689,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::PossibleStridedVectorize)
+ Entry->State == TreeEntry::StridedVectorize)
return "color=blue";
return "";
}
@@ -3842,12 +3851,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
namespace {
/// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState {
- Gather,
- Vectorize,
- ScatterVectorize,
- PossibleStridedVectorize
-};
+enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
} // anonymous namespace
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3878,6 +3882,14 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
return CommonAlignment;
}
+/// Check if \p Order represents reverse order.
+static bool isReverseOrder(ArrayRef<unsigned> Order) {
+ unsigned Sz = Order.size();
+ return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+ return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
+ });
+}
+
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
@@ -3900,7 +3912,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
PointerOps.clear();
- PointerOps.resize(VL.size());
+ const unsigned Sz = VL.size();
+ PointerOps.resize(Sz);
auto *POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
@@ -3911,12 +3924,12 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
Order.clear();
+ auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
- bool IsPossibleStrided = false;
if (IsSorted) {
Value *Ptr0;
Value *PtrN;
@@ -3930,30 +3943,71 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
std::optional<int> Diff =
getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
// Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+ if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
// Simple check if not a strided access - clear order.
- IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
+ bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
+ // Try to generate strided load node if:
+ // 1. Target with strided load support is detected.
+ // 2. The number of loads is greater than MinProfitableStridedLoads,
+ // or the potential stride <= MaxProfitableLoadStride and the
+ // potential stride is power-of-2 (to avoid perf regressions for the very
+ // small number of loads) and max distance > number of loads, or potential
+ // stride is -1.
+ // 3. The loads are ordered, or number of unordered loads <=
+ // MaxProfitableUnorderedLoads, or loads are in reversed order.
+ // (this check is to avoid extra costs for very expensive shuffles).
+ if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
+ (static_cast<unsigned>(std::abs(*Diff)) <=
+ MaxProfitableLoadStride * Sz &&
+ isPowerOf2_32(std::abs(*Diff)))) &&
+ static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
+ *Diff == -(static_cast<int>(Sz) - 1))) {
+ int Stride = *Diff / static_cast<int>(Sz - 1);
+ if (*Diff == Stride * static_cast<int>(Sz - 1)) {
+ Align Alignment =
+ cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
+ ->getAlign();
+ if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
+ // Iterate through all pointers and check if all distances are
+ // unique multiple of Dist.
+ SmallSet<int, 4> Dists;
+ for (Value *Ptr : PointerOps) {
+ int Dist = 0;
+ if (Ptr == PtrN)
+ Dist = *Diff;
+ else if (Ptr != Ptr0)
+ Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ // If the strides are not the same or repeated, we can't
+ // vectorize.
+ if (((Dist / Stride) * Stride) != Dist ||
+ !Dists.insert(Dist).second)
+ break;
+ }
+ if (Dists.size() == Sz)
+ return LoadsState::StridedVectorize;
+ }
+ }
+ }
}
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
// increases the cost.
Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
bool ProfitableGatherPointers =
- static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
- return L && L->isLoopInvariant(V);
- })) <= VL.size() / 2 && VL.size() > 2;
+ static_cast<unsigned>(count_if(
+ PointerOps,
+ [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
+ Sz > 2;
if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
(GEP && GEP->getNumOperands() == 2);
})) {
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
!TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
- : LoadsState::ScatterVectorize;
+ return LoadsState::ScatterVectorize;
}
}
@@ -4160,7 +4214,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::move(ResOrder);
}
if ((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::PossibleStridedVectorize) &&
+ TE.State == TreeEntry::StridedVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
!TE.isAltShuffle())
@@ -4418,7 +4472,7 @@ void BoUpSLP::reorderTopToBottom() {
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) ||
+ TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -4442,9 +4496,6 @@ void BoUpSLP::reorderTopToBottom() {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
- // Last chance orders - scatter vectorize. Try to use their orders if no
- // other orders or the order is counted already.
- SmallVector<OrdersType> StridedVectorizeOrders;
SmallPtrSet<const TreeEntry *, 4> VisitedOps;
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder this nodes, still need to extend and to use shuffle,
@@ -4491,11 +4542,6 @@ void BoUpSLP::reorderTopToBottom() {
if (Order.empty())
continue;
}
- // Postpone scatter orders.
- if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
- StridedVectorizeOrders.push_back(Order);
- continue;
- }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4512,22 +4558,8 @@ void BoUpSLP::reorderTopToBottom() {
++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
}
}
- // Set order of the user node.
- if (OrdersUses.empty()) {
- if (StridedVectorizeOrders.empty())
- continue;
- // Add (potentially!) strided vectorize orders.
- for (OrdersType &Order : StridedVectorizeOrders)
- ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
- } else {
- // Account (potentially!) strided vectorize orders only if it was used
- // already.
- for (OrdersType &Order : StridedVectorizeOrders) {
- auto *It = OrdersUses.find(Order);
- if (It != OrdersUses.end())
- ++It->second;
- }
- }
+ if (OrdersUses.empty())
+ continue;
// Choose the most used order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4569,7 +4601,7 @@ void BoUpSLP::reorderTopToBottom() {
continue;
}
if ((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) &&
+ TE->State == TreeEntry::StridedVectorize) &&
isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
@@ -4610,10 +4642,6 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
- // FIXME: Do not reorder (possible!) strided vectorized nodes, they
- // require reordering of the operands, which is not implemented yet.
- if (TE->State == TreeEntry::PossibleStridedVectorize)
- return false;
// Do not reorder if operand node is used by many user nodes.
if (any_of(TE->UserTreeIndices,
[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4664,13 +4692,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> NonVectorized;
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::PossibleStridedVectorize)
+ TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) ||
+ TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}
@@ -4688,7 +4716,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize ||
+ TE->State == TreeEntry::StridedVectorize ||
(TE->State == TreeEntry::NeedToGather &&
GathersToOrders.count(TE))) ||
TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4733,9 +4761,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
MapVector<OrdersType, unsigned,
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
OrdersUses;
- // Last chance orders - scatter vectorize. Try to use their orders if no
- // other orders or the order is counted already.
- SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
// Do the analysis for each tree entry only once, otherwise the order of
// the same node my be considered several times, though might be not
// profitable.
@@ -4757,11 +4782,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
return P.second == OpTE;
});
- // Postpone scatter orders.
- if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
- StridedVectorizeOrders.emplace_back(Order, NumOps);
- continue;
- }
// Stores actually store the mask, not the order, need to invert.
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4819,30 +4839,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
++Res.first->second;
}
}
- // If no orders - skip current nodes and jump to the next one, if any.
- if (OrdersUses.empty()) {
- if (StridedVectorizeOrders.empty() ||
- (Data.first->ReorderIndices.empty() &&
- Data.first->ReuseShuffleIndices.empty() &&
- !(IgnoreReorder &&
- Data.first == VectorizableTree.front().get()))) {
- for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- OrderedEntries.remove(Op.second);
- continue;
- }
- // Add (potentially!) strided vectorize orders.
- for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
- OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
- Pair.second;
- } else {
- // Account (potentially!) strided vectorize orders only if it was used
- // already.
- for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
- auto *It = OrdersUses.find(Pair.first);
- if (It != OrdersUses.end())
- It->second += Pair.second;
- }
- }
// Choose the best order.
ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
unsigned Cnt = OrdersUses.front().second;
@@ -4878,7 +4874,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::PossibleStridedVectorize &&
+ TE->State != TreeEntry::StridedVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
continue;
@@ -4910,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::PossibleStridedVectorize) {
+ Data.first->State == TreeEntry::StridedVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -4973,7 +4969,6 @@ void BoUpSLP::buildExternalUses(
// instructions. If that is the case, the one in FoundLane will
// be used.
if (UseEntry->State == TreeEntry::ScatterVectorize ||
- UseEntry->State == TreeEntry::PossibleStridedVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
@@ -5331,8 +5326,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
case LoadsState::ScatterVectorize:
return TreeEntry::ScatterVectorize;
- case LoadsState::PossibleStridedVectorize:
- return TreeEntry::PossibleStridedVectorize;
+ case LoadsState::StridedVectorize:
+ return TreeEntry::StridedVectorize;
case LoadsState::Gather:
#ifndef NDEBUG
Type *ScalarTy = VL0->getType();
@@ -5753,8 +5748,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BasicBlock *BB = nullptr;
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
- (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
- UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
bool AreAllSameInsts =
(S.getOpcode() && allSameBlock(VL)) ||
(S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5851,8 +5845,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Special processing for sorted pointers for ScatterVectorize node with
// constant indeces only.
if (AreAllSameInsts && UserTreeIdx.UserTE &&
- (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
- UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
+ UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
!(S.getOpcode() && allSameBlock(VL))) {
assert(S.OpValue->getType()->isPointerTy() &&
count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -6049,18 +6042,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
TE->setOperandsInOrder();
break;
- case TreeEntry::PossibleStridedVectorize:
+ case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
if (CurrentOrder.empty()) {
- TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
} else {
- TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
}
TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
@@ -7091,7 +7083,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
!isSplat(Gathers)) {
InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
SetVector<Value *> VectorizedLoads;
- SmallVector<unsigned> VectorizedStarts;
+ SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
SmallVector<unsigned> ScatterVectorized;
unsigned StartIdx = 0;
unsigned VF = VL.size() / 2;
@@ -7115,12 +7107,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
- case LoadsState::PossibleStridedVectorize:
+ case LoadsState::StridedVectorize:
// Mark the vectorized loads so that we don't vectorize them
// again.
// TODO: better handling of loads with reorders.
- if (LS == LoadsState::Vectorize && CurrentOrder.empty())
- VectorizedStarts.push_back(Cnt);
+ if (((LS == LoadsState::Vectorize ||
+ LS == LoadsState::StridedVectorize) &&
+ CurrentOrder.empty()) ||
+ (LS == LoadsState::StridedVectorize &&
+ isReverseOrder(CurrentOrder)))
+ VectorizedStarts.emplace_back(Cnt, LS);
else
ScatterVectorized.push_back(Cnt);
VectorizedLoads.insert(Slice.begin(), Slice.end());
@@ -7164,16 +7160,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
CostKind, TTI::OperandValueInfo(), LI);
}
auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
- for (unsigned P : VectorizedStarts) {
- auto *LI = cast<LoadInst>(VL[P]);
+ for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
+ auto *LI = cast<LoadInst>(VL[P.first]);
Align Alignment = LI->getAlign();
GatherCost +=
- TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
+ P.second == LoadsState::Vectorize
+ ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI)
+ : TTI.getStridedMemoryOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
- for (auto [I, V] : enumerate(VL.slice(P, VF)))
+ for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
@@ -7913,8 +7913,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
+ bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
- E->State != TreeEntry::PossibleStridedVectorize) {
+ (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
SmallVector<int> NewMask;
if (E->getOpcode() == Instruction::Store) {
// For stores the order is actually a mask.
@@ -7932,7 +7933,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -7952,7 +7953,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
auto GetCastContextHint = [&](Value *V) {
if (const TreeEntry *OpTE = getTreeEntry(V)) {
- if (OpTE->State == TreeEntry::ScatterVectorize)
+ if (OpTE->State == TreeEntry::ScatterVectorize ||
+ OpTE->State == TreeEntry::StridedVectorize)
return TTI::CastContextHint::GatherScatter;
if (OpTE->State == TreeEntry::Vectorize &&
OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
@@ -8028,8 +8030,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Calculate cost difference from vectorizing set of GEPs.
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
- assert(E->State == TreeEntry::Vectorize &&
- "Entry state expected to be Vectorize here.");
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::StridedVectorize) &&
+ "Entry state expected to be Vectorize or StridedVectorize here.");
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -8382,10 +8385,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VecLdCost = TTI->getMemoryOpCost(
Instruction::Load, VecTy, LI0->getAlign(),
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+ } else if (E->State == TreeEntry::StridedVectorize) {
+ Align CommonAlignment =
+ computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
+ VecLdCost = TTI->getStridedMemoryOpCost(
+ Instruction::Load, VecTy, LI0->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind);
} else {
- assert((E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
- "Unknown EntryState");
+ assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getGatherScatterOpCost(
@@ -8398,8 +8405,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates masked gather load then it is not a terminal node.
// Hence address operand cost is estimated separately.
- if (E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize)
+ if (E->State == TreeEntry::ScatterVectorize)
return Cost;
// Estimate cost of GEPs since this tree node is a terminator.
@@ -8608,7 +8614,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
- VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
+ VectorizableTree[0]->State != TreeEntry::StridedVectorize))
return false;
return true;
@@ -10579,11 +10585,6 @@ public:
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
bool PostponedPHIs) {
ValueList &VL = E->getOperand(NodeIdx);
- if (E->State == TreeEntry::PossibleStridedVectorize &&
- !E->ReorderIndices.empty()) {
- SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
- reorderScalars(VL, Mask);
- }
const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
@@ -10647,19 +10648,13 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
// ... (use %2)
// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
// br %block
- SmallVector<int> UniqueIdxs(VF, PoisonMaskElem);
- SmallSet<int, 4> UsedIdxs;
- int Pos = 0;
- for (int Idx : VE->ReuseShuffleIndices) {
- if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem &&
- UsedIdxs.insert(Idx).second)
- UniqueIdxs[Idx] = Pos;
- ++Pos;
+ SmallVector<int> Mask(VF, PoisonMaskElem);
+ for (auto [I, V] : enumerate(VL)) {
+ if (isa<PoisonValue>(V))
+ continue;
+ Mask[I] = VE->findLaneForValue(V);
}
- assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
- "less than original vector size.");
- UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem);
- V = FinalShuffle(V, UniqueIdxs);
+ V = FinalShuffle(V, Mask);
} else {
assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
"Expected vectorization factor less "
@@ -11157,6 +11152,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return Vec;
}
+ bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
bool IsSigned) {
if (V->getType() != VecTy)
@@ -11167,7 +11163,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
E->ReorderIndices.size());
ShuffleBuilder.add(V, Mask);
- } else if (E->State == TreeEntry::PossibleStridedVectorize) {
+ } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
ShuffleBuilder.addOrdered(V, std::nullopt);
} else {
ShuffleBuilder.addOrdered(V, E->ReorderIndices);
@@ -11177,7 +11173,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize) &&
"Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -11642,10 +11638,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+ } else if (E->State == TreeEntry::StridedVectorize) {
+ Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
+ Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
+ PO = IsReverseOrder ? PtrN : Ptr0;
+ std::optional<int> Diff = getPointersDiff(
+ VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
+ Type *StrideTy = DL->getIndexType(PO->getType());
+ int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
+ Value *StrideVal =
+ ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
+ DL->getTypeAllocSize(ScalarTy));
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+ auto *Inst = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load,
+ {VecTy, PO->getType(), StrideTy},
+ {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
+ Builder.getInt32(E->Scalars.size())});
+ Inst->addParamAttr(
+ /*ArgNo=*/0,
+ Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+ NewLI = Inst;
} else {
- assert((E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::PossibleStridedVectorize) &&
- "Unhandled state");
+ assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -12069,8 +12084,11 @@ Value *BoUpSLP::vectorizeTree(
[&](llvm::User *U) {
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
- UseEntry->State == TreeEntry::Vectorize &&
- E->State == TreeEntry::Vectorize &&
+ (UseEntry->State == TreeEntry::Vectorize ||
+ UseEntry->State ==
+ TreeEntry::StridedVectorize) &&
+ (E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::StridedVectorize) &&
doesInTreeUserNeedToExtract(
Scalar,
cast<Instruction>(UseEntry->Scalars.front()),
diff --git a/llvm/test/Analysis/BasicAA/vscale.ll b/llvm/test/Analysis/BasicAA/vscale.ll
index 3fff435463a6..05b9b6b3c3a9 100644
--- a/llvm/test/Analysis/BasicAA/vscale.ll
+++ b/llvm/test/Analysis/BasicAA/vscale.ll
@@ -309,6 +309,203 @@ define void @v1v2types(ptr %p) vscale_range(1,16) {
ret void
}
+; VScale intrinsic offset tests
+
+; CHECK-LABEL: vscale_neg_notscalable
+; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p
+; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <4 x i32>* %p, <4 x i32>* %vm16m16
+; CHECK-DAG: NoAlias: <4 x i32>* %vm16, <4 x i32>* %vm16m16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %vm16m16
+; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %p
+; CHECK-DAG: NoAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %m16pv16
+; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16m16
+define void @vscale_neg_notscalable(ptr %p) {
+ %v = call i64 @llvm.vscale.i64()
+ %vp = mul nsw i64 %v, 16
+ %vm = mul nsw i64 %v, -16
+ %vm16 = getelementptr i8, ptr %p, i64 %vm
+ %m16 = getelementptr <4 x i32>, ptr %p, i64 -1
+ %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 -1
+ %m16pv16 = getelementptr i8, ptr %m16, i64 %vp
+ load <4 x i32>, ptr %p
+ load <4 x i32>, ptr %vm16
+ load <4 x i32>, ptr %m16
+ load <4 x i32>, ptr %vm16m16
+ load <4 x i32>, ptr %m16pv16
+ ret void
+}
+
+; CHECK-LABEL: vscale_neg_scalable
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %p
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vm16m16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %vm16, <vscale x 4 x i32>* %vm16m16
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %vm16m16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16pv16, <vscale x 4 x i32>* %p
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16pv16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %m16pv16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16pv16, <vscale x 4 x i32>* %vm16m16
+define void @vscale_neg_scalable(ptr %p) {
+ %v = call i64 @llvm.vscale.i64()
+ %vp = mul nsw i64 %v, 16
+ %vm = mul nsw i64 %v, -16
+ %vm16 = getelementptr i8, ptr %p, i64 %vm
+ %m16 = getelementptr <4 x i32>, ptr %p, i64 -1
+ %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 -1
+ %m16pv16 = getelementptr i8, ptr %m16, i64 %vp
+ load <vscale x 4 x i32>, ptr %p
+ load <vscale x 4 x i32>, ptr %vm16
+ load <vscale x 4 x i32>, ptr %m16
+ load <vscale x 4 x i32>, ptr %vm16m16
+ load <vscale x 4 x i32>, ptr %m16pv16
+ ret void
+}
+
+; CHECK-LABEL: vscale_pos_notscalable
+; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p
+; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <4 x i32>* %p, <4 x i32>* %vm16m16
+; CHECK-DAG: NoAlias: <4 x i32>* %vm16, <4 x i32>* %vm16m16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %vm16m16
+; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %p
+; CHECK-DAG: NoAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %m16pv16
+; CHECK-DAG: MayAlias: <4 x i32>* %m16pv16, <4 x i32>* %vm16m16
+define void @vscale_pos_notscalable(ptr %p) {
+ %v = call i64 @llvm.vscale.i64()
+ %vp = mul nsw i64 %v, 16
+ %vm = mul nsw i64 %v, -16
+ %vm16 = getelementptr i8, ptr %p, i64 %vp
+ %m16 = getelementptr <4 x i32>, ptr %p, i64 1
+ %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 1
+ %m16pv16 = getelementptr i8, ptr %m16, i64 %vm
+ load <4 x i32>, ptr %p
+ load <4 x i32>, ptr %vm16
+ load <4 x i32>, ptr %m16
+ load <4 x i32>, ptr %vm16m16
+ load <4 x i32>, ptr %m16pv16
+ ret void
+}
+
+; CHECK-LABEL: vscale_pos_scalable
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %p
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vm16m16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %vm16, <vscale x 4 x i32>* %vm16m16
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %vm16m16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16pv16, <vscale x 4 x i32>* %p
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16pv16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %m16pv16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16pv16, <vscale x 4 x i32>* %vm16m16
+define void @vscale_pos_scalable(ptr %p) {
+ %v = call i64 @llvm.vscale.i64()
+ %vp = mul nsw i64 %v, 16
+ %vm = mul nsw i64 %v, -16
+ %vm16 = getelementptr i8, ptr %p, i64 %vp
+ %m16 = getelementptr <4 x i32>, ptr %p, i64 1
+ %vm16m16 = getelementptr <4 x i32>, ptr %vm16, i64 1
+ %m16pv16 = getelementptr i8, ptr %m16, i64 %vm
+ load <vscale x 4 x i32>, ptr %p
+ load <vscale x 4 x i32>, ptr %vm16
+ load <vscale x 4 x i32>, ptr %m16
+ load <vscale x 4 x i32>, ptr %vm16m16
+ load <vscale x 4 x i32>, ptr %m16pv16
+ ret void
+}
+
+; CHECK-LABEL: vscale_v1v2types
+; CHECK-DAG: MustAlias: <4 x i32>* %p, <vscale x 4 x i32>* %p
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %p, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %p, <4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %p, <4 x i32>* %vm16
+; CHECK-DAG: MustAlias: <4 x i32>* %vm16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %p
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <4 x i32>* %p
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <4 x i32>* %vm16
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <vscale x 4 x i32>* %p
+; CHECK-DAG: NoAlias: <4 x i32>* %m16, <4 x i32>* %p
+; CHECK-DAG: MayAlias: <4 x i32>* %m16, <vscale x 4 x i32>* %vm16
+; CHECK-DAG: MayAlias: <4 x i32>* %m16, <4 x i32>* %vm16
+; CHECK-DAG: MustAlias: <4 x i32>* %m16, <vscale x 4 x i32>* %m16
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vp16
+; CHECK-DAG: NoAlias: <4 x i32>* %p, <vscale x 4 x i32>* %vp16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %vm16, <vscale x 4 x i32>* %vp16
+; CHECK-DAG: MayAlias: <4 x i32>* %vm16, <vscale x 4 x i32>* %vp16
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %m16, <vscale x 4 x i32>* %vp16
+; CHECK-DAG: MayAlias: <4 x i32>* %m16, <vscale x 4 x i32>* %vp16
+define void @vscale_v1v2types(ptr %p) {
+ %v = call i64 @llvm.vscale.i64()
+ %vp = mul nsw i64 %v, 16
+ %vm = mul nsw i64 %v, -16
+ %vp16 = getelementptr i8, ptr %p, i64 %vp
+ %vm16 = getelementptr i8, ptr %p, i64 %vm
+ %m16 = getelementptr <4 x i32>, ptr %p, i64 -1
+ load <vscale x 4 x i32>, ptr %p
+ load <4 x i32>, ptr %p
+ load <vscale x 4 x i32>, ptr %vm16
+ load <4 x i32>, ptr %vm16
+ load <vscale x 4 x i32>, ptr %m16
+ load <4 x i32>, ptr %m16
+ load <vscale x 4 x i32>, ptr %vp16
+ ret void
+}
+
+; CHECK-LABEL: vscale_negativescale
+; CHECK-DAG: MayAlias: <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %vm16
+define void @vscale_negativescale(ptr %p) vscale_range(1,16) {
+ %v = call i64 @llvm.vscale.i64()
+ %vm = mul nsw i64 %v, -15
+ %vm16 = getelementptr i8, ptr %p, i64 %vm
+ load <vscale x 4 x i32>, ptr %vm16
+ load <vscale x 4 x i32>, ptr %p
+ ret void
+}
+
+; CHECK-LABEL: onevscale
+; CHECK-DAG: MustAlias: <vscale x 4 x i32>* %vp161, <vscale x 4 x i32>* %vp162
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %vp161, <vscale x 4 x i32>* %vp161b
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %vp161b, <vscale x 4 x i32>* %vp162
+define void @onevscale(ptr %p) vscale_range(1,16) {
+ %v1 = call i64 @llvm.vscale.i64()
+ %vp1 = mul nsw i64 %v1, 16
+ %vp2 = mul nsw i64 %v1, 16
+ %vp3 = mul nsw i64 %v1, 17
+ %vp161 = getelementptr i8, ptr %p, i64 %vp1
+ %vp162 = getelementptr i8, ptr %p, i64 %vp2
+ %vp161b = getelementptr i8, ptr %vp161, i64 %vp3
+ load <vscale x 4 x i32>, ptr %vp161
+ load <vscale x 4 x i32>, ptr %vp162
+ load <vscale x 4 x i32>, ptr %vp161b
+ ret void
+}
+
+; CHECK-LABEL: twovscales
+; CHECK-DAG: MustAlias: <vscale x 4 x i32>* %vp161, <vscale x 4 x i32>* %vp162
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %vp161, <vscale x 4 x i32>* %vp161b
+; CHECK-DAG: NoAlias: <vscale x 4 x i32>* %vp161b, <vscale x 4 x i32>* %vp162
+define void @twovscales(ptr %p) vscale_range(1,16) {
+ %v1 = call i64 @llvm.vscale.i64()
+ %v2 = call i64 @llvm.vscale.i64()
+ %vp1 = mul nsw i64 %v1, 16
+ %vp2 = mul nsw i64 %v2, 16
+ %vp3 = mul nsw i64 %v1, 17
+ %vp161 = getelementptr i8, ptr %p, i64 %vp1
+ %vp162 = getelementptr i8, ptr %p, i64 %vp2
+ %vp161b = getelementptr i8, ptr %vp161, i64 %vp3
+ load <vscale x 4 x i32>, ptr %vp161
+ load <vscale x 4 x i32>, ptr %vp162
+ load <vscale x 4 x i32>, ptr %vp161b
+ ret void
+}
+
; getelementptr recursion
; CHECK-LABEL: gep_recursion_level_1
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 5dd37e83f479..0cd444f84985 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi -mattr=+sve -force-streaming-compatible-sve %s | FileCheck --check-prefixes=SVE,SVE128-NO-NEON %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-256 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi -mattr=+sve -aarch64-sve-vector-bits-min=2048 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-2048 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible-sve %s | FileCheck --check-prefixes=SVE,SVE128-NO-NEON %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-256 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=2048 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-2048 %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
index a35242496848..e4e29143985b 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-none-linux-gnueabi -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
index 737437282d9b..954a836e44f2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -59,19 +59,19 @@ define void @fast_fp_reductions() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -82,19 +82,19 @@ define void @fast_fp_reductions() {
; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -105,19 +105,19 @@ define void @fast_fp_reductions() {
; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; BF16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
index dcca11887141..cf03026ceb75 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll
@@ -165,11 +165,11 @@ define void @reduce_fmin16() {
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-F16-LABEL: 'reduce_fmin16'
@@ -199,11 +199,11 @@ define void @reduce_fmax16() {
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef)
-; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef)
+; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef)
; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-F16-LABEL: 'reduce_fmax16'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-codesize.ll
index 7e897f41ee45..9a86fbe105ec 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-codesize.ll
@@ -2,15 +2,15 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s -check-prefixes=SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+ssse3 | FileCheck %s -check-prefixes=SSE,SSSE3
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
;
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,GLM
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
;
; Verify the cost model for extract_subvector style shuffles.
@@ -38,7 +38,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -90,7 +90,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
@@ -102,7 +102,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
@@ -238,9 +238,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -262,9 +262,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -297,9 +297,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -321,9 +321,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -356,9 +356,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -380,9 +380,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -397,123 +397,300 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi16'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX1-LABEL: 'test_vXi16'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX2-LABEL: 'test_vXi16'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi16'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi16'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi16'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'test_vXi16'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
@@ -533,9 +710,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -557,9 +734,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -592,9 +769,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -616,9 +793,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -709,9 +886,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -733,9 +910,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -828,9 +1005,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -852,9 +1029,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -947,9 +1124,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -971,9 +1148,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1050,243 +1227,600 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi8'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX1-LABEL: 'test_vXi8'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX2-LABEL: 'test_vXi8'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi8'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi8'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi8'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'test_vXi8'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
@@ -1304,9 +1838,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1328,9 +1862,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1423,9 +1957,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1447,9 +1981,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
index 94e26227539b..393dec82428b 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
@@ -2,15 +2,15 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s -check-prefixes=SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+ssse3 | FileCheck %s -check-prefixes=SSE,SSSE3
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
;
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,GLM
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
;
; Verify the cost model for extract_subector style shuffles.
@@ -38,7 +38,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -90,7 +90,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
@@ -102,7 +102,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
@@ -238,9 +238,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -262,9 +262,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -297,9 +297,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -321,9 +321,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -356,9 +356,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -380,9 +380,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -397,123 +397,300 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi16'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX1-LABEL: 'test_vXi16'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX2-LABEL: 'test_vXi16'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi16'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi16'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi16'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'test_vXi16'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
@@ -533,9 +710,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -557,9 +734,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -592,9 +769,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -616,9 +793,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -709,9 +886,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -733,9 +910,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -828,9 +1005,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -852,9 +1029,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -947,9 +1124,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -971,9 +1148,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1050,243 +1227,600 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi8'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX1-LABEL: 'test_vXi8'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX2-LABEL: 'test_vXi8'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi8'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi8'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi8'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'test_vXi8'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
@@ -1304,9 +1838,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1328,9 +1862,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1423,9 +1957,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1447,9 +1981,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-sizelatency.ll
index 787a96f3bfde..63bb07bf4fd8 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-sizelatency.ll
@@ -2,15 +2,15 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s -check-prefixes=SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+ssse3 | FileCheck %s -check-prefixes=SSE,SSSE3
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
;
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,GLM
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
;
; Verify the cost model for extract_subector style shuffles.
@@ -38,7 +38,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -90,7 +90,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
@@ -102,7 +102,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
@@ -238,9 +238,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -262,9 +262,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -297,9 +297,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -321,9 +321,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -356,9 +356,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -380,9 +380,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -397,123 +397,300 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi16'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX1-LABEL: 'test_vXi16'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX2-LABEL: 'test_vXi16'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi16'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi16'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi16'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'test_vXi16'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
@@ -533,9 +710,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -557,9 +734,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -592,9 +769,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -616,9 +793,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -709,9 +886,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -733,9 +910,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -828,9 +1005,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -852,9 +1029,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -947,9 +1124,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -971,9 +1148,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1050,243 +1227,600 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi8'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX1-LABEL: 'test_vXi8'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX2-LABEL: 'test_vXi8'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi8'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi8'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi8'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'test_vXi8'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
@@ -1304,9 +1838,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1328,9 +1862,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1423,9 +1957,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1447,9 +1981,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
index 58e99c9ed652..b521a7594846 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -2,15 +2,15 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s -check-prefixes=SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s -check-prefixes=SSE,SSSE3
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s -check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
;
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,GLM
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
;
; Verify the cost model for extract_subector style shuffles.
@@ -38,7 +38,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 poison>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -90,7 +90,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -102,7 +102,7 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -238,9 +238,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -262,9 +262,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -297,9 +297,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -321,9 +321,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -356,9 +356,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -380,9 +380,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -397,123 +397,300 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi16'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-LABEL: 'test_vXi16'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-LABEL: 'test_vXi16'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi16'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi16'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi16'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'test_vXi16'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
@@ -533,9 +710,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -557,9 +734,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -592,9 +769,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -616,9 +793,9 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256,
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -709,9 +886,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -733,9 +910,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -828,9 +1005,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -852,9 +1029,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -947,9 +1124,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -971,9 +1148,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1050,243 +1227,600 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX-LABEL: 'test_vXi8'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-LABEL: 'test_vXi8'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX512-LABEL: 'test_vXi8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-LABEL: 'test_vXi8'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi8'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi8'
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi8'
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 0, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 4, i32 5>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 8, i32 9>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 12, i32 13>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 16, i32 17>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 20, i32 21>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 24, i32 25>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 28, i32 29>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 32, i32 33>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 36, i32 37>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 40, i32 41>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 44, i32 45>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 48, i32 49>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 52, i32 53>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 56, i32 57>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 60, i32 61>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> <i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512VBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'test_vXi8'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
@@ -1304,9 +1838,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1328,9 +1862,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -1423,9 +1957,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1447,9 +1981,9 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
index c268cc55880c..d4287612399b 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
@@ -1,7 +1,7 @@
; RUN: opt -passes='print<access-info>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnueabi"
+target triple = "aarch64"
; 3 reads and 3 writes should need 12 memchecks
; CHECK: function 'testf':
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll
index 86395eea96f5..1496e1b0be82 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll
@@ -12,7 +12,7 @@
; }
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnueabi"
+target triple = "aarch64"
; CHECK: function 'f':
; CHECK: (Low: (20000 + %a)<nuw> High: (60004 + %a))
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index 0e9c126e97a3..458c2cb76d9e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -2146,8 +2146,7 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) {
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16
; CHECK-OUTLINE-O1-NEXT: mov x2, x0
-; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff
-; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1
+; CHECK-OUTLINE-O1-NEXT: mvn w0, w1
; CHECK-OUTLINE-O1-NEXT: mov x1, x2
; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr1_rel
; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
@@ -3202,8 +3201,7 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) {
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16
; CHECK-OUTLINE-O1-NEXT: mov x2, x0
-; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff
-; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1
+; CHECK-OUTLINE-O1-NEXT: mvn w0, w1
; CHECK-OUTLINE-O1-NEXT: mov x1, x2
; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr2_rel
; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
@@ -4255,8 +4253,7 @@ define i32 @atomicrmw_and_i32(ptr %ptr, i32 %rhs) {
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16
; CHECK-OUTLINE-O1-NEXT: mov x2, x0
-; CHECK-OUTLINE-O1-NEXT: mov w8, #-1 ; =0xffffffff
-; CHECK-OUTLINE-O1-NEXT: eor w0, w8, w1
+; CHECK-OUTLINE-O1-NEXT: mvn w0, w1
; CHECK-OUTLINE-O1-NEXT: mov x1, x2
; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr4_rel
; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
@@ -5276,8 +5273,7 @@ define i64 @atomicrmw_and_i64(ptr %ptr, i64 %rhs) {
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w30, -8
; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16
; CHECK-OUTLINE-O1-NEXT: mov x2, x0
-; CHECK-OUTLINE-O1-NEXT: mov x8, #-1 ; =0xffffffffffffffff
-; CHECK-OUTLINE-O1-NEXT: eor x0, x8, x1
+; CHECK-OUTLINE-O1-NEXT: mvn x0, x1
; CHECK-OUTLINE-O1-NEXT: mov x1, x2
; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_ldclr8_rel
; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
index 16b780a83973..661265173ae8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
@@ -529,3 +529,27 @@ body: |
RET_ReallyLR implicit $q0
...
+
+---
+name: pr81244
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $d0
+ ; CHECK-LABEL: name: pr81244
+ ; CHECK: liveins: $d0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[COPY]](<2 x s32>)
+ ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s8>), [[TRUNC]](<2 x s8>)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS]](<4 x s8>)
+ ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<4 x s16>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $d0
+ %0:_(<2 x s32>) = COPY $d0
+ %1:_(<2 x s8>) = G_TRUNC %0(<2 x s32>)
+ %2:_(<4 x s8>) = G_CONCAT_VECTORS %1(<2 x s8>), %1(<2 x s8>)
+ %3:_(<4 x s16>) = G_ANYEXT %2(<4 x s8>)
+ $d0 = COPY %3(<4 x s16>)
+ RET_ReallyLR implicit $d0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index c90c31aa27ef..aaf2fef95459 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -163,6 +163,9 @@
# DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_READSTEADYCOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_LOAD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/Redundantstore.ll b/llvm/test/CodeGen/AArch64/Redundantstore.ll
index 6fec5573fdcb..229d6446c4e0 100644
--- a/llvm/test/CodeGen/AArch64/Redundantstore.ll
+++ b/llvm/test/CodeGen/AArch64/Redundantstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=aarch64-eabi | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=aarch64 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@end_of_array = common global ptr null, align 8
diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index 122e187637c3..f2ed57ead074 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -6,8 +6,8 @@
; The following tests use the balance-fp-ops feature, and should be independent of
; the target cpu.
-; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
-; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+; RUN: llc < %s -mtriple=aarch64 -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mtriple=aarch64 -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
; our test strategy is to:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 5b78b0d47731..b77d59134765 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG
-; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64-eabi -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG
+; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
; Function Attrs: nounwind readnone
declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll b/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll
index c2a3acbedc8b..578038b8f4da 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -5,7 +5,7 @@
; RUN: llc -O3 -aarch64-enable-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s 2>&1 | FileCheck --check-prefix=CHECK-UseAA %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-linux-gnueabi"
+target triple = "aarch64"
; Following test cases test enabling SeparateConstOffsetFromGEP pass in AArch64
; backend. If useAA() returns true, it will lower a GEP with multiple indices
diff --git a/llvm/test/CodeGen/AArch64/aarch64-tbz.ll b/llvm/test/CodeGen/AArch64/aarch64-tbz.ll
index 28629a8c2f0d..4cf36337f9a2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-tbz.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-tbz.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnueabi < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnueabi -cgp-verify-bfi-updates=true < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 -cgp-verify-bfi-updates=true < %s | FileCheck %s
; CHECK-LABEL: test1
; CHECK: tbz {{w[0-9]}}, #3, {{.LBB0_3}}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-unroll-and-jam.ll b/llvm/test/CodeGen/AArch64/aarch64-unroll-and-jam.ll
index af5f6a9d6924..285c16a0b354 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-unroll-and-jam.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-unroll-and-jam.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='loop-unroll-and-jam' < %s -mcpu=cortex-a55 -mtriple=aarch64-none-linux-eabi -S | FileCheck %s
+; RUN: opt -passes='loop-unroll-and-jam' < %s -mcpu=cortex-a55 -mtriple=aarch64 -S | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll b/llvm/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll
index 463084e6fe6a..50f7d6d7ce9d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-linux-eabi -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -o - | FileCheck %s
%struct.a= type { i64, i64, i64, i64 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
index 68c56d765cbb..e4fbf0765dcd 100644
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+fullfp16,+bf16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16 | FileCheck %s
; Check that building a vector from floats doesn't insert an unnecessary
; copy for lane zero.
diff --git a/llvm/test/CodeGen/AArch64/arm64-movi.ll b/llvm/test/CodeGen/AArch64/arm64-movi.ll
index 8ec98b744297..c9074c2adbe3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-movi.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-movi.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
;==--------------------------------------------------------------------------==
; Tests for MOV-immediate implemented with ORR-immediate.
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 599fac8c8034..f5ce73a36612 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
+; RUN: llc < %s -mtriple=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
+; RUN: llc < %s -mtriple=aarch64 -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
; CHECK-LABEL: cnt32_advsimd:
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index 5f61d9019ab4..f548a0e01fee 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define i32 @test_rev_w(i32 %a) nounwind {
; CHECK-LABEL: test_rev_w:
diff --git a/llvm/test/CodeGen/AArch64/asm-large-immediate.ll b/llvm/test/CodeGen/AArch64/asm-large-immediate.ll
index 83690716a9e2..b45f22cc7073 100644
--- a/llvm/test/CodeGen/AArch64/asm-large-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/asm-large-immediate.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -no-integrated-as | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -no-integrated-as | FileCheck %s
define void @test() {
entry:
diff --git a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
index cf0b43807f4c..d59de3c56f4e 100644
--- a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.6a,+neon < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.6a,+neon,+bf16 < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.6a,+neon,+fullfp16,+bf16 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+bf16 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+fullfp16,+bf16 < %s | FileCheck %s
%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }
diff --git a/llvm/test/CodeGen/AArch64/bf16.ll b/llvm/test/CodeGen/AArch64/bf16.ll
index 7a171c6702e1..d3911ae4c033 100644
--- a/llvm/test/CodeGen/AArch64/bf16.ll
+++ b/llvm/test/CodeGen/AArch64/bf16.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -asm-verbose=0 -mtriple=arm64-eabi -mattr=+bf16 | FileCheck %s
-; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-eabi -mattr=+bf16 | FileCheck %s
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64 -mattr=+bf16 | FileCheck %s
; test argument passing and simple load/store
diff --git a/llvm/test/CodeGen/AArch64/bitreverse.ll b/llvm/test/CodeGen/AArch64/bitreverse.ll
index be9f5b82c6c0..a6d3683a92a8 100644
--- a/llvm/test/CodeGen/AArch64/bitreverse.ll
+++ b/llvm/test/CodeGen/AArch64/bitreverse.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
; These tests just check that the plumbing is in place for @llvm.bitreverse.
diff --git a/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll b/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll
index de543f4e4d94..833a6d5b1d1d 100644
--- a/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll
+++ b/llvm/test/CodeGen/AArch64/branch-target-enforcement-indirect-calls.ll
@@ -26,3 +26,68 @@ entry:
; CHECK: br {{x16|x17}}
ret void
}
+define void @bti_enabled_force_x10(ptr %p) "branch-target-enforcement"="true" {
+entry:
+ %p_x10 = tail call ptr asm "", "={x10},{x10},~{lr}"(ptr %p)
+ tail call void %p_x10()
+; CHECK: br {{x16|x17}}
+ ret void
+}
+
+; sign-return-address places no further restrictions on the tail-call register.
+
+define void @bti_enabled_pac_enabled(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" {
+entry:
+ tail call void %p()
+; CHECK: br {{x16|x17}}
+ ret void
+}
+define void @bti_enabled_pac_enabled_force_x10(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" {
+entry:
+ %p_x10 = tail call ptr asm "", "={x10},{x10},~{lr}"(ptr %p)
+ tail call void %p_x10()
+; CHECK: br {{x16|x17}}
+ ret void
+}
+
+; PAuthLR needs to use x16 to hold the address of the signing instruction. That
+; can't be changed because the hint instruction only uses that register, so the
+; only choice for the tail-call function pointer is x17.
+
+define void @bti_enabled_pac_pc_enabled(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" "branch-protection-pauth-lr"="true" {
+entry:
+ tail call void %p()
+; CHECK: br x17
+ ret void
+}
+define void @bti_enabled_pac_pc_enabled_force_x16(ptr %p) "branch-target-enforcement"="true" "sign-return-address"="all" "branch-protection-pauth-lr"="true" {
+entry:
+ %p_x16 = tail call ptr asm "", "={x16},{x16},~{lr}"(ptr %p)
+ tail call void %p_x16()
+; CHECK: br x17
+ ret void
+}
+
+; PAuthLR by itself prevents x16 from being used, but any other
+; non-callee-saved register can be used.
+
+define void @pac_pc_enabled(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" {
+entry:
+ tail call void %p()
+; CHECK: br {{(x[0-9]|x1[0-578])$}}
+ ret void
+}
+define void @pac_pc_enabled_force_x16(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" {
+entry:
+ %p_x16 = tail call ptr asm "", "={x16},{x16},~{lr}"(ptr %p)
+ tail call void %p_x16()
+; CHECK: br {{(x[0-9]|x1[0-578])$}}
+ ret void
+}
+define void @pac_pc_enabled_force_x17(ptr %p) "sign-return-address"="all" "branch-protection-pauth-lr"="true" {
+entry:
+ %p_x17 = tail call ptr asm "", "={x17},{x17},~{lr}"(ptr %p)
+ tail call void %p_x17()
+; CHECK: br x17
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/cmpwithshort.ll b/llvm/test/CodeGen/AArch64/cmpwithshort.ll
index 8dbfdae5df90..3dbf64a2f6f1 100644
--- a/llvm/test/CodeGen/AArch64/cmpwithshort.ll
+++ b/llvm/test/CodeGen/AArch64/cmpwithshort.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=aarch64-eabi | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=aarch64 | FileCheck %s
define i16 @test_1cmp_signed_1(ptr %ptr1) {
; CHECK-LABEL: @test_1cmp_signed_1
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
index 6f4f8d3ca2d3..001046f8f397 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to not transform as the type's minimum size is less than 128 bits.
define <vscale x 4 x i16> @complex_add_v4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
index b0a3e46c96c4..1b8a21b66ade 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to not transform as the type's minimum size is less than 128 bits.
define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
index 3118d8669dc9..1ce480bbf3d8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to transform
define <vscale x 4 x i32> @complex_add_v4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
index 256ed10cad07..d88eef9800d7 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to transform
define <vscale x 4 x i32> @complex_mul_v4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
index d9ec5fcd3bdc..0b59be9414fa 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to transform
define <vscale x 2 x i64> @complex_add_v2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
index 2dec03b6f979..16e1f3e63dce 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to transform
define <vscale x 2 x i64> @complex_mul_v2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
index e7ebd07fd731..b631486137e6 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; Expected to not transform as the type's minimum size is less than 128 bits.
define <vscale x 8 x i8> @complex_add_v8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index 0cbe2f46088e..2627f2a4fb5e 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; a[i] * b[i] * (11.0 + 3.0.i);
;
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
index 8de2ac5a140c..ad9240b0922b 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+complxnum -o - | FileCheck %s
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
; a[i] * b[i] * (11.0 + 3.0.i);
diff --git a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
index dc00c41892ba..87a315a49f3e 100644
--- a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
+++ b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -debugify-and-strip-all-safe < %s -O3 -mtriple=aarch64-eabi -verify-machineinstrs | FileCheck %s
+; RUN: llc -debugify-and-strip-all-safe < %s -O3 -mtriple=aarch64 -verify-machineinstrs | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-linaro-linux-gnueabi"
+target triple = "aarch64"
; CMN is an alias of ADDS.
diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
index d109dade67f5..031ee353c0e8 100644
--- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll
+++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-unknown-linuxeabi -consthoist-gep %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 -consthoist-gep %s -o - | FileCheck %s
%struct.blam = type { %struct.bar, %struct.bar.0, %struct.wobble, %struct.wombat, i8, i16, %struct.snork.2, %struct.foo, %struct.snork.3, %struct.wobble.4, %struct.quux, [9 x i16], %struct.spam, %struct.zot }
%struct.bar = type { i8, i8, %struct.snork }
diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll
index b24866064efa..7b45d0f30bcd 100644
--- a/llvm/test/CodeGen/AArch64/cpus.ll
+++ b/llvm/test/CodeGen/AArch64/cpus.ll
@@ -37,6 +37,7 @@
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1a 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1b 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll
new file mode 100644
index 000000000000..d9d0d982fd9e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll
@@ -0,0 +1,72 @@
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+; RUN: llc --try-experimental-debuginfo-iterators -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+
+;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed
+;; and 'y' has an implicit location range as well as stack location range
+;; (according to the hand-modified debug info -- see the dbg.value).
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-android24"
+
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_LLVM_tag_offset (0x80)
+; CHECK-NEXT: DW_AT_name ("y")
+
+define dso_local void @f() !dbg !14 {
+ %1 = alloca i32, align 4, !DIAssignID !31
+ %2 = alloca i32, align 4, !DIAssignID !32
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22
+ call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22
+ call void @use(ptr null), !dbg !28
+ store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33
+ call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22
+ call void @use(ptr nonnull %1), !dbg !28
+ call void @use(ptr nonnull %2), !dbg !29
+ ret void, !dbg !30
+}
+
+declare !dbg !5 void @use(ptr)
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9, !10, !11, !12, !34}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None)
+!1 = !DIFile(filename: "dbg.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{!4, !5}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !4}
+!8 = !{i32 7, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
+!11 = !{i32 7, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{!"clang version 10.0.0"}
+!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = !{!18, !20}
+!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19)
+!21 = !DILocation(line: 5, column: 3, scope: !14)
+!22 = !DILocation(line: 0, scope: !14)
+!23 = !DILocation(line: 5, column: 10, scope: !14)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"int", !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C++ TBAA"}
+!28 = !DILocation(line: 6, column: 3, scope: !14)
+!29 = !DILocation(line: 7, column: 3, scope: !14)
+!30 = !DILocation(line: 8, column: 1, scope: !14)
+!31 = distinct !DIAssignID()
+!32 = distinct !DIAssignID()
+!33 = distinct !DIAssignID()
+!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll
new file mode 100644
index 000000000000..155e6107158a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll
@@ -0,0 +1,76 @@
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+; RUN: llc --try-experimental-debuginfo-iterators -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+
+;; Copied from dbg-value-tag-offset.ll. Check that variables with locations
+;; tracked with dbg.assigns with DW_OP_LLVM_TAG_offset operators in their
+;; expressions get a DW_AT_LLVM_tag_offset attribute on their DIE.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-android24"
+
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_LLVM_tag_offset (0x00)
+; CHECK-NEXT: DW_AT_name ("x")
+
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_LLVM_tag_offset (0x80)
+; CHECK-NEXT: DW_AT_name ("y")
+
+define dso_local void @f() !dbg !14 {
+ %1 = alloca i32, align 4, !DIAssignID !31
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22
+ %2 = alloca i32, align 4, !DIAssignID !32
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22
+ store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33
+ call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22
+ call void @use(ptr nonnull %1), !dbg !28
+ call void @use(ptr nonnull %2), !dbg !29
+ ret void, !dbg !30
+}
+
+declare !dbg !5 void @use(ptr)
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9, !10, !11, !12, !34}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None)
+!1 = !DIFile(filename: "dbg.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{!4, !5}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !4}
+!8 = !{i32 7, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
+!11 = !{i32 7, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{!"clang version 10.0.0"}
+!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = !{!18, !20}
+!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19)
+!21 = !DILocation(line: 5, column: 3, scope: !14)
+!22 = !DILocation(line: 0, scope: !14)
+!23 = !DILocation(line: 5, column: 10, scope: !14)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"int", !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C++ TBAA"}
+!28 = !DILocation(line: 6, column: 3, scope: !14)
+!29 = !DILocation(line: 7, column: 3, scope: !14)
+!30 = !DILocation(line: 8, column: 1, scope: !14)
+!31 = distinct !DIAssignID()
+!32 = distinct !DIAssignID()
+!33 = distinct !DIAssignID()
+!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 849fc7aa00a8..1f68c77611e1 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s
define <4 x i16> @normal_load_v4i8(ptr %p) {
; CHECK-LABEL: normal_load_v4i8:
diff --git a/llvm/test/CodeGen/AArch64/extract-sext-zext.ll b/llvm/test/CodeGen/AArch64/extract-sext-zext.ll
index f566ebb4f208..ecb76d9320a2 100644
--- a/llvm/test/CodeGen/AArch64/extract-sext-zext.ll
+++ b/llvm/test/CodeGen/AArch64/extract-sext-zext.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
-; RUN: llc -mtriple=aarch64-eabi -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GLOBAL
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GLOBAL
define i64 @extract_v2i64(<2 x i64> %x, i32 %y) {
; CHECK-ISEL-LABEL: extract_v2i64:
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index c56c6a07357d..5462bc65fd34 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @fabs_f64(double %a) {
; CHECK-LABEL: fabs_f64:
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index c02e02d9257e..31389f5a77d6 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @fadd_f64(double %a, double %b) {
; CHECK-LABEL: fadd_f64:
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 29138ba59cfe..0f02784aaf32 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @f64_double(double %a, double %b, double %d, double %e) {
; CHECK-LABEL: f64_double:
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 4abd115da21c..a1c48bd943e2 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @copysign_f64(double %a, double %b) {
; CHECK-SD-LABEL: copysign_f64:
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index ce38bebf7635..584174d6d064 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @ceil_f64(double %a) {
; CHECK-LABEL: ceil_f64:
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index b7a645bfb546..fa87c4fa2d16 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @fdiv_f64(double %a, double %b) {
; CHECK-LABEL: fdiv_f64:
diff --git a/llvm/test/CodeGen/AArch64/fence-singlethread.ll b/llvm/test/CodeGen/AArch64/fence-singlethread.ll
index f36d28926acb..eb77daa25241 100644
--- a/llvm/test/CodeGen/AArch64/fence-singlethread.ll
+++ b/llvm/test/CodeGen/AArch64/fence-singlethread.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=LINUX
; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s --check-prefix=IOS
; RUN: llc -mtriple=aarch64-apple-ios %s -o - -global-isel | FileCheck %s --check-prefix=IOS
-; RUN: llc -mtriple=aarch64-linux-gnueabihf %s -filetype=obj -o %t
+; RUN: llc -mtriple=aarch64 %s -filetype=obj -o %t
; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ
; OBJ-NOT: dmb
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 2848a6bde320..e3c0ced79f07 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @exp_f64(double %a) {
; CHECK-LABEL: exp_f64:
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 217e4e40a779..f0e946c13998 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-NOFP16-SD
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-FP16-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-NOFP16-GI
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-FP16-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-NOFP16-SD
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-FP16-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-NOFP16-GI
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-FP16-GI
define double @min_f64(double %a, double %b) {
; CHECK-LABEL: min_f64:
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 1b92c462af14..cdf9973b49f4 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-NOFP16-SD
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-FP16-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-NOFP16-GI
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-FP16-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-NOFP16-SD
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-FP16-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-NOFP16-GI
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-FP16-GI
define double @min_f64(double %a, double %b) {
; CHECK-LABEL: min_f64:
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 3ae2158a1886..339ade5fc707 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @fma_f64(double %a, double %b, double %c) {
; CHECK-LABEL: fma_f64:
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 244c73bba2e9..fe84fe1f317a 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @fmul_f64(double %a, double %b) {
; CHECK-LABEL: fmul_f64:
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index fd427687c7a7..7805512cbf45 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @fabs_f64(double %a) {
; CHECK-LABEL: fabs_f64:
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index f68691ac15c6..e9fbaf6f046d 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare half @llvm.aarch64.neon.fmulx.f16(half, half)
declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
index 62284ca56126..40d2d636b94b 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half)
declare i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half)
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
index 111ddfeab705..36795f86e006 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare half @llvm.aarch64.sisd.fabd.f16(half, half)
declare half @llvm.aarch64.neon.fmax.f16(half, half)
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll
index ba8e616eeeb0..9a52e214d82d 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_3op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+neon,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+neon,+fullfp16 | FileCheck %s
define dso_local half @t_vfmah_f16(half %a, half %b, half %c) {
; CHECK-LABEL: t_vfmah_f16:
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
index becbbdd45434..58cbc2953dbc 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_2op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_2op.ll
index 1674d8627920..e29919a8a9d4 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_2op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_2op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_3op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_3op.ll
index c8a33a6cf5c2..8d52d2a428d0 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_3op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_3op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index 9635b888f0da..db1105d613cd 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @fpext_f32_f64(float %a) {
; CHECK-LABEL: fpext_f32_f64:
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index a55c0dbffae2..1dd5450c271c 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @pow_f64(double %a, double %b) {
; CHECK-LABEL: pow_f64:
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 677d2e0416ae..b496c7d15eef 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @powi_f64(double %a, i32 %b) {
; CHECK-LABEL: powi_f64:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 23ba85d54c7a..251719c1e3b4 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
; CHECK-GI-FP16: warning: Instruction selection used fallback path for fptos_v2f16_v2i16
; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for fptou_v2f16_v2i16
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 813fa0396e78..9425988af834 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define float @fptrunc_f64_f32(double %a) {
; CHECK-LABEL: fptrunc_f64_f32:
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index eb26128d6469..03caf0a33eb4 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @frem_f64(double %a, double %b) {
; CHECK-LABEL: frem_f64:
diff --git a/llvm/test/CodeGen/AArch64/frintn.ll b/llvm/test/CodeGen/AArch64/frintn.ll
index 2dc03db39a1a..782ba1d2dc78 100644
--- a/llvm/test/CodeGen/AArch64/frintn.ll
+++ b/llvm/test/CodeGen/AArch64/frintn.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-eabi -mattr=+fullfp16 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 %s -o - | FileCheck %s
; The llvm.aarch64.neon.frintn intrinsic should be auto-upgraded to the
; target-independent roundeven intrinsic.
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index aef0b2e29243..2c76d969d6ef 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define double @sin_f64(double %a) {
; CHECK-LABEL: sin_f64:
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 76930e76399f..683544a69ebe 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define double @sqrt_f64(double %a) {
; CHECK-LABEL: sqrt_f64:
diff --git a/llvm/test/CodeGen/AArch64/hints.ll b/llvm/test/CodeGen/AArch64/hints.ll
index f23c7b00f224..61a3fa4f07c7 100644
--- a/llvm/test/CodeGen/AArch64/hints.ll
+++ b/llvm/test/CodeGen/AArch64/hints.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple aarch64-eabi -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s
declare void @llvm.aarch64.hint(i32) nounwind
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 789bc99810ae..2e8c93a00a0d 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) {
; CHECK-LABEL: i64_i64:
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
new file mode 100644
index 000000000000..6074d44cb03d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -0,0 +1,2256 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for insert_v2f64_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f64_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2f32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f16_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f16_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i8_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i8_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i16_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i16_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i64_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i64_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_c
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_0
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_2
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_c
+
+define <2 x double> @insert_v2f64_0(<2 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v2f64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[0], v1.d[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x double> %a, double %b, i32 0
+ ret <2 x double> %d
+}
+
+define <2 x double> @insert_v2f64_1(<2 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v2f64_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x double> %a, double %b, i32 1
+ ret <2 x double> %d
+}
+
+define <2 x double> @insert_v2f64_c(<2 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v2f64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #3, #1
+; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x double> %a, double %b, i32 %c
+ ret <2 x double> %d
+}
+
+define <3 x double> @insert_v3f64_0(<3 x double> %a, double %b, i32 %c) {
+; CHECK-SD-LABEL: insert_v3f64_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3
+; CHECK-SD-NEXT: mov v0.d[0], v3.d[0]
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3f64_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov d0, d3
+; CHECK-GI-NEXT: ret
+entry:
+ %d = insertelement <3 x double> %a, double %b, i32 0
+ ret <3 x double> %d
+}
+
+define <3 x double> @insert_v3f64_2(<3 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v3f64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d2, d3
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <3 x double> %a, double %b, i32 2
+ ret <3 x double> %d
+}
+
+define <3 x double> @insert_v3f64_c(<3 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v3f64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: stp q0, q2, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x0, #0x3
+; CHECK-NEXT: str d3, [x8, x9, lsl #3]
+; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: ldr d2, [sp, #16]
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <3 x double> %a, double %b, i32 %c
+ ret <3 x double> %d
+}
+
+define <4 x double> @insert_v4f64_0(<4 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v4f64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[0], v2.d[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x double> %a, double %b, i32 0
+ ret <4 x double> %d
+}
+
+define <4 x double> @insert_v4f64_2(<4 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v4f64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v1.d[0], v2.d[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x double> %a, double %b, i32 2
+ ret <4 x double> %d
+}
+
+define <4 x double> @insert_v4f64_c(<4 x double> %a, double %b, i32 %c) {
+; CHECK-LABEL: insert_v4f64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x0, #0x3
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str d2, [x9, x8, lsl #3]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x double> %a, double %b, i32 %c
+ ret <4 x double> %d
+}
+
+define <2 x float> @insert_v2f32_0(<2 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v2f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x float> %a, float %b, i32 0
+ ret <2 x float> %d
+}
+
+define <2 x float> @insert_v2f32_1(<2 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v2f32_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x float> %a, float %b, i32 1
+ ret <2 x float> %d
+}
+
+define <2 x float> @insert_v2f32_c(<2 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v2f32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: bfi x8, x0, #2, #1
+; CHECK-NEXT: str s1, [x8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x float> %a, float %b, i32 %c
+ ret <2 x float> %d
+}
+
+define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) {
+; CHECK-SD-LABEL: insert_v3f32_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-SD-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3f32_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-GI-NEXT: mov s0, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT: mov v1.s[2], v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %d = insertelement <3 x float> %a, float %b, i32 0
+ ret <3 x float> %d
+}
+
+define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
+; CHECK-SD-LABEL: insert_v3f32_2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-SD-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3f32_2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], v0.s[0]
+; CHECK-GI-NEXT: ret
+entry:
+ %d = insertelement <3 x float> %a, float %b, i32 2
+ ret <3 x float> %d
+}
+
+define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v3f32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #2, #2
+; CHECK-NEXT: str s1, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <3 x float> %a, float %b, i32 %c
+ ret <3 x float> %d
+}
+
+define <4 x float> @insert_v4f32_0(<4 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v4f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x float> %a, float %b, i32 0
+ ret <4 x float> %d
+}
+
+define <4 x float> @insert_v4f32_2(<4 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v4f32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x float> %a, float %b, i32 2
+ ret <4 x float> %d
+}
+
+define <4 x float> @insert_v4f32_c(<4 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v4f32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #2, #2
+; CHECK-NEXT: str s1, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x float> %a, float %b, i32 %c
+ ret <4 x float> %d
+}
+
+define <8 x float> @insert_v8f32_0(<8 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v8f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT: mov v0.s[0], v2.s[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x float> %a, float %b, i32 0
+ ret <8 x float> %d
+}
+
+define <8 x float> @insert_v8f32_2(<8 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v8f32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x float> %a, float %b, i32 2
+ ret <8 x float> %d
+}
+
+define <8 x float> @insert_v8f32_c(<8 x float> %a, float %b, i32 %c) {
+; CHECK-LABEL: insert_v8f32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x0, #0x7
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str s2, [x9, x8, lsl #2]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x float> %a, float %b, i32 %c
+ ret <8 x float> %d
+}
+
+define <4 x half> @insert_v4f16_0(<4 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v4f16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x half> %a, half %b, i32 0
+ ret <4 x half> %d
+}
+
+define <4 x half> @insert_v4f16_2(<4 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v4f16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
+; CHECK-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x half> %a, half %b, i32 2
+ ret <4 x half> %d
+}
+
+define <4 x half> @insert_v4f16_c(<4 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v4f16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: bfi x8, x0, #1, #2
+; CHECK-NEXT: str h1, [x8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x half> %a, half %b, i32 %c
+ ret <4 x half> %d
+}
+
+define <8 x half> @insert_v8f16_0(<8 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v8f16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x half> %a, half %b, i32 0
+ ret <8 x half> %d
+}
+
+define <8 x half> @insert_v8f16_2(<8 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v8f16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
+; CHECK-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x half> %a, half %b, i32 2
+ ret <8 x half> %d
+}
+
+define <8 x half> @insert_v8f16_c(<8 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v8f16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x0, #1, #3
+; CHECK-NEXT: str h1, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x half> %a, half %b, i32 %c
+ ret <8 x half> %d
+}
+
+define <16 x half> @insert_v16f16_0(<16 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v16f16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x half> %a, half %b, i32 0
+ ret <16 x half> %d
+}
+
+define <16 x half> @insert_v16f16_2(<16 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v16f16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x half> %a, half %b, i32 2
+ ret <16 x half> %d
+}
+
+define <16 x half> @insert_v16f16_c(<16 x half> %a, half %b, i32 %c) {
+; CHECK-LABEL: insert_v16f16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x0, #0xf
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str h2, [x9, x8, lsl #1]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x half> %a, half %b, i32 %c
+ ret <16 x half> %d
+}
+
+define <8 x i8> @insert_v8i8_0(<8 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i8_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.b[0], w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i8> %a, i8 %b, i32 0
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @insert_v8i8_2(<8 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i8_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.b[2], w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i8> %a, i8 %b, i32 2
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @insert_v8i8_c(<8 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i8_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: bfxil x8, x1, #0, #3
+; CHECK-NEXT: strb w0, [x8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i8> %a, i8 %b, i32 %c
+ ret <8 x i8> %d
+}
+
+define <16 x i8> @insert_v16i8_0(<16 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v16i8_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.b[0], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x i8> %a, i8 %b, i32 0
+ ret <16 x i8> %d
+}
+
+define <16 x i8> @insert_v16i8_2(<16 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v16i8_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.b[2], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x i8> %a, i8 %b, i32 2
+ ret <16 x i8> %d
+}
+
+define <16 x i8> @insert_v16i8_c(<16 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v16i8_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfxil x8, x1, #0, #4
+; CHECK-NEXT: strb w0, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x i8> %a, i8 %b, i32 %c
+ ret <16 x i8> %d
+}
+
+define <32 x i8> @insert_v32i8_0(<32 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v32i8_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.b[0], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <32 x i8> %a, i8 %b, i32 0
+ ret <32 x i8> %d
+}
+
+define <32 x i8> @insert_v32i8_2(<32 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v32i8_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.b[2], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <32 x i8> %a, i8 %b, i32 2
+ ret <32 x i8> %d
+}
+
+define <32 x i8> @insert_v32i8_c(<32 x i8> %a, i8 %b, i32 %c) {
+; CHECK-LABEL: insert_v32i8_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x1, #0x1f
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: strb w0, [x9, x8]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <32 x i8> %a, i8 %b, i32 %c
+ ret <32 x i8> %d
+}
+
+define <4 x i16> @insert_v4i16_0(<4 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i16> %a, i16 %b, i32 0
+ ret <4 x i16> %d
+}
+
+define <4 x i16> @insert_v4i16_2(<4 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[2], w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i16> %a, i16 %b, i32 2
+ ret <4 x i16> %d
+}
+
+define <4 x i16> @insert_v4i16_c(<4 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: bfi x8, x1, #1, #2
+; CHECK-NEXT: strh w0, [x8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i16> %a, i16 %b, i32 %c
+ ret <4 x i16> %d
+}
+
+define <8 x i16> @insert_v8i16_0(<8 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i16> %a, i16 %b, i32 0
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @insert_v8i16_2(<8 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.h[2], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i16> %a, i16 %b, i32 2
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @insert_v8i16_c(<8 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x1, #1, #3
+; CHECK-NEXT: strh w0, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i16> %a, i16 %b, i32 %c
+ ret <8 x i16> %d
+}
+
+define <16 x i16> @insert_v16i16_0(<16 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v16i16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x i16> %a, i16 %b, i32 0
+ ret <16 x i16> %d
+}
+
+define <16 x i16> @insert_v16i16_2(<16 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v16i16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.h[2], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x i16> %a, i16 %b, i32 2
+ ret <16 x i16> %d
+}
+
+define <16 x i16> @insert_v16i16_c(<16 x i16> %a, i16 %b, i32 %c) {
+; CHECK-LABEL: insert_v16i16_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x1, #0xf
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: strh w0, [x9, x8, lsl #1]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <16 x i16> %a, i16 %b, i32 %c
+ ret <16 x i16> %d
+}
+
+define <2 x i32> @insert_v2i32_0(<2 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v2i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x i32> %a, i32 %b, i32 0
+ ret <2 x i32> %d
+}
+
+define <2 x i32> @insert_v2i32_1(<2 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v2i32_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x i32> %a, i32 %b, i32 1
+ ret <2 x i32> %d
+}
+
+define <2 x i32> @insert_v2i32_c(<2 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v2i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: bfi x8, x1, #2, #1
+; CHECK-NEXT: str w0, [x8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x i32> %a, i32 %b, i32 %c
+ ret <2 x i32> %d
+}
+
+define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
+; CHECK-SD-LABEL: insert_v3i32_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[2], v0.s[2]
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3i32_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[2]
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: ret
+entry:
+ %d = insertelement <3 x i32> %a, i32 %b, i32 0
+ ret <3 x i32> %d
+}
+
+define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
+; CHECK-SD-LABEL: insert_v3i32_2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov v0.s[2], w0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3i32_2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], v0.s[0]
+; CHECK-GI-NEXT: ret
+entry:
+ %d = insertelement <3 x i32> %a, i32 %b, i32 2
+ ret <3 x i32> %d
+}
+
+define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v3i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x1, #2, #2
+; CHECK-NEXT: str w0, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <3 x i32> %a, i32 %b, i32 %c
+ ret <3 x i32> %d
+}
+
+define <4 x i32> @insert_v4i32_0(<4 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.s[0], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i32> %a, i32 %b, i32 0
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @insert_v4i32_2(<4 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.s[2], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i32> %a, i32 %b, i32 2
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @insert_v4i32_c(<4 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x1, #2, #2
+; CHECK-NEXT: str w0, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i32> %a, i32 %b, i32 %c
+ ret <4 x i32> %d
+}
+
+define <8 x i32> @insert_v8i32_0(<8 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.s[0], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i32> %a, i32 %b, i32 0
+ ret <8 x i32> %d
+}
+
+define <8 x i32> @insert_v8i32_2(<8 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.s[2], w0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i32> %a, i32 %b, i32 2
+ ret <8 x i32> %d
+}
+
+define <8 x i32> @insert_v8i32_c(<8 x i32> %a, i32 %b, i32 %c) {
+; CHECK-LABEL: insert_v8i32_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x1, #0x7
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str w0, [x9, x8, lsl #2]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <8 x i32> %a, i32 %b, i32 %c
+ ret <8 x i32> %d
+}
+
+define <2 x i64> @insert_v2i64_0(<2 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v2i64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.d[0], x0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x i64> %a, i64 %b, i32 0
+ ret <2 x i64> %d
+}
+
+define <2 x i64> @insert_v2i64_1(<2 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v2i64_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.d[1], x0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x i64> %a, i64 %b, i32 1
+ ret <2 x i64> %d
+}
+
+define <2 x i64> @insert_v2i64_c(<2 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v2i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: bfi x8, x1, #3, #1
+; CHECK-NEXT: str x0, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <2 x i64> %a, i64 %b, i32 %c
+ ret <2 x i64> %d
+}
+
+define <3 x i64> @insert_v3i64_0(<3 x i64> %a, i64 %b, i32 %c) {
+; CHECK-SD-LABEL: insert_v3i64_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: mov v0.d[0], x0
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3i64_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = insertelement <3 x i64> %a, i64 %b, i32 0
+ ret <3 x i64> %d
+}
+
+define <3 x i64> @insert_v3i64_2(<3 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v3i64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d2, x0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <3 x i64> %a, i64 %b, i32 2
+ ret <3 x i64> %d
+}
+
+define <3 x i64> @insert_v3i64_c(<3 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v3i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: stp q0, q2, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x1, #0x3
+; CHECK-NEXT: str x0, [x8, x9, lsl #3]
+; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: ldr d2, [sp, #16]
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <3 x i64> %a, i64 %b, i32 %c
+ ret <3 x i64> %d
+}
+
+define <4 x i64> @insert_v4i64_0(<4 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.d[0], x0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i64> %a, i64 %b, i32 0
+ ret <4 x i64> %d
+}
+
+define <4 x i64> @insert_v4i64_2(<4 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v1.d[0], x0
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i64> %a, i64 %b, i32 2
+ ret <4 x i64> %d
+}
+
+define <4 x i64> @insert_v4i64_c(<4 x i64> %a, i64 %b, i32 %c) {
+; CHECK-LABEL: insert_v4i64_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x1, #0x3
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str x0, [x9, x8, lsl #3]
+; CHECK-NEXT: ldp q0, q1, [sp], #32
+; CHECK-NEXT: ret
+entry:
+ %d = insertelement <4 x i64> %a, i64 %b, i32 %c
+ ret <4 x i64> %d
+}
+
+define double @extract_v2f64_0(<2 x double> %a, i32 %c) {
+; CHECK-LABEL: extract_v2f64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <2 x double> %a, i32 0
+ ret double %d
+}
+
+define double @extract_v2f64_1(<2 x double> %a, i32 %c) {
+; CHECK-LABEL: extract_v2f64_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov d0, v0.d[1]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <2 x double> %a, i32 1
+ ret double %d
+}
+
+define double @extract_v2f64_c(<2 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2f64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #3, #1
+; CHECK-SD-NEXT: ldr d0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2f64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: ldr d0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x double> %a, i32 %c
+ ret double %d
+}
+
+define double @extract_v3f64_0(<3 x double> %a, i32 %c) {
+; CHECK-LABEL: extract_v3f64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <3 x double> %a, i32 0
+ ret double %d
+}
+
+define double @extract_v3f64_2(<3 x double> %a, i32 %c) {
+; CHECK-LABEL: extract_v3f64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d2
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <3 x double> %a, i32 2
+ ret double %d
+}
+
+define double @extract_v3f64_c(<3 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3f64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: and x8, x0, #0x3
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3f64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: stp q0, q2, [sp]
+; CHECK-GI-NEXT: ldr d0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x double> %a, i32 %c
+ ret double %d
+}
+
+define double @extract_v4f64_0(<4 x double> %a, i32 %c) {
+; CHECK-LABEL: extract_v4f64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x double> %a, i32 0
+ ret double %d
+}
+
+define double @extract_v4f64_2(<4 x double> %a, i32 %c) {
+; CHECK-LABEL: extract_v4f64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x double> %a, i32 2
+ ret double %d
+}
+
+define double @extract_v4f64_c(<4 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4f64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0x3
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4f64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: ldr d0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x double> %a, i32 %c
+ ret double %d
+}
+
+define float @extract_v2f32_0(<2 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2f32_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2f32_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x float> %a, i32 0
+ ret float %d
+}
+
+define float @extract_v2f32_1(<2 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v2f32_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <2 x float> %a, i32 1
+ ret float %d
+}
+
+define float @extract_v2f32_c(<2 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #1
+; CHECK-SD-NEXT: ldr s0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x float> %a, i32 %c
+ ret float %d
+}
+
+define float @extract_v3f32_0(<3 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v3f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <3 x float> %a, i32 0
+ ret float %d
+}
+
+define float @extract_v3f32_2(<3 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v3f32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov s0, v0.s[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <3 x float> %a, i32 2
+ ret float %d
+}
+
+define float @extract_v3f32_c(<3 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #2
+; CHECK-SD-NEXT: ldr s0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x float> %a, i32 %c
+ ret float %d
+}
+
+define float @extract_v4f32_0(<4 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v4f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x float> %a, i32 0
+ ret float %d
+}
+
+define float @extract_v4f32_2(<4 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v4f32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov s0, v0.s[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x float> %a, i32 2
+ ret float %d
+}
+
+define float @extract_v4f32_c(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #2
+; CHECK-SD-NEXT: ldr s0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: ldr s0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x float> %a, i32 %c
+ ret float %d
+}
+
+define float @extract_v8f32_0(<8 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v8f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x float> %a, i32 0
+ ret float %d
+}
+
+define float @extract_v8f32_2(<8 x float> %a, i32 %c) {
+; CHECK-LABEL: extract_v8f32_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov s0, v0.s[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x float> %a, i32 2
+ ret float %d
+}
+
+define float @extract_v8f32_c(<8 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v8f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0x7
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr s0, [x9, x8, lsl #2]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v8f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: and x8, x8, #0x7
+; CHECK-GI-NEXT: ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <8 x float> %a, i32 %c
+ ret float %d
+}
+
+define half @extract_v4f16_0(<4 x half> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4f16_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4f16_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x half> %a, i32 0
+ ret half %d
+}
+
+define half @extract_v4f16_2(<4 x half> %a, i32 %c) {
+; CHECK-LABEL: extract_v4f16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x half> %a, i32 2
+ ret half %d
+}
+
+define half @extract_v4f16_c(<4 x half> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4f16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x0, #1, #2
+; CHECK-SD-NEXT: ldr h0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4f16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: ldr h0, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x half> %a, i32 %c
+ ret half %d
+}
+
+define half @extract_v8f16_0(<8 x half> %a, i32 %c) {
+; CHECK-LABEL: extract_v8f16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x half> %a, i32 0
+ ret half %d
+}
+
+define half @extract_v8f16_2(<8 x half> %a, i32 %c) {
+; CHECK-LABEL: extract_v8f16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x half> %a, i32 2
+ ret half %d
+}
+
+define half @extract_v8f16_c(<8 x half> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v8f16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #1, #3
+; CHECK-SD-NEXT: ldr h0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v8f16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x7
+; CHECK-GI-NEXT: ldr h0, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <8 x half> %a, i32 %c
+ ret half %d
+}
+
+define half @extract_v16f16_0(<16 x half> %a, i32 %c) {
+; CHECK-LABEL: extract_v16f16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <16 x half> %a, i32 0
+ ret half %d
+}
+
+define half @extract_v16f16_2(<16 x half> %a, i32 %c) {
+; CHECK-LABEL: extract_v16f16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <16 x half> %a, i32 2
+ ret half %d
+}
+
+define half @extract_v16f16_c(<16 x half> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v16f16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0xf
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr h0, [x9, x8, lsl #1]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v16f16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: and x8, x8, #0xf
+; CHECK-GI-NEXT: ldr h0, [x9, x8, lsl #1]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <16 x half> %a, i32 %c
+ ret half %d
+}
+
+define i8 @extract_v8i8_0(<8 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v8i8_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: umov w0, v0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x i8> %a, i32 0
+ ret i8 %d
+}
+
+define i8 @extract_v8i8_2(<8 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v8i8_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: umov w0, v0.b[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x i8> %a, i32 2
+ ret i8 %d
+}
+
+define i8 @extract_v8i8_c(<8 x i8> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v8i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfxil x8, x0, #0, #3
+; CHECK-SD-NEXT: ldrb w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v8i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x7
+; CHECK-GI-NEXT: lsl x10, x9, #1
+; CHECK-GI-NEXT: sub x9, x10, x9
+; CHECK-GI-NEXT: ldrb w0, [x8, x9]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <8 x i8> %a, i32 %c
+ ret i8 %d
+}
+
+define i8 @extract_v16i8_0(<16 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v16i8_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <16 x i8> %a, i32 0
+ ret i8 %d
+}
+
+define i8 @extract_v16i8_2(<16 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v16i8_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.b[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <16 x i8> %a, i32 2
+ ret i8 %d
+}
+
+define i8 @extract_v16i8_c(<16 x i8> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v16i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfxil x8, x0, #0, #4
+; CHECK-SD-NEXT: ldrb w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v16i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0xf
+; CHECK-GI-NEXT: lsl x10, x9, #1
+; CHECK-GI-NEXT: sub x9, x10, x9
+; CHECK-GI-NEXT: ldrb w0, [x8, x9]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <16 x i8> %a, i32 %c
+ ret i8 %d
+}
+
+define i8 @extract_v32i8_0(<32 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v32i8_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <32 x i8> %a, i32 0
+ ret i8 %d
+}
+
+define i8 @extract_v32i8_2(<32 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v32i8_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.b[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <32 x i8> %a, i32 2
+ ret i8 %d
+}
+
+define i8 @extract_v32i8_c(<32 x i8> %a, i32 %c) {
+; CHECK-LABEL: extract_v32i8_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: and x8, x0, #0x1f
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ldrb w0, [x9, x8]
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <32 x i8> %a, i32 %c
+ ret i8 %d
+}
+
+define i16 @extract_v4i16_0(<4 x i16> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x i16> %a, i32 0
+ ret i16 %d
+}
+
+define i16 @extract_v4i16_2(<4 x i16> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: umov w0, v0.h[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x i16> %a, i32 2
+ ret i16 %d
+}
+
+define i16 @extract_v4i16_c(<4 x i16> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x0, #1, #2
+; CHECK-SD-NEXT: ldrh w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: ldrh w0, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x i16> %a, i32 %c
+ ret i16 %d
+}
+
+define i16 @extract_v8i16_0(<8 x i16> %a, i32 %c) {
+; CHECK-LABEL: extract_v8i16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x i16> %a, i32 0
+ ret i16 %d
+}
+
+define i16 @extract_v8i16_2(<8 x i16> %a, i32 %c) {
+; CHECK-LABEL: extract_v8i16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.h[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x i16> %a, i32 2
+ ret i16 %d
+}
+
+define i16 @extract_v8i16_c(<8 x i16> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v8i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #1, #3
+; CHECK-SD-NEXT: ldrh w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v8i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x7
+; CHECK-GI-NEXT: ldrh w0, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <8 x i16> %a, i32 %c
+ ret i16 %d
+}
+
+define i16 @extract_v16i16_0(<16 x i16> %a, i32 %c) {
+; CHECK-LABEL: extract_v16i16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <16 x i16> %a, i32 0
+ ret i16 %d
+}
+
+define i16 @extract_v16i16_2(<16 x i16> %a, i32 %c) {
+; CHECK-LABEL: extract_v16i16_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w0, v0.h[2]
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <16 x i16> %a, i32 2
+ ret i16 %d
+}
+
+define i16 @extract_v16i16_c(<16 x i16> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v16i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0xf
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldrh w0, [x9, x8, lsl #1]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v16i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: and x8, x8, #0xf
+; CHECK-GI-NEXT: ldrh w0, [x9, x8, lsl #1]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <16 x i16> %a, i32 %c
+ ret i16 %d
+}
+
+define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i32_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2i32_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x i32> %a, i32 0
+ ret i32 %d
+}
+
+define i32 @extract_v2i32_1(<2 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i32_1:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov w0, v0.s[1]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2i32_1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov s0, v0.s[1]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x i32> %a, i32 1
+ ret i32 %d
+}
+
+define i32 @extract_v2i32_c(<2 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #1
+; CHECK-SD-NEXT: ldr w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x i32> %a, i32 %c
+ ret i32 %d
+}
+
+define i32 @extract_v3i32_0(<3 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_v3i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <3 x i32> %a, i32 0
+ ret i32 %d
+}
+
+define i32 @extract_v3i32_2(<3 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3i32_2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w0, v0.s[2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3i32_2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s0, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x i32> %a, i32 2
+ ret i32 %d
+}
+
+define i32 @extract_v3i32_c(<3 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #2
+; CHECK-SD-NEXT: ldr w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x i32> %a, i32 %c
+ ret i32 %d
+}
+
+define i32 @extract_v4i32_0(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x i32> %a, i32 0
+ ret i32 %d
+}
+
+define i32 @extract_v4i32_2(<4 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w0, v0.s[2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4i32_2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s0, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x i32> %a, i32 2
+ ret i32 %d
+}
+
+define i32 @extract_v4i32_c(<4 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #2
+; CHECK-SD-NEXT: ldr w0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: ldr w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x i32> %a, i32 %c
+ ret i32 %d
+}
+
+define i32 @extract_v8i32_0(<8 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_v8i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <8 x i32> %a, i32 0
+ ret i32 %d
+}
+
+define i32 @extract_v8i32_2(<8 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v8i32_2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w0, v0.s[2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v8i32_2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov s0, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <8 x i32> %a, i32 2
+ ret i32 %d
+}
+
+define i32 @extract_v8i32_c(<8 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v8i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0x7
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr w0, [x9, x8, lsl #2]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v8i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: and x8, x8, #0x7
+; CHECK-GI-NEXT: ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <8 x i32> %a, i32 %c
+ ret i32 %d
+}
+
+define i64 @extract_v2i64_0(<2 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v2i64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <2 x i64> %a, i32 0
+ ret i64 %d
+}
+
+define i64 @extract_v2i64_1(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_1:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov x0, v0.d[1]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2i64_1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x i64> %a, i32 1
+ ret i64 %d
+}
+
+define i64 @extract_v2i64_c(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #3, #1
+; CHECK-SD-NEXT: ldr x0, [x8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <2 x i64> %a, i32 %c
+ ret i64 %d
+}
+
+define i64 @extract_v3i64_0(<3 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3i64_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmov x0, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3i64_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x i64> %a, i32 0
+ ret i64 %d
+}
+
+define i64 @extract_v3i64_2(<3 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3i64_2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: fmov x0, d2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3i64_2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov x0, d2
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x i64> %a, i32 2
+ ret i64 %d
+}
+
+define i64 @extract_v3i64_c(<3 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v3i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: and x8, x0, #0x3
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v3i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: stp q0, q2, [sp]
+; CHECK-GI-NEXT: ldr x0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <3 x i64> %a, i32 %c
+ ret i64 %d
+}
+
+define i64 @extract_v4i64_0(<4 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i64_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x i64> %a, i32 0
+ ret i64 %d
+}
+
+define i64 @extract_v4i64_2(<4 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i64_2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov x0, d1
+; CHECK-NEXT: ret
+entry:
+ %d = extractelement <4 x i64> %a, i32 2
+ ret i64 %d
+}
+
+define i64 @extract_v4i64_c(<4 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0x3
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3]
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: ldr x0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %d = extractelement <4 x i64> %a, i32 %c
+ ret i64 %d
+}
diff --git a/llvm/test/CodeGen/AArch64/intrinsics-memory-barrier.ll b/llvm/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
index 61939977c4d7..ea4205f51fee 100644
--- a/llvm/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -O=3 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -O=3 | FileCheck %s
define void @test() {
; CHECK: dmb sy
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index fa1ab61a6216..85689b6b2a60 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
; CHECK-GI: warning: Instruction selection used fallback path for stofp_v3i8_v3f64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i8_v3f64
diff --git a/llvm/test/CodeGen/AArch64/kcfi-bti.ll b/llvm/test/CodeGen/AArch64/kcfi-bti.ll
index 12cde4371e15..d3febb536824 100644
--- a/llvm/test/CodeGen/AArch64/kcfi-bti.ll
+++ b/llvm/test/CodeGen/AArch64/kcfi-bti.ll
@@ -73,11 +73,11 @@ define void @f3(ptr noundef %x) {
; MIR-LABEL: name: f3
; MIR: body:
-; ISEL: TCRETURNriBTI %1, 0, csr_aarch64_aapcs, implicit $sp, cfi-type 12345678
+; ISEL: TCRETURNrix16x17 %1, 0, csr_aarch64_aapcs, implicit $sp, cfi-type 12345678
; KCFI: BUNDLE{{.*}} {
; KCFI-NEXT: KCFI_CHECK $x16, 12345678, implicit-def $x9, implicit-def $x16, implicit-def $x17, implicit-def $nzcv
-; KCFI-NEXT: TCRETURNriBTI internal killed $x16, 0, csr_aarch64_aapcs, implicit $sp
+; KCFI-NEXT: TCRETURNrix16x17 internal killed $x16, 0, csr_aarch64_aapcs, implicit $sp
; KCFI-NEXT: }
tail call void %x() [ "kcfi"(i32 12345678) ]
diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
index cf638356dfdf..91cf605613b9 100644
--- a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple -aarch64-enable-stp-suppress=false -verify-machineinstrs -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple -aarch64-enable-stp-suppress=false -verify-machineinstrs -asm-verbose=false | FileCheck %s
; CHECK-LABEL: test_strd_sturd:
; CHECK-NEXT: stp d0, d1, [x0, #-8]
diff --git a/llvm/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll b/llvm/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
index a96a3c5f4881..a949abb3ee5a 100644
--- a/llvm/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
+++ b/llvm/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=bogus
+; RUN: llc < %s -mtriple=aarch64 -mcpu=bogus
; Fix the bug in PR20557. Set mcpu to a bogus name, llc will crash in type
; legalization.
diff --git a/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll b/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
index 871bc3b0a323..23011dfc2d4f 100644
--- a/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -mtriple=aarch64-eabi -stop-after=finalize-isel -o - | FileCheck --check-prefix=MIR %s
+; RUN: llc %s -mtriple=aarch64 -stop-after=finalize-isel -o - | FileCheck --check-prefix=MIR %s
; Ensure the scoped AA metadata is still retained after store merging.
diff --git a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll
index fc5813b99ea5..3c42987ade20 100644
--- a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll
+++ b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu cortex-a53 -mtriple=aarch64-eabi | FileCheck %s --check-prefix=A53
+; RUN: llc < %s -mcpu cortex-a53 -mtriple=aarch64 | FileCheck %s --check-prefix=A53
; PR26827 - Merge stores causes wrong dependency.
%struct1 = type { ptr, ptr, i32, i32, i16, i16, ptr, ptr }
diff --git a/llvm/test/CodeGen/AArch64/merge-store.ll b/llvm/test/CodeGen/AArch64/merge-store.ll
index f7201030da2c..b93d0c3bc960 100644
--- a/llvm/test/CodeGen/AArch64/merge-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-store.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone -mattr=+slow-misaligned-128store | FileCheck %s --check-prefixes=CHECK,SPLITTING
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=-slow-misaligned-128store | FileCheck %s --check-prefixes=CHECK,MISALIGNED
+; RUN: llc < %s -mtriple=aarch64 -mattr=-slow-misaligned-128store | FileCheck %s --check-prefixes=CHECK,MISALIGNED
@g0 = external dso_local global <3 x float>, align 16
@g1 = external dso_local global <3 x float>, align 4
diff --git a/llvm/test/CodeGen/AArch64/min-max-combine.ll b/llvm/test/CodeGen/AArch64/min-max-combine.ll
index 535d2bad2967..5111f838b73a 100644
--- a/llvm/test/CodeGen/AArch64/min-max-combine.ll
+++ b/llvm/test/CodeGen/AArch64/min-max-combine.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-ISEL
-; RUN: llc -mtriple=aarch64-eabi %s -o - -mattr=cssc | FileCheck %s --check-prefixes=CHECK-CSSC
-; RUN: llc -mtriple=aarch64-eabi -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK-GLOBAL
-; RUN: llc -mtriple=aarch64-eabi -global-isel %s -o - -mattr=cssc | FileCheck %s --check-prefixes=CHECK-CSSC
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK-ISEL
+; RUN: llc -mtriple=aarch64 %s -o - -mattr=cssc | FileCheck %s --check-prefixes=CHECK-CSSC
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK-GLOBAL
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - -mattr=cssc | FileCheck %s --check-prefixes=CHECK-CSSC
; These tests check for @llvm.smax, @llvm.smin combines.
diff --git a/llvm/test/CodeGen/AArch64/min-max.ll b/llvm/test/CodeGen/AArch64/min-max.ll
index 8914406f1db7..0d02f1e7b548 100644
--- a/llvm/test/CodeGen/AArch64/min-max.ll
+++ b/llvm/test/CodeGen/AArch64/min-max.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
-; RUN: llc -mtriple=aarch64-eabi %s -o - -mattr=+cssc | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL-CSSC
-; RUN: llc -mtriple=aarch64-eabi -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GLOBAL
-; RUN: llc -mtriple=aarch64-eabi -global-isel %s -o - -mattr=+cssc | FileCheck %s --check-prefixes=CHECK,CHECK-GLOBAL-CSSC
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
+; RUN: llc -mtriple=aarch64 %s -o - -mattr=+cssc | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL-CSSC
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GLOBAL
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - -mattr=+cssc | FileCheck %s --check-prefixes=CHECK,CHECK-GLOBAL-CSSC
; These tests just check that the plumbing is in place for @llvm.smax, @llvm.umax,
; @llvm.smin, @llvm.umin.
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
index cddcd4653bc5..a75c30317f13 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll
@@ -13,6 +13,8 @@
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s
@g = common local_unnamed_addr global ptr null, align 8
diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index e16ee40c8dcb..90e560af4465 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-eabi | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-eabi -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL
; Convert mul x, pow2 to shift.
; Convert mul x, pow2 +/- 1 to shift + add/sub.
diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll
index 23d1e43a5cab..cf09a46000da 100644
--- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll
@@ -7,6 +7,7 @@
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s
declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll b/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll
index d79c07205555..b3692f8f46fb 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple aarch64-none-eabi -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -o - | FileCheck %s
define <4 x i16> @addls_v8i8_0(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: addls_v8i8_0:
diff --git a/llvm/test/CodeGen/AArch64/neon_rbit.ll b/llvm/test/CodeGen/AArch64/neon_rbit.ll
index 0daaf72f93dd..e66aca7613cb 100644
--- a/llvm/test/CodeGen/AArch64/neon_rbit.ll
+++ b/llvm/test/CodeGen/AArch64/neon_rbit.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-eabi -mattr=+fullfp16 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 %s -o - | FileCheck %s
; The llvm.aarch64_neon_rbit intrinsic should be auto-upgraded to the
; target-independent bitreverse intrinsic.
diff --git a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
index 4fe16d8195c0..b7dde881291b 100644
--- a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
+++ b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,SLOW
-; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m3 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,FAST
+; RUN: llc < %s -mtriple=aarch64 -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,SLOW
+; RUN: llc < %s -mtriple=aarch64 -mcpu=exynos-m3 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,FAST
; CHECK-LABEL: test_nopair_st
; SLOW: str
diff --git a/llvm/test/CodeGen/AArch64/nzcv-save.ll b/llvm/test/CodeGen/AArch64/nzcv-save.ll
index 9bc4ccf2787a..c40e529ccab1 100644
--- a/llvm/test/CodeGen/AArch64/nzcv-save.ll
+++ b/llvm/test/CodeGen/AArch64/nzcv-save.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-eabi | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 | FileCheck %s
; DAG ends up with two uses for the flags from an ADCS node, which means they
; must be saved for later.
diff --git a/llvm/test/CodeGen/AArch64/pacbti-module-attrs.ll b/llvm/test/CodeGen/AArch64/pacbti-module-attrs.ll
index 1858f3b566f8..ba4772178211 100644
--- a/llvm/test/CodeGen/AArch64/pacbti-module-attrs.ll
+++ b/llvm/test/CodeGen/AArch64/pacbti-module-attrs.ll
@@ -1,4 +1,4 @@
-;; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.5a %s -o - | FileCheck %s
+;; RUN: llc -mtriple=aarch64 -mattr=+v8.5a %s -o - | FileCheck %s
declare i32 @g(i32) #5
diff --git a/llvm/test/CodeGen/AArch64/postra-mi-sched.ll b/llvm/test/CodeGen/AArch64/postra-mi-sched.ll
index 7688973a1f2f..5abc06bd6ba6 100644
--- a/llvm/test/CodeGen/AArch64/postra-mi-sched.ll
+++ b/llvm/test/CodeGen/AArch64/postra-mi-sched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=aarch64-eabi -mcpu=cortex-a53 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=aarch64 -mcpu=cortex-a53 | FileCheck %s
; With cortex-a53, each of fmul and fcvt have latency of 6 cycles. After the
; pre-RA MI scheduler, fmul, fcvt and fdiv will be consecutive. The top-down
diff --git a/llvm/test/CodeGen/AArch64/rbit.ll b/llvm/test/CodeGen/AArch64/rbit.ll
index 288a25bd65e3..8c457b4480f8 100644
--- a/llvm/test/CodeGen/AArch64/rbit.ll
+++ b/llvm/test/CodeGen/AArch64/rbit.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
; The llvm.aarch64.rbit intrinsic should be auto-upgraded to the
; target-independent bitreverse intrinsic.
diff --git a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
index b9d93942d0f1..47e338151749 100644
--- a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
+++ b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+sve -mattr=+rcpc3 < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.9a -mattr=+sve -mattr=+rcpc3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.9a -mattr=+sve < %s | FileCheck %s
; Show what happens with RCPC3 for extract/insert into SVE vectors.
; Currently there is no RCPC3 codegen expected for this.
diff --git a/llvm/test/CodeGen/AArch64/rcpc3.ll b/llvm/test/CodeGen/AArch64/rcpc3.ll
index d41613953b3c..45770337c132 100644
--- a/llvm/test/CodeGen/AArch64/rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/rcpc3.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+rcpc3 < %s | FileCheck --check-prefixes=BOTH,RCPC3 %s
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a < %s | FileCheck --check-prefixes=BOTH,NO-RCPC3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.9a -mattr=+rcpc3 < %s | FileCheck --check-prefixes=BOTH,RCPC3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.9a < %s | FileCheck --check-prefixes=BOTH,NO-RCPC3 %s
define hidden <2 x i64> @test_ldap1_2xi64_lane0(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
;
diff --git a/llvm/test/CodeGen/AArch64/rem_crash.ll b/llvm/test/CodeGen/AArch64/rem_crash.ll
index f9cf6d58370c..38b46eae9f53 100644
--- a/llvm/test/CodeGen/AArch64/rem_crash.ll
+++ b/llvm/test/CodeGen/AArch64/rem_crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi
+; RUN: llc < %s -mtriple=aarch64
define i8 @test_minsize_uu8(i8 %x) minsize optsize {
entry:
diff --git a/llvm/test/CodeGen/AArch64/remat.ll b/llvm/test/CodeGen/AArch64/remat.ll
index 483c4d71ee21..704c87feb6a9 100644
--- a/llvm/test/CodeGen/AArch64/remat.ll
+++ b/llvm/test/CodeGen/AArch64/remat.ll
@@ -26,6 +26,7 @@
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx3t110 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1a -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1b -o - %s | FileCheck %s
%X = type { i64, i64, i64 }
declare void @f(ptr)
diff --git a/llvm/test/CodeGen/AArch64/rotate.ll b/llvm/test/CodeGen/AArch64/rotate.ll
index 8d52b6dd3a79..7b4808e7d466 100644
--- a/llvm/test/CodeGen/AArch64/rotate.ll
+++ b/llvm/test/CodeGen/AArch64/rotate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64--linux-gnueabihf | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
;; This used to cause a backend crash about not being able to
;; select ROTL. Make sure if generates the basic ushr/shl.
diff --git a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
index 9e9c814be026..46b714d8e5fb 100644
--- a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
+++ b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
define i1 @load_bv_v4i8(i1 zeroext %a) {
; CHECK-LABEL: load_bv_v4i8:
diff --git a/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll b/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll
index 9e5e55512df8..5f3b1503b46b 100644
--- a/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll
+++ b/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll
@@ -5,7 +5,7 @@
; inserted at a point where LR is live.
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-arm-none-eabi"
+target triple = "aarch64"
define hidden void @_ZTv0_n24_N2C6D1Ev(ptr %this) minsize sspreq "target-features"="+harden-sls-retbr" {
; CHECK-LABEL: _ZTv0_n24_N2C6D1Ev:
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
index 0d64e7378b55..5f833e32fb8c 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
; Tests for prolog sequences for stack probing, when using a 64KiB stack guard.
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index a3b8df487ed4..d9ad1045506c 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
; Dynamically-sized allocation, needs a loop which can handle any size at
; runtime. The final iteration of the loop will temporarily put SP below the
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 03a9220ebfdd..1ad78709d501 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
; Test prolog sequences for stack probing when SVE objects are involved.
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
index df5408de5bab..8fc90cfd71b1 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
index d2315844dc2f..f15807597ac2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve -o - | FileCheck --check-prefixes=CHECK,CHECK-NO-EXTEND-ROUND %s
-; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve --combiner-vector-fcopysign-extend-round -o - | FileCheck --check-prefixes=CHECK,CHECK-EXTEND-ROUND %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve -o - | FileCheck --check-prefixes=CHECK,CHECK-NO-EXTEND-ROUND %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve --combiner-vector-fcopysign-extend-round -o - | FileCheck --check-prefixes=CHECK,CHECK-EXTEND-ROUND %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
;============ v2f32
diff --git a/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
index 7f65997fda7a..14cc8cd26826 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve2 -o - | FileCheck --check-prefixes=CHECK,CHECK_NO_EXTEND_ROUND %s
-; RUN: llc < %s -mtriple aarch64-eabi -mattr=+sve2 --combiner-vector-fcopysign-extend-round -o - | FileCheck --check-prefixes=CHECK,CHECK_EXTEND_ROUND %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -o - | FileCheck --check-prefixes=CHECK,CHECK_NO_EXTEND_ROUND %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 --combiner-vector-fcopysign-extend-round -o - | FileCheck --check-prefixes=CHECK,CHECK_EXTEND_ROUND %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/CodeGen/AArch64/tailmerging_in_mbp.ll b/llvm/test/CodeGen/AArch64/tailmerging_in_mbp.ll
index 54c200ec4c3d..675380787af4 100644
--- a/llvm/test/CodeGen/AArch64/tailmerging_in_mbp.ll
+++ b/llvm/test/CodeGen/AArch64/tailmerging_in_mbp.ll
@@ -1,4 +1,4 @@
-; RUN: llc <%s -mtriple=aarch64-eabi -verify-machine-dom-info | FileCheck %s
+; RUN: llc <%s -mtriple=aarch64 -verify-machine-dom-info | FileCheck %s
; CHECK-LABEL: test:
; CHECK-LABEL: %cond.false12.i
diff --git a/llvm/test/CodeGen/AArch64/tbz-tbnz.ll b/llvm/test/CodeGen/AArch64/tbz-tbnz.ll
index 1edea229cfe8..d301a380dbb8 100644
--- a/llvm/test/CodeGen/AArch64/tbz-tbnz.ll
+++ b/llvm/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O1 -mtriple=aarch64-eabi -aarch64-enable-cond-br-tune=false | FileCheck %s
+; RUN: llc < %s -O1 -mtriple=aarch64 -aarch64-enable-cond-br-tune=false | FileCheck %s
declare void @t()
diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll
index a3c9c8fbe0a3..f6553b6acec9 100644
--- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64--linux-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
; CHECK-LABEL: convert_v3f32
; CHECK: strb
diff --git a/llvm/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.ll b/llvm/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.ll
index 1979d97952cb..85187f1ee4ca 100644
--- a/llvm/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.5a | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.5a | FileCheck %s
declare <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float>)
declare <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/v8.5a-scalar-frint3264-intrinsic.ll b/llvm/test/CodeGen/AArch64/v8.5a-scalar-frint3264-intrinsic.ll
index b4a0f3d557c6..728c13da5f1b 100644
--- a/llvm/test/CodeGen/AArch64/v8.5a-scalar-frint3264-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/v8.5a-scalar-frint3264-intrinsic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.5a | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.5a | FileCheck %s
declare float @llvm.aarch64.frint32z.f32(float)
declare double @llvm.aarch64.frint32z.f64(double)
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 1b22e2f900dd..557aa010b3a7 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -489,3 +489,31 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
%bitmask = bitcast <6 x i1> %cmp_result to i6
ret i6 %bitmask
}
+
+; Only apply the combine when casting a vector to a scalar.
+define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
+; CHECK-LABEL: vector_to_vector_cast:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: shl.16b v0, v0, #7
+; CHECK-NEXT: Lloh36:
+; CHECK-NEXT: adrp x8, lCPI20_0@PAGE
+; CHECK-NEXT: Lloh37:
+; CHECK-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF]
+; CHECK-NEXT: add x8, sp, #14
+; CHECK-NEXT: cmlt.16b v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: ext.16b v1, v0, v0, #8
+; CHECK-NEXT: zip1.16b v0, v0, v1
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: str h0, [sp, #14]
+; CHECK-NEXT: ld1.b { v0 }[0], [x8]
+; CHECK-NEXT: orr x8, x8, #0x1
+; CHECK-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh37
+ %bc = bitcast <16 x i1> %arg to <2 x i8>
+ ret <2 x i8> %bc
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index 20237705cc6e..1164e02a16c9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @add_HalfS(<2 x float> %bin.rdx) {
; CHECK-SD-LABEL: add_HalfS:
@@ -13,11 +13,7 @@ define float @add_HalfS(<2 x float> %bin.rdx) {
;
; CHECK-GI-LABEL: add_HalfS:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: fadd s0, s1, s0
-; CHECK-GI-NEXT: fadd s0, s0, s2
+; CHECK-GI-NEXT: faddp s0, v0.2s
; CHECK-GI-NEXT: ret
%r = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
ret float %r
@@ -82,15 +78,12 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-FP16-LABEL: add_HalfH:
; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h0
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: fadd h1, h0, h1
; CHECK-GI-FP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h3
; CHECK-GI-FP16-NEXT: fadd h0, h1, h0
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
@@ -202,22 +195,18 @@ define half @add_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-FP16-LABEL: add_H:
; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: adrp x8, .LCPI2_0
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h0
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[3]
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[2]
+; CHECK-GI-FP16-NEXT: faddp h2, v0.2h
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT: fadd h1, h2, h1
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[4]
; CHECK-GI-FP16-NEXT: fadd h1, h1, h3
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[4]
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[5]
; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[5]
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h3
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[6]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[6]
; CHECK-GI-FP16-NEXT: mov h0, v0.h[7]
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
; CHECK-GI-FP16-NEXT: fadd h1, h1, h3
+; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
; CHECK-GI-FP16-NEXT: fadd h0, h1, h0
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
@@ -225,44 +214,23 @@ define half @add_H(<8 x half> %bin.rdx) {
}
define float @add_S(<4 x float> %bin.rdx) {
-; CHECK-SD-LABEL: add_S:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov s1, v0.s[2]
-; CHECK-SD-NEXT: faddp s2, v0.2s
-; CHECK-SD-NEXT: mov s0, v0.s[3]
-; CHECK-SD-NEXT: fadd s1, s2, s1
-; CHECK-SD-NEXT: fadd s0, s1, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: add_S:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: fadd s1, s1, s0
-; CHECK-GI-NEXT: mov s0, v0.s[3]
-; CHECK-GI-NEXT: fadd s1, s1, s2
-; CHECK-GI-NEXT: fadd s1, s1, s3
-; CHECK-GI-NEXT: fadd s0, s1, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: add_S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s1, v0.s[2]
+; CHECK-NEXT: faddp s2, v0.2s
+; CHECK-NEXT: mov s0, v0.s[3]
+; CHECK-NEXT: fadd s1, s2, s1
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
%r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
ret float %r
}
define double @add_D(<2 x double> %bin.rdx) {
-; CHECK-SD-LABEL: add_D:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: faddp d0, v0.2d
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: add_D:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-GI-NEXT: mov d2, v0.d[1]
-; CHECK-GI-NEXT: fmov d1, x8
-; CHECK-GI-NEXT: fadd d0, d1, d0
-; CHECK-GI-NEXT: fadd d0, d0, d2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: add_D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp d0, v0.2d
+; CHECK-NEXT: ret
%r = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
ret double %r
}
@@ -464,23 +432,19 @@ define half @add_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-FP16-LABEL: add_2H:
; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: adrp x8, .LCPI5_0
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h4, v0.h[2]
-; CHECK-GI-FP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0]
-; CHECK-GI-FP16-NEXT: fadd h2, h2, h0
-; CHECK-GI-FP16-NEXT: fadd h2, h2, h3
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: faddp h3, v0.2h
+; CHECK-GI-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-GI-FP16-NEXT: fadd h2, h3, h2
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[4]
; CHECK-GI-FP16-NEXT: fadd h2, h2, h4
-; CHECK-GI-FP16-NEXT: mov h4, v0.h[4]
+; CHECK-GI-FP16-NEXT: mov h4, v0.h[5]
; CHECK-GI-FP16-NEXT: fadd h2, h2, h3
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[5]
-; CHECK-GI-FP16-NEXT: fadd h2, h2, h4
-; CHECK-GI-FP16-NEXT: mov h4, v0.h[6]
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[6]
; CHECK-GI-FP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-FP16-NEXT: fadd h2, h2, h4
; CHECK-GI-FP16-NEXT: fadd h2, h2, h3
; CHECK-GI-FP16-NEXT: mov h3, v1.h[2]
-; CHECK-GI-FP16-NEXT: fadd h2, h2, h4
; CHECK-GI-FP16-NEXT: fadd h0, h2, h0
; CHECK-GI-FP16-NEXT: mov h2, v1.h[1]
; CHECK-GI-FP16-NEXT: fadd h0, h0, h1
@@ -502,95 +466,51 @@ define half @add_2H(<16 x half> %bin.rdx) {
}
define float @add_2S(<8 x float> %bin.rdx) {
-; CHECK-SD-LABEL: add_2S:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov s2, v0.s[2]
-; CHECK-SD-NEXT: faddp s3, v0.2s
-; CHECK-SD-NEXT: mov s0, v0.s[3]
-; CHECK-SD-NEXT: fadd s2, s3, s2
-; CHECK-SD-NEXT: mov s3, v1.s[2]
-; CHECK-SD-NEXT: fadd s0, s2, s0
-; CHECK-SD-NEXT: mov s2, v1.s[1]
-; CHECK-SD-NEXT: fadd s0, s0, s1
-; CHECK-SD-NEXT: mov s1, v1.s[3]
-; CHECK-SD-NEXT: fadd s0, s0, s2
-; CHECK-SD-NEXT: fadd s0, s0, s3
-; CHECK-SD-NEXT: fadd s0, s0, s1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: add_2S:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: movi v2.2s, #128, lsl #24
-; CHECK-GI-NEXT: mov s3, v0.s[1]
-; CHECK-GI-NEXT: mov s4, v0.s[2]
-; CHECK-GI-NEXT: fadd s2, s2, s0
-; CHECK-GI-NEXT: mov s0, v0.s[3]
-; CHECK-GI-NEXT: fadd s2, s2, s3
-; CHECK-GI-NEXT: mov s3, v1.s[2]
-; CHECK-GI-NEXT: fadd s2, s2, s4
-; CHECK-GI-NEXT: fadd s0, s2, s0
-; CHECK-GI-NEXT: mov s2, v1.s[1]
-; CHECK-GI-NEXT: fadd s0, s0, s1
-; CHECK-GI-NEXT: mov s1, v1.s[3]
-; CHECK-GI-NEXT: fadd s0, s0, s2
-; CHECK-GI-NEXT: fadd s0, s0, s3
-; CHECK-GI-NEXT: fadd s0, s0, s1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: add_2S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s2, v0.s[2]
+; CHECK-NEXT: faddp s3, v0.2s
+; CHECK-NEXT: mov s0, v0.s[3]
+; CHECK-NEXT: fadd s2, s3, s2
+; CHECK-NEXT: mov s3, v1.s[2]
+; CHECK-NEXT: fadd s0, s2, s0
+; CHECK-NEXT: mov s2, v1.s[1]
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: mov s1, v1.s[3]
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s0, s0, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
%r = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
ret float %r
}
define double @add_2D(<4 x double> %bin.rdx) {
-; CHECK-SD-LABEL: add_2D:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: faddp d0, v0.2d
-; CHECK-SD-NEXT: mov d2, v1.d[1]
-; CHECK-SD-NEXT: fadd d0, d0, d1
-; CHECK-SD-NEXT: fadd d0, d0, d2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: add_2D:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-GI-NEXT: mov d3, v0.d[1]
-; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: fadd d0, d2, d0
-; CHECK-GI-NEXT: mov d2, v1.d[1]
-; CHECK-GI-NEXT: fadd d0, d0, d3
-; CHECK-GI-NEXT: fadd d0, d0, d1
-; CHECK-GI-NEXT: fadd d0, d0, d2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: add_2D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp d0, v0.2d
+; CHECK-NEXT: mov d2, v1.d[1]
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: fadd d0, d0, d2
+; CHECK-NEXT: ret
%r = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
ret double %r
}
; Added at least one test where the start value is not -0.0.
define float @add_S_init_42(<4 x float> %bin.rdx) {
-; CHECK-SD-LABEL: add_S_init_42:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000
-; CHECK-SD-NEXT: mov s2, v0.s[1]
-; CHECK-SD-NEXT: mov s3, v0.s[2]
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: fadd s1, s0, s1
-; CHECK-SD-NEXT: mov s0, v0.s[3]
-; CHECK-SD-NEXT: fadd s1, s1, s2
-; CHECK-SD-NEXT: fadd s1, s1, s3
-; CHECK-SD-NEXT: fadd s0, s1, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: add_S_init_42:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v0.s[2]
-; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: fadd s1, s1, s0
-; CHECK-GI-NEXT: mov s0, v0.s[3]
-; CHECK-GI-NEXT: fadd s1, s1, s2
-; CHECK-GI-NEXT: fadd s1, s1, s3
-; CHECK-GI-NEXT: fadd s0, s1, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: add_S_init_42:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT: mov s2, v0.s[1]
+; CHECK-NEXT: mov s3, v0.s[2]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: fadd s1, s0, s1
+; CHECK-NEXT: mov s0, v0.s[3]
+; CHECK-NEXT: fadd s1, s1, s2
+; CHECK-NEXT: fadd s1, s1, s3
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
%r = call float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
ret float %r
}
@@ -604,5 +524,3 @@ declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index aaba379ad63d..99c6808724b5 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @add_HalfS(<2 x float> %bin.rdx) {
; CHECK-LABEL: add_HalfS:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index 32ce4d6eb967..e1b21705c95f 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @mul_HalfS(<2 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_HalfS:
@@ -13,11 +13,9 @@ define float @mul_HalfS(<2 x float> %bin.rdx) {
;
; CHECK-GI-LABEL: mul_HalfS:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s1, #1.00000000
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: fmul s0, s1, s0
-; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
ret float %r
@@ -80,14 +78,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: fmov h1, #1.00000000
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h1, h1, h0
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h1, h0, h1
; CHECK-GI-FP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-FP16-NEXT: fmul h1, h1, h2
-; CHECK-GI-FP16-NEXT: fmul h1, h1, h3
; CHECK-GI-FP16-NEXT: fmul h0, h1, h0
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
@@ -193,9 +189,7 @@ define half @mul_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-FP16-LABEL: mul_H:
; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: fmov h1, #1.00000000
-; CHECK-GI-FP16-NEXT: fmul h1, h1, h0
-; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h1, h0, v0.h[1]
; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[2]
; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[3]
; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[4]
@@ -208,37 +202,21 @@ define half @mul_H(<8 x half> %bin.rdx) {
}
define float @mul_S(<4 x float> %bin.rdx) {
-; CHECK-SD-LABEL: mul_S:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmul s1, s0, v0.s[1]
-; CHECK-SD-NEXT: fmul s1, s1, v0.s[2]
-; CHECK-SD-NEXT: fmul s0, s1, v0.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mul_S:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s1, #1.00000000
-; CHECK-GI-NEXT: fmul s1, s1, s0
-; CHECK-GI-NEXT: fmul s1, s1, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v0.s[2]
-; CHECK-GI-NEXT: fmul s0, s1, v0.s[3]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mul_S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul s1, s0, v0.s[1]
+; CHECK-NEXT: fmul s1, s1, v0.s[2]
+; CHECK-NEXT: fmul s0, s1, v0.s[3]
+; CHECK-NEXT: ret
%r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
ret float %r
}
define double @mul_D(<2 x double> %bin.rdx) {
-; CHECK-SD-LABEL: mul_D:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmul d0, d0, v0.d[1]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mul_D:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov d1, #1.00000000
-; CHECK-GI-NEXT: fmul d1, d1, d0
-; CHECK-GI-NEXT: fmul d0, d1, v0.d[1]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mul_D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul d0, d0, v0.d[1]
+; CHECK-NEXT: ret
%r = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx)
ret double %r
}
@@ -427,9 +405,7 @@ define half @mul_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-FP16-LABEL: mul_2H:
; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: fmov h2, #1.00000000
-; CHECK-GI-FP16-NEXT: fmul h2, h2, h0
-; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[2]
; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[3]
; CHECK-GI-FP16-NEXT: fmul h2, h2, v0.h[4]
@@ -450,49 +426,27 @@ define half @mul_2H(<16 x half> %bin.rdx) {
}
define float @mul_2S(<8 x float> %bin.rdx) {
-; CHECK-SD-LABEL: mul_2S:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmul s2, s0, v0.s[1]
-; CHECK-SD-NEXT: fmul s2, s2, v0.s[2]
-; CHECK-SD-NEXT: fmul s0, s2, v0.s[3]
-; CHECK-SD-NEXT: fmul s0, s0, s1
-; CHECK-SD-NEXT: fmul s0, s0, v1.s[1]
-; CHECK-SD-NEXT: fmul s0, s0, v1.s[2]
-; CHECK-SD-NEXT: fmul s0, s0, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mul_2S:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s2, #1.00000000
-; CHECK-GI-NEXT: fmul s2, s2, s0
-; CHECK-GI-NEXT: fmul s2, s2, v0.s[1]
-; CHECK-GI-NEXT: fmul s2, s2, v0.s[2]
-; CHECK-GI-NEXT: fmul s0, s2, v0.s[3]
-; CHECK-GI-NEXT: fmul s0, s0, s1
-; CHECK-GI-NEXT: fmul s0, s0, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, v1.s[2]
-; CHECK-GI-NEXT: fmul s0, s0, v1.s[3]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mul_2S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul s2, s0, v0.s[1]
+; CHECK-NEXT: fmul s2, s2, v0.s[2]
+; CHECK-NEXT: fmul s0, s2, v0.s[3]
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fmul s0, s0, v1.s[1]
+; CHECK-NEXT: fmul s0, s0, v1.s[2]
+; CHECK-NEXT: fmul s0, s0, v1.s[3]
+; CHECK-NEXT: ret
%r = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
ret float %r
}
define double @mul_2D(<4 x double> %bin.rdx) {
-; CHECK-SD-LABEL: mul_2D:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: fmul d0, d0, v0.d[1]
-; CHECK-SD-NEXT: fmul d0, d0, d1
-; CHECK-SD-NEXT: fmul d0, d0, v1.d[1]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mul_2D:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov d2, #1.00000000
-; CHECK-GI-NEXT: fmul d2, d2, d0
-; CHECK-GI-NEXT: fmul d0, d2, v0.d[1]
-; CHECK-GI-NEXT: fmul d0, d0, d1
-; CHECK-GI-NEXT: fmul d0, d0, v1.d[1]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mul_2D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul d0, d0, v0.d[1]
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: fmul d0, d0, v1.d[1]
+; CHECK-NEXT: ret
%r = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx)
ret double %r
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index 67b4ebb33824..e85384e46222 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @mul_HalfS(<2 x float> %bin.rdx) {
; CHECK-SD-LABEL: mul_HalfS:
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index c602837a6e6c..d050eaf6646d 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64-none-eabi -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
; SHA3-LABEL: xar:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 25cee8724497..b04bc04ab226 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -338,8 +338,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -350,8 +350,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -416,8 +416,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -428,8 +428,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -495,8 +495,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -507,8 +507,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -563,8 +563,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i32:
@@ -574,8 +574,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
ret void
@@ -629,8 +629,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i32_offset:
@@ -640,8 +640,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -696,8 +696,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system:
@@ -707,8 +707,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -780,8 +780,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -792,8 +792,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -861,8 +861,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i32_offset_addr64:
@@ -872,8 +872,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -937,8 +937,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_store_dword v[0:1], v2
@@ -952,8 +952,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -1023,8 +1023,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_store_dword v[0:1], v2
@@ -1038,8 +1038,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -1110,8 +1110,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_store_dword v[0:1], v2
@@ -1125,8 +1125,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -1183,8 +1183,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_dec v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i32:
@@ -1196,8 +1196,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
ret void
@@ -1256,8 +1256,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_dec v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
@@ -1269,8 +1269,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -1330,8 +1330,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_dec v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system:
@@ -1343,8 +1343,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
@@ -1430,8 +1430,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -1450,8 +1450,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -1532,8 +1532,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_dec v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
@@ -1549,8 +1549,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
@@ -1628,8 +1628,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1644,8 +1644,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
@@ -1729,8 +1729,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1745,8 +1745,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
@@ -1807,8 +1807,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i64:
@@ -1821,8 +1821,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
ret void
@@ -1885,8 +1885,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i64_offset:
@@ -1899,8 +1899,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -1964,8 +1964,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system:
@@ -1978,8 +1978,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
@@ -2075,8 +2075,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2097,8 +2097,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2183,8 +2183,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
@@ -2201,8 +2201,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
@@ -2651,8 +2651,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -2664,8 +2664,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2734,8 +2734,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -2747,8 +2747,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2818,8 +2818,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -2831,8 +2831,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2891,8 +2891,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i64:
@@ -2903,8 +2903,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
ret void
@@ -2962,8 +2962,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i64_offset:
@@ -2974,8 +2974,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -3034,8 +3034,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system:
@@ -3046,8 +3046,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -3123,8 +3123,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -3136,8 +3136,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3209,8 +3209,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64:
@@ -3221,8 +3221,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[0:1] offset:40
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 5e013371d873..f6a997fb0fb0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -338,8 +338,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -350,8 +350,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -416,8 +416,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -428,8 +428,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -495,8 +495,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -507,8 +507,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -563,8 +563,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32:
@@ -574,8 +574,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
ret void
@@ -629,8 +629,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32_offset:
@@ -640,8 +640,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -696,8 +696,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system:
@@ -707,8 +707,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -780,8 +780,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -792,8 +792,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -861,8 +861,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@@ -872,8 +872,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -1322,8 +1322,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1335,8 +1335,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1405,8 +1405,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1418,8 +1418,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1489,8 +1489,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1502,8 +1502,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1562,8 +1562,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64:
@@ -1574,8 +1574,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
ret void
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64_offset:
@@ -1645,8 +1645,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -1705,8 +1705,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system:
@@ -1717,8 +1717,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -1794,8 +1794,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1807,8 +1807,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1880,8 +1880,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64:
@@ -1892,8 +1892,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
@@ -1957,8 +1957,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_store_dword v[0:1], v2
@@ -1972,8 +1972,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -2043,8 +2043,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_store_dword v[0:1], v2
@@ -2058,8 +2058,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -2130,8 +2130,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_store_dword v[0:1], v2
@@ -2145,8 +2145,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -2203,8 +2203,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_inc v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32:
@@ -2216,8 +2216,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
ret void
@@ -2276,8 +2276,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_inc v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset:
@@ -2289,8 +2289,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -2350,8 +2350,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_inc v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system:
@@ -2363,8 +2363,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
@@ -2450,8 +2450,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -2470,8 +2470,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -2552,8 +2552,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_inc v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -2569,8 +2569,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
@@ -2744,8 +2744,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2760,8 +2760,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
@@ -2845,8 +2845,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2861,8 +2861,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
@@ -2947,8 +2947,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2963,8 +2963,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX11-NEXT: v_mov_b32_e32 v3, s3
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
@@ -3025,8 +3025,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64:
@@ -3039,8 +3039,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
ret void
@@ -3103,8 +3103,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64_offset:
@@ -3117,8 +3117,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
%result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -3182,8 +3182,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system:
@@ -3196,8 +3196,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
%result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
@@ -3293,8 +3293,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@@ -3315,8 +3315,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@@ -3401,8 +3401,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -3419,8 +3419,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
new file mode 100644
index 000000000000..5bea13af1649
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+define void @main(<19 x i32> %arg) {
+; GCN-LABEL: main:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_mov_b32 s12, s4
+; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 s13, s4
+; GCN-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: s_mov_b32 s6, s4
+; GCN-NEXT: s_mov_b32 s7, s4
+; GCN-NEXT: s_mov_b32 s8, s4
+; GCN-NEXT: s_mov_b32 s9, s4
+; GCN-NEXT: s_mov_b32 s10, s4
+; GCN-NEXT: s_mov_b32 s11, s4
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: main:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_mov_b32 s10, s4
+; GFX10-NEXT: s_mov_b32 s11, s4
+; GFX10-NEXT: v_mov_b32_e32 v4, s10
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, s11
+; GFX10-NEXT: s_mov_b32 s5, s4
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s7, s4
+; GFX10-NEXT: s_mov_b32 s8, s4
+; GFX10-NEXT: s_mov_b32 s9, s4
+; GFX10-NEXT: image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: main:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v5, s7
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %i = bitcast <19 x i32> %arg to <38 x i16>
+ %i1 = extractelement <38 x i16> %i, i64 0
+ %i2 = icmp eq i16 %i1, 0
+ %i3 = zext i1 %i2 to i32
+ %i4 = bitcast i32 %i3 to float
+ %i5 = insertelement <4 x float> zeroinitializer, float %i4, i64 0
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %i5, i32 0, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
+ ret void
+}
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10PLUS: {{.*}}
+; GPRIDX: {{.*}}
+; MOVREL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a..24652982c658 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -65,6 +65,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
@@ -95,25 +97,23 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0
; MUBUF-NEXT: s_nop 0
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
-; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
-; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
-; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
-; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
-; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
-; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
-; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
-; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
-; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
-; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
-; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
-; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
-; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
-; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
+; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
+; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8
+; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12
+; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16
+; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20
+; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24
+; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28
+; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32
+; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36
+; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40
+; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44
+; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48
+; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52
+; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56
+; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
@@ -160,6 +160,7 @@ define amdgpu_kernel void @kernel_caller_byval() {
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
@@ -175,16 +176,15 @@ define amdgpu_kernel void @kernel_caller_byval() {
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
-; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; FLATSCR-NEXT: s_nop 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
-; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
-; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
-; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
-; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
-; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
-; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
+; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:8
+; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:16
+; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:24
+; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:32
+; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:40
+; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:48
+; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:56
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
index 9580326d7b78..0d793654f7ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @stack_write_fi() {
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s5
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index ac153183be64..1e1c90d142a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2626,6 +2626,132 @@ entry:
ret double %ext
}
+define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userData, i32 %sel) {
+; GCN-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_mov_b32_e32 v4, s5
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s7
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v7, s8
+; GCN-NEXT: v_mov_b32_e32 v8, s9
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v9, s10
+; GCN-NEXT: v_mov_b32_e32 v10, s11
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v11, s12
+; GCN-NEXT: v_mov_b32_e32 v12, s13
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT: v_mov_b32_e32 v13, s14
+; GCN-NEXT: v_mov_b32_e32 v14, s15
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr2 killed $exec
+; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr3 killed $exec
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: v_mov_b32_e32 v2, s5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: s_mov_b32 s0, s14
+; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, s14
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %bc = bitcast <14 x float> %userData to <7 x double>
+ %ext = extractelement <7 x double> %bc, i32 %sel
+ ret double %ext
+}
+
+define amdgpu_ps i64 @dyn_extract_v7i64_s_v_bitcast(<14 x i32> inreg %userData, i32 %sel) {
+; GCN-LABEL: dyn_extract_v7i64_s_v_bitcast:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s10
+; GCN-NEXT: s_mov_b32 s1, s11
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_extract_v7i64_s_v_bitcast:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 s0, s10
+; GFX10PLUS-NEXT: s_mov_b32 s1, s11
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %.bc = bitcast <14 x i32> %userData to <7 x i64>
+ %ext = extractelement <7 x i64> %.bc, i32 4
+ ret i64 %ext
+}
+
define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) {
; GCN-LABEL: dyn_extract_v7f64_s_v:
; GCN: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index dcad707acaf2..b4b95fdab4ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -12,7 +12,7 @@ define amdgpu_ps void @amdgpu_ps() {
; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4
; MESA-NEXT: s_mov_b64 s[0:1], src_private_base
; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; MESA-NEXT: v_mov_b32_e32 v0, 4
+; MESA-NEXT: v_mov_b32_e32 v0, 0
; MESA-NEXT: v_mov_b32_e32 v1, s1
; MESA-NEXT: v_mov_b32_e32 v2, 0
; MESA-NEXT: flat_store_dword v[0:1], v2
@@ -24,7 +24,7 @@ define amdgpu_ps void @amdgpu_ps() {
; PAL-NEXT: s_getpc_b64 s[2:3]
; PAL-NEXT: s_mov_b32 s2, s0
; PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; PAL-NEXT: v_mov_b32_e32 v0, 4
+; PAL-NEXT: v_mov_b32_e32 v0, 0
; PAL-NEXT: v_mov_b32_e32 v2, 0
; PAL-NEXT: s_waitcnt lgkmcnt(0)
; PAL-NEXT: s_and_b32 s3, s3, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 75065f677b65..921bdb5015c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -15,11 +15,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
-; GFX9-NEXT: s_add_i32 s1, s1, 4
+; GFX9-NEXT: s_add_i32 s1, s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s0, 4
+; GFX9-NEXT: s_add_i32 s0, s0, 0
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -36,8 +36,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-NEXT: s_add_i32 s0, s0, 4
-; GFX10-NEXT: s_add_i32 s1, s1, 4
+; GFX10-NEXT: s_add_i32 s0, s0, 0
+; GFX10-NEXT: s_add_i32 s1, s1, 0
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -51,12 +51,12 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_add_i32 s1, s1, 4
+; GFX940-NEXT: s_add_i32 s1, s1, 0
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
@@ -69,10 +69,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_add_i32 s0, s0, 4
+; GFX11-NEXT: s_add_i32 s0, s0, 0
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
@@ -87,9 +87,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -109,12 +109,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT: v_add_u32_e32 v1, 4, v1
+; GFX9-NEXT: v_add_u32_e32 v1, 0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -129,8 +129,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
@@ -143,9 +143,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT: v_add_u32_e32 v0, 0, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -156,9 +156,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
@@ -169,9 +169,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -324,16 +324,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-NEXT: scratch_load_dword v0, off, s1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_addk_i32 s1, 0x104
+; GFX9-NEXT: s_addk_i32 s1, 0x100
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_addk_i32 s0, 0x104
+; GFX9-NEXT: s_addk_i32 s0, 0x100
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -345,15 +345,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-NEXT: s_addk_i32 s0, 0x104
-; GFX10-NEXT: s_addk_i32 s1, 0x104
+; GFX10-NEXT: s_addk_i32 s0, 0x100
+; GFX10-NEXT: s_addk_i32 s1, 0x100
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -363,42 +363,42 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_addk_i32 s1, 0x104
+; GFX940-NEXT: s_addk_i32 s1, 0x100
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_addk_i32 s0, 0x104
+; GFX11-NEXT: s_addk_i32 s0, 0x100
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:260 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -408,9 +408,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -432,16 +432,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
+; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 0x104, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -455,11 +455,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
@@ -468,15 +468,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
;
; GFX940-LABEL: store_load_vindex_small_offset_kernel:
; GFX940: ; %bb.0: ; %bb
-; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT: scratch_store_dword v1, v2, off offset:260 sc0 sc1
+; GFX940-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 0x104, v0
+; GFX940-NEXT: v_add_u32_e32 v0, 0x100, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -485,12 +485,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc
+; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x104, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
@@ -500,12 +500,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -708,7 +708,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -718,9 +718,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -812,12 +812,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -1003,11 +1003,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX12-LABEL: store_load_large_imm_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -1116,7 +1116,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0, v0
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -1133,7 +1133,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -1146,7 +1146,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX940-NEXT: v_mov_b32_e32 v1, 15
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT: v_add_u32_e32 v0, 0, v0
; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1
@@ -1160,7 +1160,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -1173,9 +1173,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index faab70cac39c..a1c99f5cf602 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -11,7 +11,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10
; GCN-NEXT: s_add_u32 s0, s0, s7
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v16, 0x100
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
@@ -35,189 +35,189 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:260
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:264
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:268
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:272
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:276
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:280
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:288
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:292
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:296
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:300
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:304
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:308
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:312
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:316
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:12
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:20
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:28
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:36
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:40
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:44
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:52
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60
; GCN-NEXT: v_mov_b32_e32 v0, s52
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:320
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; GCN-NEXT: v_mov_b32_e32 v0, s53
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:324
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
; GCN-NEXT: v_mov_b32_e32 v0, s54
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:328
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
; GCN-NEXT: v_mov_b32_e32 v0, s55
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:332
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
; GCN-NEXT: v_mov_b32_e32 v0, s56
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:336
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
; GCN-NEXT: v_mov_b32_e32 v0, s57
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:340
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
; GCN-NEXT: v_mov_b32_e32 v0, s58
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:344
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
; GCN-NEXT: v_mov_b32_e32 v0, s59
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:348
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
; GCN-NEXT: v_mov_b32_e32 v0, s60
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:352
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
; GCN-NEXT: v_mov_b32_e32 v0, s61
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:356
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
; GCN-NEXT: v_mov_b32_e32 v0, s62
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:360
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
; GCN-NEXT: v_mov_b32_e32 v0, s63
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:364
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
; GCN-NEXT: v_mov_b32_e32 v0, s64
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:368
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
; GCN-NEXT: v_mov_b32_e32 v0, s65
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:372
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; GCN-NEXT: v_mov_b32_e32 v0, s66
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:376
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; GCN-NEXT: v_mov_b32_e32 v0, s67
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:380
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:384
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:388
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:392
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:136
; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:396
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:140
; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:400
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:144
; GCN-NEXT: v_mov_b32_e32 v0, s9
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:404
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:148
; GCN-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:408
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:152
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:412
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:156
; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:416
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:160
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:420
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:164
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:424
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:168
; GCN-NEXT: v_mov_b32_e32 v0, s15
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:428
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:172
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:432
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:176
; GCN-NEXT: v_mov_b32_e32 v0, s17
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:436
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:180
; GCN-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:440
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:184
; GCN-NEXT: v_mov_b32_e32 v0, s19
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:444
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:188
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:448
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:192
; GCN-NEXT: v_mov_b32_e32 v0, s37
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:452
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:196
; GCN-NEXT: v_mov_b32_e32 v0, s38
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:456
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:200
; GCN-NEXT: v_mov_b32_e32 v0, s39
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:460
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:204
; GCN-NEXT: v_mov_b32_e32 v0, s40
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:464
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:208
; GCN-NEXT: v_mov_b32_e32 v0, s41
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:468
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:212
; GCN-NEXT: v_mov_b32_e32 v0, s42
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:472
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:216
; GCN-NEXT: v_mov_b32_e32 v0, s43
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:476
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:220
; GCN-NEXT: v_mov_b32_e32 v0, s44
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:480
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:224
; GCN-NEXT: v_mov_b32_e32 v0, s45
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:484
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:228
; GCN-NEXT: v_mov_b32_e32 v0, s46
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:488
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:232
; GCN-NEXT: v_mov_b32_e32 v0, s47
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:492
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:236
; GCN-NEXT: v_mov_b32_e32 v0, s48
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:496
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240
; GCN-NEXT: v_mov_b32_e32 v0, s49
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:500
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244
; GCN-NEXT: v_mov_b32_e32 v0, s50
; GCN-NEXT: s_and_b32 s4, s25, 63
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:504
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248
; GCN-NEXT: v_mov_b32_e32 v0, s51
; GCN-NEXT: s_lshl_b32 s4, s4, 2
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:508
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252
; GCN-NEXT: v_add_u32_e32 v0, s4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s24
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:300
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:304
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:308
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:312
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:316
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:320
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:324
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:328
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:332
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:336
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:340
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:344
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:348
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:352
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:356
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:360
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:364
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:368
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:372
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:376
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:380
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:384
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:388
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:392
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:396
-; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:400
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:404
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:408
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:412
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:416
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 0 offset:420
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:424
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:428
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:432
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:436
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:440
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:444
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:448
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:452
-; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:456
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:460
-; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:464
-; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:468
-; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:472
-; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:476
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:480
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:484
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:488
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:492
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:496
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:64
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:68
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:72
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:76
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:80
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:84
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:88
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:92
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:96
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:100
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:104
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:108
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:112
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:116
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:120
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:124
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:128
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:132
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:136
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:140
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:144
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:148
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:152
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:156
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:160
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 0 offset:164
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:168
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:172
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:176
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:180
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:184
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:188
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:192
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:196
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:200
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:204
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:208
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:212
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:216
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:220
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:224
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:228
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:232
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:236
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:240
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:244
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:248
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:252
; GCN-NEXT: s_waitcnt vmcnt(60)
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21]
; GCN-NEXT: s_waitcnt vmcnt(57)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 84da311108ce..014534ab79fe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -50,7 +50,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
@@ -88,7 +88,7 @@ body: |
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
;
@@ -127,7 +127,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index 30975a8937db..dcf9e169f586 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -59,7 +59,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
@@ -97,7 +97,7 @@ body: |
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
;
@@ -136,7 +136,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
index 10766b0f79d8..25652b69afa9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
@@ -299,129 +299,6 @@ body: |
S_NOP 0, implicit %12
...
---
-name: legal_v13s32
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12
- ; CHECK-LABEL: name: legal_v13s32
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<13 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32)
- ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<13 x s32>)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = COPY $vgpr3
- %4:_(s32) = COPY $vgpr4
- %5:_(s32) = COPY $vgpr5
- %6:_(s32) = COPY $vgpr6
- %7:_(s32) = COPY $vgpr7
- %8:_(s32) = COPY $vgpr8
- %9:_(s32) = COPY $vgpr9
- %10:_(s32) = COPY $vgpr10
- %11:_(s32) = COPY $vgpr11
- %12:_(s32) = COPY $vgpr12
- %13:_(<13 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12
- S_NOP 0, implicit %13
-...
----
-name: legal_v14s32
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13
- ; CHECK-LABEL: name: legal_v14s32
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<14 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32)
- ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<14 x s32>)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = COPY $vgpr3
- %4:_(s32) = COPY $vgpr4
- %5:_(s32) = COPY $vgpr5
- %6:_(s32) = COPY $vgpr6
- %7:_(s32) = COPY $vgpr7
- %8:_(s32) = COPY $vgpr8
- %9:_(s32) = COPY $vgpr9
- %10:_(s32) = COPY $vgpr10
- %11:_(s32) = COPY $vgpr11
- %12:_(s32) = COPY $vgpr12
- %13:_(s32) = COPY $vgpr13
- %14:_(<14 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13
- S_NOP 0, implicit %14
-...
----
-name: legal_v15s32
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
- ; CHECK-LABEL: name: legal_v15s32
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<15 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
- ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<15 x s32>)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = COPY $vgpr3
- %4:_(s32) = COPY $vgpr4
- %5:_(s32) = COPY $vgpr5
- %6:_(s32) = COPY $vgpr6
- %7:_(s32) = COPY $vgpr7
- %8:_(s32) = COPY $vgpr8
- %9:_(s32) = COPY $vgpr9
- %10:_(s32) = COPY $vgpr10
- %11:_(s32) = COPY $vgpr11
- %12:_(s32) = COPY $vgpr12
- %13:_(s32) = COPY $vgpr13
- %14:_(s32) = COPY $vgpr14
- %15:_(<15 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14
- S_NOP 0, implicit %15
-...
----
name: legal_v16s32
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index 155f4c92f97d..9698a3894db6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -1,11 +1,12 @@
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs| FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
@@ -200,4 +201,4 @@ attributes #1 = { nounwind }
!2 = !{i32 1, i32 1, i32 64}
!llvm.module.flags = !{!99}
-!99 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!99 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 03cd8332bb3a..2727fdec035d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -27,32 +27,32 @@ define amdgpu_kernel void @system_one_as_acquire() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: system_one_as_acquire
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: system_one_as_acquire
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 1015
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: system_one_as_acquire
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("one-as") acquire
@@ -115,32 +115,32 @@ define amdgpu_kernel void @system_one_as_acq_rel() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: system_one_as_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: system_one_as_acq_rel
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 1015
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: system_one_as_acq_rel
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("one-as") acq_rel
@@ -164,32 +164,32 @@ define amdgpu_kernel void @system_one_as_seq_cst() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: system_one_as_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: system_one_as_seq_cst
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 1015
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: system_one_as_seq_cst
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("one-as") seq_cst
@@ -329,32 +329,32 @@ define amdgpu_kernel void @agent_one_as_acquire() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: agent_one_as_acquire
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: agent_one_as_acquire
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 1015
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: agent_one_as_acquire
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("agent-one-as") acquire
@@ -417,32 +417,32 @@ define amdgpu_kernel void @agent_one_as_acq_rel() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: agent_one_as_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: agent_one_as_acq_rel
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 1015
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: agent_one_as_acq_rel
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("agent-one-as") acq_rel
@@ -466,32 +466,32 @@ define amdgpu_kernel void @agent_one_as_seq_cst() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: agent_one_as_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: agent_one_as_seq_cst
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 1015
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: agent_one_as_seq_cst
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("agent-one-as") seq_cst
@@ -769,32 +769,32 @@ define amdgpu_kernel void @system_acquire() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: system_acquire
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: system_acquire
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 7
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: system_acquire
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence acquire
@@ -857,32 +857,32 @@ define amdgpu_kernel void @system_acq_rel() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: system_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: system_acq_rel
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 7
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: system_acq_rel
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence acq_rel
@@ -906,32 +906,32 @@ define amdgpu_kernel void @system_seq_cst() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: system_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: system_seq_cst
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 7
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: system_seq_cst
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence seq_cst
@@ -1071,32 +1071,32 @@ define amdgpu_kernel void @agent_acquire() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: agent_acquire
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: agent_acquire
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 7
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: agent_acquire
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("agent") acquire
@@ -1159,32 +1159,32 @@ define amdgpu_kernel void @agent_acq_rel() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: agent_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: agent_acq_rel
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 7
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: agent_acq_rel
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("agent") acq_rel
@@ -1208,32 +1208,32 @@ define amdgpu_kernel void @agent_seq_cst() {
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: agent_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX10CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: agent_seq_cst
; GFX11WGP: bb.0.entry:
; GFX11WGP-NEXT: S_WAITCNT_soft 7
; GFX11WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11WGP-NEXT: S_ENDPGM 0
;
; GFX11CU-LABEL: name: agent_seq_cst
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
- ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: BUFFER_GL1_INV implicit $exec
+ ; GFX11CU-NEXT: BUFFER_GL0_INV implicit $exec
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 3e572f95ab9a..c92b78cd4557 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -365,8 +365,8 @@ define amdgpu_cs_chain void @alloca_and_call() {
; GISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi
; GISEL-GFX11-NEXT: s_mov_b32 s32, 16
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-GFX11-NEXT: s_endpgm
;
@@ -378,8 +378,8 @@ define amdgpu_cs_chain void @alloca_and_call() {
; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_movk_i32 s32, 0x200
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
@@ -391,8 +391,8 @@ define amdgpu_cs_chain void @alloca_and_call() {
; DAGISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi
; DAGISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 16
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off
+; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; DAGISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-GFX11-NEXT: s_endpgm
;
@@ -404,8 +404,8 @@ define amdgpu_cs_chain void @alloca_and_call() {
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
-; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
+; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; DAGISEL-GFX10-NEXT: s_movk_i32 s32, 0x200
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; DAGISEL-GFX10-NEXT: s_endpgm
@@ -867,7 +867,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0
+; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 0, v0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
@@ -882,7 +882,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
-; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0
+; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
@@ -898,7 +898,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32
+; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 0
; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; DAGISEL-GFX11-NEXT: s_endpgm
@@ -907,7 +907,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
-; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32
+; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 0
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 4190d073e256..8d9ed9bb4343 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -181,13 +181,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
@@ -198,13 +198,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
@@ -215,13 +215,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
@@ -232,13 +232,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo
@@ -254,13 +254,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
; GISEL-GFX11-LABEL: chain_preserve_to_chain:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
@@ -271,13 +271,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
; GISEL-GFX10-LABEL: chain_preserve_to_chain:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
@@ -288,13 +288,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
; DAGISEL-GFX11-LABEL: chain_preserve_to_chain:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
@@ -305,13 +305,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
; DAGISEL-GFX10-LABEL: chain_preserve_to_chain:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
@@ -327,7 +327,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
@@ -336,7 +336,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1
; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
@@ -349,7 +349,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
@@ -358,7 +358,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
@@ -370,7 +370,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
@@ -379,7 +379,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
@@ -392,7 +392,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
@@ -401,7 +401,7 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
@@ -422,8 +422,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:8
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:4
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off
; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
; GISEL-GFX11-NEXT: ;;#ASMSTART
@@ -433,8 +433,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4
-; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:8
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off
+; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:4
; GISEL-GFX11-NEXT: s_mov_b32 s0, s3
; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
@@ -442,8 +442,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
; GISEL-GFX10-NEXT: ;;#ASMSTART
@@ -453,8 +453,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi
; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11
; GISEL-GFX10-NEXT: s_clause 0x1
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4
-; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:8
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0
+; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:4
; GISEL-GFX10-NEXT: s_mov_b32 s0, s3
; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -463,8 +463,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: s_clause 0x1
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:8
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, off offset:4
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
@@ -474,8 +474,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11
; DAGISEL-GFX11-NEXT: s_clause 0x1
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:8
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, off offset:4
; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3
; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5]
@@ -483,8 +483,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
@@ -494,8 +494,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11
; DAGISEL-GFX10-NEXT: s_clause 0x1
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4
-; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:8
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0
+; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], 0 offset:4
; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3
; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -508,13 +508,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s2, s0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
-; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
@@ -525,13 +525,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s2, s0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
@@ -542,13 +542,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8
; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
-; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, off ; 4-byte Folded Reload
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
@@ -559,13 +559,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8
; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Reload
+; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], 0 ; 4-byte Folded Reload
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo
@@ -592,7 +592,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0
+; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 0, v0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
@@ -607,7 +607,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
-; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0
+; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
@@ -623,7 +623,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32
+; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 0
; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; DAGISEL-GFX11-NEXT: s_endpgm
@@ -632,7 +632,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
-; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32
+; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 0
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index ff2f2c69f850..93c18de8ca62 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -209,8 +209,8 @@ for.end:
; R600-VECT: MOVA_INT
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:6 ; encoding: [0x06,0x00,0x68,0xe0
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:2 ; encoding: [0x02,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x68,0xe0
; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0
@@ -238,8 +238,8 @@ entry:
; SI-PROMOTE-VECT-DAG: s_lshl_b32
; SI-PROMOTE-VECT-DAG: v_lshrrev
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:1 ; encoding: [0x01,0x00,0x60,0xe0
define amdgpu_kernel void @char_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i8], addrspace(5)
@@ -258,7 +258,7 @@ entry:
; FUNC-LABEL: {{^}}no_overlap:
;
; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ;
+; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%0 = alloca [3 x i8], align 1, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 954994afad3c..d33196ba3310 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -12,7 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-LABEL: {{^}}test_private_array_ptr_calc:
-; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
; SI-ALLOCA: s_barrier
; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index b5cccb6c4bb9..8ee0ee3b27ba 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -99,8 +99,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -132,8 +132,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -167,8 +167,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -203,8 +203,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -420,8 +420,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -455,8 +455,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -492,8 +492,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
@@ -530,8 +530,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
@@ -767,8 +767,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -812,8 +812,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
@@ -861,8 +861,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -910,8 +910,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -1128,8 +1128,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1163,8 +1163,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -1200,8 +1200,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB3_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -1237,8 +1237,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB3_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -1488,8 +1488,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -1529,8 +1529,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -1572,8 +1572,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
@@ -1618,8 +1618,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
@@ -1777,8 +1777,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
@@ -1797,8 +1797,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
@@ -1954,8 +1954,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1988,8 +1988,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -2024,8 +2024,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB6_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -2061,8 +2061,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB6_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -2281,8 +2281,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: s_mov_b32 s13, s7
; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -2317,8 +2317,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -2355,8 +2355,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_mov_b32 s13, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -2394,8 +2394,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -2634,8 +2634,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB8_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -2679,8 +2679,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB8_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
@@ -2728,8 +2728,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -2777,8 +2777,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -3034,8 +3034,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_mov_b32 s9, s3
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -3072,8 +3072,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_mov_b32 s9, s3
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -3112,8 +3112,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_mov_b32 s9, s3
; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -3152,8 +3152,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_mov_b32 s9, s3
; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -3415,8 +3415,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
+; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB10_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -3459,8 +3459,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
+; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB10_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -3505,8 +3505,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: buffer_gl1_inv
+; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB10_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -3553,8 +3553,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: buffer_gl1_inv
+; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB10_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -3722,8 +3722,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
@@ -3742,8 +3742,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index e64f8c23c061..19a1d2d9dbd1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -75,8 +75,8 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: buffer_gl0_inv
; GFX1100-NEXT: buffer_gl1_inv
+; GFX1100-NEXT: buffer_gl0_inv
; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 87e17a1c8208..863bd0d8c752 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4610,11 +4610,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; VI-NEXT: s_add_u32 s36, s36, s1
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0
; VI-NEXT: v_mov_b32_e32 v0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_movk_i32 s32, 0x400
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4637,11 +4637,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; CI-NEXT: s_add_u32 s36, s36, s1
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: v_mov_b32_e32 v0, 3
-; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0
; CI-NEXT: v_mov_b32_e32 v0, 8
-; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_movk_i32 s32, 0x400
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4664,12 +4664,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX9-NEXT: s_add_u32 s36, s36, s1
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0
; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4691,9 +4691,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8
-; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12
-; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX11-NEXT: scratch_store_b8 off, v0, off
+; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4
+; GFX11-NEXT: scratch_load_b64 v[0:1], off, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -4706,11 +4706,11 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-NEXT: s_add_u32 s0, s0, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: v_mov_b32_e32 v0, 3
-; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0
; HSA-NEXT: v_mov_b32_e32 v0, 8
-; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
; HSA-NEXT: s_movk_i32 s32, 0x400
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4741,11 +4741,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; VI-NEXT: s_add_u32 s36, s36, s3
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0
; VI-NEXT: v_mov_b32_e32 v0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
; VI-NEXT: s_movk_i32 s32, 0x800
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4756,10 +4756,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; VI-NEXT: v_mov_b32_e32 v0, 16
+; VI-NEXT: v_mov_b32_e32 v0, 8
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16
-; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20
+; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -4778,11 +4778,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; CI-NEXT: s_add_u32 s36, s36, s3
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: v_mov_b32_e32 v0, 3
-; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0
; CI-NEXT: v_mov_b32_e32 v0, 8
-; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
-; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0
; CI-NEXT: s_movk_i32 s32, 0x800
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4793,10 +4793,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; CI-NEXT: v_mov_b32_e32 v0, 16
+; CI-NEXT: v_mov_b32_e32 v0, 8
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16
-; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20
+; CI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt vmcnt(1)
@@ -4815,12 +4815,12 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX9-NEXT: s_add_u32 s36, s36, s3
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0
; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12
-; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
@@ -4831,10 +4831,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; GFX9-NEXT: v_mov_b32_e32 v0, 16
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20
+; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -4852,16 +4852,16 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8
-; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12
-; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX11-NEXT: scratch_store_b8 off, v0, off
+; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4
+; GFX11-NEXT: scratch_load_b64 v[0:1], off, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
-; GFX11-NEXT: v_mov_b32_e32 v0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 8
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_load_u8 v0, off, off offset:16
-; GFX11-NEXT: scratch_load_b32 v1, off, off offset:20
+; GFX11-NEXT: scratch_load_u8 v0, off, off offset:8
+; GFX11-NEXT: scratch_load_b32 v1, off, off offset:12
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -4881,11 +4881,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: v_mov_b32_e32 v0, 3
-; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0
; HSA-NEXT: v_mov_b32_e32 v0, 8
-; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
-; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
; HSA-NEXT: s_movk_i32 s32, 0x800
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[4:5]
@@ -4895,10 +4895,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT: s_waitcnt vmcnt(1)
; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32
-; HSA-NEXT: v_mov_b32_e32 v0, 16
+; HSA-NEXT: v_mov_b32_e32 v0, 8
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; HSA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16
-; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:20
+; HSA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
; HSA-NEXT: s_mov_b32 s7, 0x1100f000
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index c74d5efaef99..1d2523d364e5 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -509,13 +509,13 @@ define void @too_many_args_use_workitem_id_x_byval(
; Local stack object initialize. Offset 0 is the emergency spill slot.
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN-DAG: s_movk_i32 s32, 0x400
-; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; GCN: buffer_store_dword [[K]], off, s[0:3], 0
; Pass %arg31 on stack
; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; GCN: buffer_store_dword [[K1:v[0-9]+]], off, s[0:3], s32{{$}}
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 6d603ef03976..9f535a94e61f 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -529,16 +529,16 @@ define void @too_many_args_use_workitem_id_x_byval(
; FIXEDABI: v_mov_b32_e32 v31, v0
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXME: Why this reload?
-; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0{{$}}
; FIXEDABI-NOT: s32
-; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
+; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32
; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index f9b44f45be30..927e45f02948 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}store_fi_lifetime:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[FI]]
define amdgpu_kernel void @store_fi_lifetime(ptr addrspace(1) %out, i32 %in) #0 {
entry:
@@ -14,7 +14,7 @@ entry:
; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off,
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
@@ -27,16 +27,16 @@ define amdgpu_kernel void @stored_fi_to_lds(ptr addrspace(3) %ptr) #0 {
; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
-; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %ptr) #0 {
%tmp0 = alloca float, addrspace(5)
@@ -51,9 +51,9 @@ define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %pt
; Same frame index is used multiple times in the store
; GCN-LABEL: {{^}}stored_fi_to_self:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @stored_fi_to_self() #0 {
%tmp = alloca ptr addrspace(5), addrspace(5)
@@ -65,13 +65,13 @@ define amdgpu_kernel void @stored_fi_to_self() #0 {
; GCN-LABEL: {{^}}stored_fi_to_self_offset:
; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2048{{$}}
-; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2048{{$}}
define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
%tmp0 = alloca [512 x i32], addrspace(5)
%tmp1 = alloca ptr addrspace(5), addrspace(5)
@@ -86,15 +86,15 @@ define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
}
; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
-; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
define amdgpu_kernel void @stored_fi_to_fi() #0 {
%tmp0 = alloca ptr addrspace(5), addrspace(5)
%tmp1 = alloca ptr addrspace(5), addrspace(5)
@@ -104,14 +104,14 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
store volatile ptr addrspace(5) inttoptr (i32 9999 to ptr addrspace(5)), ptr addrspace(5) %tmp2
- store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 4 at offset 8
- store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 8 at offset 4
+ store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 0 at offset 4
+ store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 4 at offset 0
ret void
}
; GCN-LABEL: {{^}}stored_fi_to_global:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[FI]]
define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
%tmp = alloca float, addrspace(5)
@@ -122,14 +122,14 @@ define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @stored_fi_to_global_2_small_objects(ptr addrspace(1) %ptr) #0 {
%tmp0 = alloca float, addrspace(5)
@@ -178,7 +178,7 @@ define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(ptr addrspace(1
; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+12
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[FI]]
define amdgpu_kernel void @cannot_select_assertzext_valuetype(ptr addrspace(1) %out, i32 %idx) #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index ca09163b20af..6f42fd0aff13 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -30,7 +30,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
@@ -39,7 +39,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
@@ -48,14 +48,14 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_stack:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v0, 0
-; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v0, off dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_endpgm
entry:
@@ -160,7 +160,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[16:17]
; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -181,7 +181,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -204,7 +204,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -223,7 +223,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 16
-; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v1, off dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[6:7]
; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
@@ -270,7 +270,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
@@ -280,7 +280,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
@@ -290,7 +290,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
;
@@ -298,7 +298,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v0, 0
; GFX1100-NEXT: s_mov_b32 s33, 0
-; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v0, s33 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_endpgm
entry:
@@ -426,7 +426,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX803-NEXT: s_movk_i32 s32, 0x400
-; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[16:17]
; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -448,7 +448,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: s_movk_i32 s32, 0x400
-; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_getpc_b64 s[16:17]
; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -472,7 +472,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[16:17]
; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
@@ -492,7 +492,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 16
-; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
+; GFX1100-NEXT: scratch_store_b32 off, v1, s33 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[6:7]
; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index cd36f6a0cbd9..56159195a963 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -48,13 +48,13 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec
; GCN_DBG-NEXT: s_mov_b64 exec, -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
@@ -62,7 +62,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -87,13 +87,13 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
@@ -151,13 +151,13 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_branch .LBB1_2
; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
@@ -165,7 +165,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -190,7 +190,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
; GCN_DBG-NEXT: s_branch .LBB1_2
@@ -239,13 +239,13 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_branch .LBB2_2
; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
@@ -253,7 +253,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -278,7 +278,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
; GCN_DBG-NEXT: s_branch .LBB2_2
@@ -328,13 +328,13 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_branch .LBB3_2
; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
@@ -342,7 +342,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
@@ -365,7 +365,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
; GCN_DBG-NEXT: s_branch .LBB3_2
@@ -441,13 +441,13 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_branch .LBB4_2
; GCN_DBG-NEXT: .LBB4_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
@@ -455,7 +455,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
-; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3
@@ -481,7 +481,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1
; GCN_DBG-NEXT: s_branch .LBB4_2
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index afb73572a0c3..49f9f695409b 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -136,8 +136,8 @@ done:
; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
-; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088 glc{{$}}
; GCN: {{^}}.LBB4_2:
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
entry:
@@ -166,7 +166,8 @@ done:
ret void
}
-; This ends up not fitting due to the reserved 4 bytes at offset 0
+; This used to be a special case when the scavenge slot was
+; fixed at offset 0.
; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
@@ -174,10 +175,8 @@ done:
; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
-; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
-; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
-; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
-; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
; GCN: {{^.LBB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index dfc8361c60bb..397efb126053 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -432,22 +432,22 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -464,19 +464,19 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:4
+; FLATSCR-NEXT: scratch_store_short off, v0, s4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:6
+; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:8
+; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4
-; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:6
+; FLATSCR-NEXT: scratch_load_dword v0, off, s0
+; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; FLATSCR-NEXT: s_endpgm
@@ -490,24 +490,24 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5]
; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
-; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10_DEFAULT-NEXT: s_clause 0x1
-; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
-; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
+; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0
; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1)
; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0
; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10_DEFAULT-NEXT: s_endpgm
@@ -524,21 +524,21 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1]
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:4
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4
; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:6
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:2
; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:8
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4
; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FLATSCR_GFX10-NEXT: s_clause 0x1
-; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 offset:4
-; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:6
+; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0
+; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; FLATSCR_GFX10-NEXT: s_endpgm
@@ -550,19 +550,19 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc
+; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6
+; GFX11-NEXT: scratch_load_b32 v0, off, off
+; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
index 7404015891c8..bc8f3ebfd5d2 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
@@ -1,8 +1,8 @@
; REQUIRES: asserts
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV4 %s
; RUN: not llc --crash -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5,COV56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6,COV56 %s
; AMDGPUAttributor deletes the function "by accident" so it's never
; codegened with optimizations.
@@ -20,8 +20,7 @@
; OPT-NEXT: amdhsa.version:
; OPT-NEXT: - 1
; COV4: - 1
-; COV5: - 2
-; COV6: - 3
+; COV56: - 2
; OPT: ...
define internal i32 @func() {
ret i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 8bd60aa9af8d..6422beeec088 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -51,21 +51,21 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -73,12 +73,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -98,7 +98,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -106,12 +106,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -130,7 +130,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB0_3: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
@@ -139,7 +139,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -225,21 +225,21 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -247,12 +247,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -272,7 +272,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -280,12 +280,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -305,7 +305,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: .LBB1_3: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -315,7 +315,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4
@@ -323,7 +323,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -341,7 +341,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_branch .LBB1_3
; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
@@ -436,7 +436,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
@@ -445,7 +445,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
@@ -468,7 +468,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -476,9 +476,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0
@@ -488,7 +488,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_2
@@ -496,7 +496,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB2_2: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
@@ -506,19 +506,19 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -538,12 +538,12 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -562,7 +562,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB2_5: ; %Flow1
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6
@@ -571,7 +571,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -684,11 +684,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1
; GCN-O0-NEXT: s_mov_b32 s1, 0
@@ -707,9 +707,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3]
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
-; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
-; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
@@ -725,7 +725,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_1
@@ -733,7 +733,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB3_1: ; %Flow2
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
@@ -743,18 +743,18 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -772,15 +772,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -797,11 +797,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: s_mov_b32 s2, s0
@@ -818,15 +818,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -842,7 +842,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB3_6: ; %Flow
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6
@@ -852,7 +852,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB3_7: ; %Flow1
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4
@@ -861,7 +861,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
@@ -938,34 +938,34 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
-; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
@@ -983,7 +983,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: .LBB4_2: ; %bb.end
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index 1d2e211bae04..d94e75c8c8e2 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -699,7 +699,7 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: {{^}}commute_frameindex:
; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 6ef14d31ffa8..2b5a8d956034 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -26,7 +26,7 @@
; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
-; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill
; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], [[CMP0]]
; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]]
@@ -50,7 +50,7 @@
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
@@ -99,7 +99,7 @@ endif:
; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
-; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill
; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]]
@@ -123,7 +123,7 @@ endif:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
@@ -176,11 +176,11 @@ end:
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
-; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 ; 4-byte Folded Spill
; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
-; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, [[CMP0]]
@@ -189,11 +189,11 @@ end:
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]
; GCN: [[FLOW]]: ; %Flow
-; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] ; 4-byte Folded Reload
+; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]]
+; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
index 357fcf8ef156..038219fc3740 100644
--- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -108,6 +108,13 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX1200 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 < %s | FileCheck --check-prefixes=GFX1201 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_NOXNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_XNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.1-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_NOXNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.1-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_XNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.3-generic < %s | FileCheck --check-prefixes=GFX10_3_GENERIC %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic < %s | FileCheck --check-prefixes=GFX11_GENERIC %s
+
; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600"
; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601"
; GFX602: .amdgcn_target "amdgcn-amd-amdhsa--gfx602"
@@ -196,6 +203,13 @@
; GFX1200: .amdgcn_target "amdgcn-amd-amdhsa--gfx1200"
; GFX1201: .amdgcn_target "amdgcn-amd-amdhsa--gfx1201"
+; GFX9_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack-"
+; GFX9_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack+"
+; GFX10_1_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx10.1-generic:xnack-"
+; GFX10_1_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx10.1-generic:xnack+"
+; GFX10_3_GENERIC: .amdgcn_target "amdgcn-amd-amdhsa--gfx10.3-generic"
+; GFX11_GENERIC: .amdgcn_target "amdgcn-amd-amdhsa--gfx11-generic"
+
define amdgpu_kernel void @directive_amdgcn_target() {
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index 380439d8cd9c..9ba817694717 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -77,6 +77,11 @@
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1200 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1200 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1201 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1201 %s
+; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_GENERIC %s
+; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10.1-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_1_GENERIC %s
+; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10.3-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_3_GENERIC %s
+; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx11-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX11_GENERIC %s
+
; FIXME: With the default attributes the eflags are not accurate for
; xnack and sramecc. Subsequent Target-ID patches will address this.
@@ -149,6 +154,11 @@
; GFX1151: EF_AMDGPU_MACH_AMDGCN_GFX1151 (0x4A)
; GFX1200: EF_AMDGPU_MACH_AMDGCN_GFX1200 (0x48)
; GFX1201: EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E)
+
+; GFX9_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
+; GFX10_1_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52)
+; GFX10_3_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC (0x53)
+; GFX11_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC (0x54)
; ALL: ]
define amdgpu_kernel void @elf_header() {
diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll
index 5fda33759c8a..3802dc59d9fb 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i8_sext_private:
-; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
+; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
define amdgpu_kernel void @load_i8_sext_private(ptr addrspace(1) %out) {
entry:
%tmp0 = alloca i8, addrspace(5)
@@ -13,7 +13,7 @@ entry:
}
; FUNC-LABEL: {{^}}load_i8_zext_private:
-; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
+; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
define amdgpu_kernel void @load_i8_zext_private(ptr addrspace(1) %out) {
entry:
%tmp0 = alloca i8, addrspace(5)
@@ -24,7 +24,7 @@ entry:
}
; FUNC-LABEL: {{^}}load_i16_sext_private:
-; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
+; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
define amdgpu_kernel void @load_i16_sext_private(ptr addrspace(1) %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
@@ -35,7 +35,7 @@ entry:
}
; FUNC-LABEL: {{^}}load_i16_zext_private:
-; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}}
+; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
define amdgpu_kernel void @load_i16_zext_private(ptr addrspace(1) %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 51a4db132c59..1ba7e706d132 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -410,6 +410,8 @@ entry:
}
; GCN-LABEL: {{^}}bit4_extelt:
+; FIXME: One v_mov_b32_e32 vN, 0 should suffice
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-DAG: buffer_store_byte [[ZERO]],
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 7767aa54c815..9ae5f559e860 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -66,7 +66,7 @@ body: |
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+ ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: ceil_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -87,7 +87,7 @@ body: |
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+ ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: floor_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index bcf6bda807c8..57991d6cba94 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base
-; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 4
+; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0
; FLAT_SCR_OPT-NEXT: flat_store_dword v[0:1], v2
@@ -22,7 +22,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls:
; FLAT_SCR_ARCH: ; %bb.0:
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[0:1], src_private_base
-; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 4
+; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 0
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, s1
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0
; FLAT_SCR_ARCH-NEXT: flat_store_dword v[0:1], v2
@@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0
; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0
-; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 offset:4
+; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0
; FLAT_SCR_OPT-NEXT: s_waitcnt_vscnt null, 0x0
; FLAT_SCR_OPT-NEXT: s_endpgm
;
@@ -51,7 +51,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; FLAT_SCR_ARCH: ; %bb.0:
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 0
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0
-; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 offset:4
+; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0
; FLAT_SCR_ARCH-NEXT: s_waitcnt_vscnt null, 0x0
; FLAT_SCR_ARCH-NEXT: s_endpgm
%alloca = alloca i32, addrspace(5)
@@ -120,7 +120,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0
; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1
; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1
-; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4
+; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 0
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105
@@ -221,7 +221,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_OPT-NEXT: ;;#ASMSTART
; FLAT_SCR_OPT-NEXT: ;;#ASMEND
; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1
-; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4
+; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105
@@ -243,7 +243,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0
; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1
; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
-; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4
+; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 0
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
@@ -344,7 +344,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART
; FLAT_SCR_ARCH-NEXT: ;;#ASMEND
; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
-; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4
+; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 8284a772c380..0af57c6a97db 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX940-SDAG-LABEL: soff1_voff1:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
@@ -36,7 +36,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX940-GISEL-LABEL: soff1_voff1:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
@@ -58,7 +58,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
@@ -76,7 +76,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
@@ -95,7 +95,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -110,7 +110,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -138,7 +138,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX940-SDAG-LABEL: soff1_voff2:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
@@ -159,7 +159,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX940-GISEL-LABEL: soff1_voff2:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -184,7 +184,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -203,7 +203,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -221,7 +221,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -237,7 +237,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -265,7 +265,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX940-SDAG-LABEL: soff1_voff4:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
@@ -286,7 +286,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX940-GISEL-LABEL: soff1_voff4:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -311,7 +311,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -330,7 +330,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -348,7 +348,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -364,7 +364,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -392,7 +392,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX940-SDAG-LABEL: soff2_voff1:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1
@@ -414,7 +414,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX940-GISEL-LABEL: soff2_voff1:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1
@@ -439,7 +439,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -458,7 +458,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -478,7 +478,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -495,7 +495,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -523,7 +523,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX940-SDAG-LABEL: soff2_voff2:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1
@@ -544,7 +544,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX940-GISEL-LABEL: soff2_voff2:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -571,7 +571,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0
@@ -591,7 +591,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
@@ -612,7 +612,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -629,7 +629,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -657,7 +657,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX940-SDAG-LABEL: soff2_voff4:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1
@@ -678,7 +678,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX940-GISEL-LABEL: soff2_voff4:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,7 +705,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0
@@ -725,7 +725,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
@@ -746,7 +746,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -763,7 +763,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -791,7 +791,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX940-SDAG-LABEL: soff4_voff1:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2
@@ -813,7 +813,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX940-GISEL-LABEL: soff4_voff1:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2
@@ -838,7 +838,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -857,7 +857,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
@@ -877,7 +877,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -894,7 +894,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -922,7 +922,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX940-SDAG-LABEL: soff4_voff2:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2
@@ -943,7 +943,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX940-GISEL-LABEL: soff4_voff2:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -970,7 +970,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0
@@ -990,7 +990,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
@@ -1011,7 +1011,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -1028,7 +1028,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -1056,7 +1056,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX940-SDAG-LABEL: soff4_voff4:
; GFX940-SDAG: ; %bb.0: ; %bb
; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -1076,7 +1076,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX940-GISEL-LABEL: soff4_voff4:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1103,7 +1103,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
@@ -1143,7 +1143,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
@@ -1160,7 +1160,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 84e3879786bd..687d84565692 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -23,10 +23,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
@@ -43,10 +43,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: zero_init_kernel:
@@ -59,10 +59,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:52
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: zero_init_kernel:
@@ -75,10 +75,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:52
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:36
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:20
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:4
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:48
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
@@ -98,10 +98,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
@@ -112,10 +112,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX940-NEXT: s_mov_b32 s3, s0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 sc0 sc1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 sc0 sc1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 sc0 sc1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
@@ -137,10 +137,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
@@ -162,10 +162,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: zero_init_kernel:
@@ -178,10 +178,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-PAL-NEXT: s_clause 0x3
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: zero_init_kernel:
@@ -194,10 +194,10 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off
; GFX12-PAL-NEXT: s_endpgm
%alloca = alloca [32 x i16], align 2, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
@@ -381,11 +381,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
-; GFX9-NEXT: s_add_i32 s1, s1, 4
+; GFX9-NEXT: s_add_i32 s1, s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s0, 4
+; GFX9-NEXT: s_add_i32 s0, s0, 0
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -402,8 +402,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-NEXT: s_add_i32 s0, s0, 4
-; GFX10-NEXT: s_add_i32 s1, s1, 4
+; GFX10-NEXT: s_add_i32 s0, s0, 0
+; GFX10-NEXT: s_add_i32 s1, s1, 0
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -418,8 +418,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_add_i32 s0, s0, 4
-; GFX11-NEXT: s_add_i32 s1, s1, 4
+; GFX11-NEXT: s_add_i32 s0, s0, 0
+; GFX11-NEXT: s_add_i32 s1, s1, 0
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -434,8 +434,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_add_co_i32 s0, s0, 4
-; GFX12-NEXT: s_add_co_i32 s1, s1, 4
+; GFX12-NEXT: s_add_co_i32 s0, s0, 0
+; GFX12-NEXT: s_add_co_i32 s1, s1, 0
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -455,11 +455,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
-; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4
+; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
@@ -471,11 +471,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_add_i32 s1, s1, 4
+; GFX940-NEXT: s_add_i32 s1, s1, 0
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_add_i32 s0, s0, 4
+; GFX940-NEXT: s_add_i32 s0, s0, 0
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -497,8 +497,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4
-; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4
+; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0
+; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -513,8 +513,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4
-; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4
+; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0
+; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -529,8 +529,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4
-; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4
+; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0
+; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -552,13 +552,13 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
-; GFX9-NEXT: s_add_i32 s0, s0, 4
+; GFX9-NEXT: s_add_i32 s0, s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_add_i32 s0, s0, 4
+; GFX9-NEXT: s_add_i32 s0, s0, 0
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -573,8 +573,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-NEXT: s_add_i32 s1, s1, 4
-; GFX10-NEXT: s_add_i32 s0, s0, 4
+; GFX10-NEXT: s_add_i32 s1, s1, 0
+; GFX10-NEXT: s_add_i32 s0, s0, 0
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
@@ -587,8 +587,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_add_i32 s0, s0, 4
-; GFX11-NEXT: s_add_i32 s1, s1, 4
+; GFX11-NEXT: s_add_i32 s0, s0, 0
+; GFX11-NEXT: s_add_i32 s1, s1, 0
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -601,8 +601,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_add_co_i32 s0, s0, 4
-; GFX12-NEXT: s_add_co_i32 s1, s1, 4
+; GFX12-NEXT: s_add_co_i32 s0, s0, 0
+; GFX12-NEXT: s_add_co_i32 s1, s1, 0
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -621,11 +621,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
-; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4
+; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
@@ -634,12 +634,12 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_add_i32 s1, s1, 4
+; GFX940-NEXT: s_add_i32 s1, s1, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_add_i32 s0, s0, 4
+; GFX940-NEXT: s_add_i32 s0, s0, 0
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -659,8 +659,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4
-; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4
+; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0
+; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -673,8 +673,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4
-; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4
+; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0
+; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -687,8 +687,8 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 4
-; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 4
+; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0
+; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -710,11 +710,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT: v_add_u32_e32 v1, 4, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -727,8 +727,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -739,8 +739,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0
-; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0
+; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -750,8 +750,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, 4, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -763,9 +763,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0
+; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0
+; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
@@ -780,9 +780,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 15
-; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0
+; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -800,8 +800,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0
-; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0, v0
+; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -812,8 +812,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0
-; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
+; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0
+; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -823,8 +823,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -1064,7 +1064,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
+; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_mov_b32 s2, s0
@@ -1073,10 +1073,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
@@ -1085,7 +1085,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s1, s0
@@ -1095,15 +1095,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: zero_init_small_offset_kernel:
; GFX11: ; %bb.0:
-; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1113,15 +1113,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:260
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:256
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: zero_init_small_offset_kernel:
; GFX12: ; %bb.0:
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1131,10 +1131,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:260
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:276
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:292
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:308
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:256
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:288
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:304
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1147,7 +1147,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_mov_b32 s1, s0
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
@@ -1156,15 +1156,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
; GFX940: ; %bb.0:
-; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_mov_b32 s0, 0
; GFX940-NEXT: s_mov_b32 s1, s0
@@ -1172,10 +1172,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX940-NEXT: s_mov_b32 s3, s0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 sc0 sc1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 sc0 sc1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 sc0 sc1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 sc1
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1190,7 +1190,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
@@ -1199,10 +1199,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
-; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:256
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1216,7 +1216,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
@@ -1226,15 +1226,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
-; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
; GFX11-PAL: ; %bb.0:
-; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1244,15 +1244,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-PAL-NEXT: s_clause 0x3
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292
-; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288
+; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304
; GFX11-PAL-NEXT: s_endpgm
;
; GFX12-PAL-LABEL: zero_init_small_offset_kernel:
; GFX12-PAL: ; %bb.0:
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1262,10 +1262,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:256
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304
; GFX12-PAL-NEXT: s_endpgm
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -1470,16 +1470,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-NEXT: scratch_load_dword v0, off, s1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_addk_i32 s1, 0x104
+; GFX9-NEXT: s_addk_i32 s1, 0x100
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_addk_i32 s0, 0x104
+; GFX9-NEXT: s_addk_i32 s0, 0x100
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -1491,15 +1491,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-NEXT: s_addk_i32 s0, 0x104
-; GFX10-NEXT: s_addk_i32 s1, 0x104
+; GFX10-NEXT: s_addk_i32 s0, 0x100
+; GFX10-NEXT: s_addk_i32 s1, 0x100
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -1509,15 +1509,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_addk_i32 s0, 0x104
-; GFX11-NEXT: s_addk_i32 s1, 0x104
+; GFX11-NEXT: s_addk_i32 s0, 0x100
+; GFX11-NEXT: s_addk_i32 s1, 0x100
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -1527,15 +1527,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x104
-; GFX12-NEXT: s_addk_co_i32 s1, 0x104
+; GFX12-NEXT: s_addk_co_i32 s0, 0x100
+; GFX12-NEXT: s_addk_co_i32 s1, 0x100
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1553,16 +1553,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT: s_mov_b32 s1, 0
-; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
+; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
@@ -1570,17 +1570,17 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_addk_i32 s1, 0x104
+; GFX940-NEXT: s_addk_i32 s1, 0x100
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_addk_i32 s0, 0x104
+; GFX940-NEXT: s_addk_i32 s0, 0x100
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -1598,15 +1598,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104
-; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100
+; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -1625,15 +1625,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104
-; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100
+; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -1643,15 +1643,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104
-; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100
+; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -1661,15 +1661,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1694,16 +1694,16 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
+; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
-; GFX9-NEXT: s_addk_i32 s0, 0x104
+; GFX9-NEXT: s_addk_i32 s0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_addk_i32 s0, 0x104
+; GFX9-NEXT: s_addk_i32 s0, 0x100
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -1714,14 +1714,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-NEXT: s_addk_i32 s1, 0x104
-; GFX10-NEXT: s_addk_i32 s0, 0x104
+; GFX10-NEXT: s_addk_i32 s1, 0x100
+; GFX10-NEXT: s_addk_i32 s0, 0x100
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
@@ -1730,14 +1730,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
;
; GFX11-LABEL: store_load_sindex_small_offset_foo:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 15
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_addk_i32 s0, 0x104
-; GFX11-NEXT: s_addk_i32 s1, 0x104
+; GFX11-NEXT: s_addk_i32 s0, 0x100
+; GFX11-NEXT: s_addk_i32 s1, 0x100
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -1746,14 +1746,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
;
; GFX12-LABEL: store_load_sindex_small_offset_foo:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x104
-; GFX12-NEXT: s_addk_co_i32 s1, 0x104
+; GFX12-NEXT: s_addk_co_i32 s0, 0x100
+; GFX12-NEXT: s_addk_co_i32 s1, 0x100
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1770,32 +1770,32 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: s_mov_b32 s1, 0
-; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
-; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX9-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
+; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
; GFX940: ; %bb.0: ; %bb
-; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_addk_i32 s1, 0x104
+; GFX940-NEXT: s_addk_i32 s1, 0x100
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_addk_i32 s0, 0x104
+; GFX940-NEXT: s_addk_i32 s0, 0x100
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -1812,14 +1812,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104
-; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100
+; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -1837,14 +1837,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104
-; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100
+; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -1853,14 +1853,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX11-PAL: ; %bb.0: ; %bb
-; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104
-; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104
+; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100
+; GFX11-PAL-NEXT: s_addk_i32 s1, 0x100
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -1869,14 +1869,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
;
; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX12-PAL: ; %bb.0: ; %bb
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x104
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x104
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x100
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1901,14 +1901,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
+; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0x100, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -1921,10 +1921,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -1934,10 +1934,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX11-LABEL: store_load_vindex_small_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1946,10 +1946,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX12-LABEL: store_load_vindex_small_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1967,25 +1967,25 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
+; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0
+; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0
+; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x100, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_small_offset_kernel:
; GFX940: ; %bb.0: ; %bb
-; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 15
-; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0
+; GFX940-NEXT: v_sub_u32_e32 v0, 0x100, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -2004,10 +2004,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc
+; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -2027,10 +2027,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
-; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0
+; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0
; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
@@ -2040,10 +2040,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc
+; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -2052,10 +2052,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -2305,7 +2305,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
;
; GFX12-LABEL: zero_init_large_offset_kernel:
; GFX12: ; %bb.0:
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16388
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16404
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16420
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16384
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16400
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16416
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:16432
; GFX12-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2441,7 +2441,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
;
; GFX12-PAL-LABEL: zero_init_large_offset_kernel:
; GFX12-PAL: ; %bb.0:
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2451,10 +2451,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16388
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16404
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16420
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16436
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16384
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16400
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16416
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16432
; GFX12-PAL-NEXT: s_endpgm
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -2768,15 +2768,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -2902,15 +2902,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -2987,14 +2987,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
;
; GFX12-LABEL: store_load_sindex_large_offset_foo:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -3110,14 +3110,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
;
; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX12-PAL: ; %bb.0: ; %bb
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4004
-; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4004
+; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-PAL-NEXT: s_addk_co_i32 s1, 0x4000
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -3188,10 +3188,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX12-LABEL: store_load_vindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3296,10 +3296,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
@@ -3537,11 +3537,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX12-LABEL: store_load_large_imm_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -3642,11 +3642,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_endpgm
bb:
@@ -3812,7 +3812,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
@@ -3834,7 +3834,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 0
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -3847,7 +3847,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -3860,9 +3860,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
;
@@ -3871,7 +3871,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT: s_mov_b32 s4, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
@@ -3889,7 +3889,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1
@@ -3915,7 +3915,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
@@ -3928,7 +3928,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 glc dlc
@@ -3941,9 +3941,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_endpgm
bb:
@@ -4732,11 +4732,11 @@ define amdgpu_ps void @large_offset() {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
@@ -4850,11 +4850,11 @@ define amdgpu_ps void @large_offset() {
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
+; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800
; GFX12-PAL-NEXT: ;;#ASMSTART
; GFX12-PAL-NEXT: ; use v0
; GFX12-PAL-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir b/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
index d7a1b2d29b26..17ec6f5b3724 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination-tied-operand.mir
@@ -22,7 +22,7 @@ body: |
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: renamable $vgpr0 = V_MOV_B32_e32 123, implicit $exec
- ; GFX11-NEXT: renamable $vgpr0 = SCRATCH_LOAD_SHORT_D16_HI_ST 4, 0, killed renamable $vgpr0, implicit $exec, implicit $flat_scr
+ ; GFX11-NEXT: renamable $vgpr0 = SCRATCH_LOAD_SHORT_D16_HI_ST 0, 0, killed renamable $vgpr0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: renamable $sgpr0 = S_LSHL_B32 killed renamable $sgpr0, 1, implicit-def dead $scc
; GFX11-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 028f32844576..eeddc2211ea9 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -310,7 +310,7 @@ ret:
; GFX11-LABEL: tied_operand_test:
; GFX11: ; %bb.0: ; %entry
-; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
+; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
index dc6fea4ba1a3..1a9334706cb9 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx9-generic --amdhsa-code-object-version=6 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@gds0 = internal addrspace(2) global [4 x i32] undef, align 4
@lds0 = internal addrspace(3) global [4 x i32] undef, align 128
diff --git a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
index 3e4e6938d72e..8d44330b1b97 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; FUNC-LABEL: {{^}}atomic_add_ret_gds:
; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
diff --git a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
new file mode 100644
index 000000000000..e3f4b14bac0c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
@@ -0,0 +1,18 @@
+; RUN: not llc -march=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX9-V5 %s
+; RUN: not llc -march=amdgcn -mcpu=gfx10.1-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX101-V5 %s
+; RUN: not llc -march=amdgcn -mcpu=gfx10.3-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX103-V5 %s
+; RUN: not llc -march=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX11-V5 %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -o - %s
+; RUN: llc -march=amdgcn -mcpu=gfx10.1-generic --amdhsa-code-object-version=6 -o - %s
+; RUN: llc -march=amdgcn -mcpu=gfx10.3-generic --amdhsa-code-object-version=6 -o - %s
+; RUN: llc -march=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - %s
+
+; GFX9-V5: gfx9-generic is only available on code object version 6 or better
+; GFX101-V5: gfx10.1-generic is only available on code object version 6 or better
+; GFX103-V5: gfx10.3-generic is only available on code object version 6 or better
+; GFX11-V5: gfx11-generic is only available on code object version 6 or better
+
+define void @foo() {
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 135285cca7f4..490167ee3635 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -156,8 +156,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
@@ -196,8 +196,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
@@ -357,8 +357,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
@@ -390,8 +390,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB1_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
@@ -506,8 +506,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
@@ -531,8 +531,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB2_2:
; GFX11-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
@@ -638,8 +638,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
@@ -663,8 +663,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB3_2:
; GFX11-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
@@ -806,8 +806,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
@@ -838,8 +838,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB4_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
@@ -1006,8 +1006,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
@@ -1046,8 +1046,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
@@ -1325,8 +1325,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
@@ -1357,8 +1357,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
; GFX11-NEXT: v_add_f32_e32 v0, v1, v2
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2
@@ -1631,8 +1631,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
@@ -1669,8 +1669,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
@@ -1819,8 +1819,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
@@ -1857,8 +1857,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
index a8475385f6fd..6b4a6381d954 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
@@ -48,8 +48,8 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -75,8 +75,8 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -131,8 +131,8 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -158,8 +158,8 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -211,8 +211,8 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -235,8 +235,8 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -286,8 +286,8 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -310,8 +310,8 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -372,8 +372,8 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -403,8 +403,8 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -467,8 +467,8 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -498,8 +498,8 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -558,8 +558,8 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -585,8 +585,8 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -642,8 +642,8 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -669,8 +669,8 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -730,8 +730,8 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -757,8 +757,8 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -813,8 +813,8 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -840,8 +840,8 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -893,8 +893,8 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -917,8 +917,8 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -968,8 +968,8 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -992,8 +992,8 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1054,8 +1054,8 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -1085,8 +1085,8 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1149,8 +1149,8 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -1180,8 +1180,8 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1240,8 +1240,8 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -1267,8 +1267,8 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -1324,8 +1324,8 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -1351,8 +1351,8 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -1412,8 +1412,8 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -1439,8 +1439,8 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1495,8 +1495,8 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -1522,8 +1522,8 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1575,8 +1575,8 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1599,8 +1599,8 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1650,8 +1650,8 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1674,8 +1674,8 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1736,8 +1736,8 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -1767,8 +1767,8 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1831,8 +1831,8 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -1862,8 +1862,8 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1922,8 +1922,8 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -1949,8 +1949,8 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -2006,8 +2006,8 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -2033,8 +2033,8 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -2094,8 +2094,8 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -2121,8 +2121,8 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2177,8 +2177,8 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -2204,8 +2204,8 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2257,8 +2257,8 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2281,8 +2281,8 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2332,8 +2332,8 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2356,8 +2356,8 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2418,8 +2418,8 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -2449,8 +2449,8 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2513,8 +2513,8 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
@@ -2544,8 +2544,8 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2604,8 +2604,8 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -2631,8 +2631,8 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
@@ -2688,8 +2688,8 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_mov_b32_e32 v5, v3
@@ -2715,8 +2715,8 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 94d21bef94e7..a3c8bb141fd9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -17,16 +17,16 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -47,16 +47,16 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(ptr addrspace(1)
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:2047
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -78,16 +78,16 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(ptr addrspace(
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:-2048
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -108,16 +108,16 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -142,16 +142,16 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %s
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_i32_rtn_2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -173,16 +173,16 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_i32_rtn_neg2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -222,8 +222,8 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset,
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
@@ -235,8 +235,8 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset,
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -270,8 +270,8 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
@@ -283,8 +283,8 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -319,8 +319,8 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
@@ -332,8 +332,8 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -366,8 +366,8 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3
; GFX10-NEXT: v_readfirstlane_b32 s1, v3
; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
@@ -379,8 +379,8 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:42
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -410,16 +410,16 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -440,16 +440,16 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xchg_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -471,16 +471,16 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -500,16 +500,16 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xchg_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -534,16 +534,16 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_add_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -564,16 +564,16 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_add_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -595,16 +595,16 @@ define amdgpu_ps void @global_add_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_add_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -624,16 +624,16 @@ define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_add_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -654,16 +654,16 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_add_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -684,16 +684,16 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_add_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -715,16 +715,16 @@ define amdgpu_ps void @global_add_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_add_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -744,16 +744,16 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_add_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -778,16 +778,16 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sub_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -808,16 +808,16 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sub_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -839,16 +839,16 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_sub_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -868,16 +868,16 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_sub_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -898,16 +898,16 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sub_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -928,16 +928,16 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sub_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -959,16 +959,16 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_sub_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -988,16 +988,16 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_sub_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1022,16 +1022,16 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_and_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1052,16 +1052,16 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_and_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1083,16 +1083,16 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_and_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1112,16 +1112,16 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_and_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1142,16 +1142,16 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_and_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1172,16 +1172,16 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_and_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1203,16 +1203,16 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_and_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1232,16 +1232,16 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_and_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1266,16 +1266,16 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_or_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1296,16 +1296,16 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_or_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1327,16 +1327,16 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_or_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1356,16 +1356,16 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_or_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1386,16 +1386,16 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_or_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1416,16 +1416,16 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_or_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1447,16 +1447,16 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_or_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1476,16 +1476,16 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_or_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1510,16 +1510,16 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xor_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1540,16 +1540,16 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xor_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1571,16 +1571,16 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xor_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1600,16 +1600,16 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xor_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1630,16 +1630,16 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xor_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1660,16 +1660,16 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_xor_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1691,16 +1691,16 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xor_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1720,16 +1720,16 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_xor_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2620,8 +2620,8 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sba
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_cmpxchg_saddr_i32_rtn:
@@ -2629,8 +2629,8 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sba
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2654,8 +2654,8 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inr
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
@@ -2663,8 +2663,8 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inr
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2689,8 +2689,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(ptr addrspace(1) inreg %sb
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_cmpxchg_saddr_i32_nortn:
@@ -2698,8 +2698,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(ptr addrspace(1) inreg %sb
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2721,8 +2721,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(ptr addrspace(1) in
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
@@ -2730,8 +2730,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(ptr addrspace(1) in
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2756,8 +2756,8 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inre
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_cmpxchg_saddr_i64_rtn:
@@ -2766,8 +2766,8 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inre
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2793,8 +2793,8 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace(
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
@@ -2803,8 +2803,8 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace(
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2831,8 +2831,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(ptr addrspace(1) inreg %sb
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_cmpxchg_saddr_i64_nortn:
@@ -2841,8 +2841,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(ptr addrspace(1) inreg %sb
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2866,8 +2866,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) in
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
@@ -2876,8 +2876,8 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) in
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 6d99485b91fe..d9cbbc11f9a7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -3575,16 +3575,16 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: atomic_global_load_saddr_i32:
@@ -3612,16 +3612,16 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1)
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128:
@@ -3650,16 +3650,16 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: atomic_global_load_saddr_i64:
@@ -3687,16 +3687,16 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6eec8d5356ca..9c7ce392f09e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1227,9 +1227,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1271,9 +1271,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
@@ -1431,9 +1431,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1475,9 +1475,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2457,9 +2457,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2501,9 +2501,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
@@ -2661,9 +2661,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2705,9 +2705,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4355,9 +4355,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4399,9 +4399,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-NEXT: ; %bb.1:
@@ -4559,9 +4559,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4603,9 +4603,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index c927a0e1ef06..11d35c5a5958 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1331,9 +1331,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1375,9 +1375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
@@ -1535,9 +1535,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1579,9 +1579,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2561,9 +2561,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2605,9 +2605,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
@@ -2765,9 +2765,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2809,9 +2809,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4563,9 +4563,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4607,9 +4607,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-NEXT: ; %bb.1:
@@ -4767,9 +4767,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -4811,9 +4811,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
new file mode 100644
index 000000000000..4fee563d1cc9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
@@ -0,0 +1,31 @@
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.1-generic -mattr=+cumode < %s | FileCheck -check-prefix=NOCU %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.1-generic < %s | FileCheck -check-prefix=CU %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.3-generic -mattr=+cumode < %s | FileCheck -check-prefix=NOCU %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.3-generic < %s | FileCheck -check-prefix=CU %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+cumode < %s | FileCheck -check-prefix=NOCU %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic < %s | FileCheck -check-prefix=CU %s
+
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.1-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.1-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.3-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10.3-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+
+; Checks 10.1, 10.3 and 11 generic targets allow cumode/wave64.
+
+; NOCU: .amdhsa_workgroup_processor_mode 0
+; NOCU: .workgroup_processor_mode: 0
+; CU: .amdhsa_workgroup_processor_mode 1
+; CU: .workgroup_processor_mode: 1
+
+; W64: .amdhsa_wavefront_size32 0
+; W32: .amdhsa_wavefront_size32 1
+
+define amdgpu_kernel void @wavefrontsize() {
+entry:
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 600}
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 5882043caa0b..b9269e2b2428 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -7,12 +7,12 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
%alloca = alloca i32, align 4, addrspace(5)
- store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 15, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
%masked = and i32 %toint, 65535
store volatile i32 %masked, ptr addrspace(1) undef
@@ -20,7 +20,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: v_and_b32
; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
@@ -28,7 +28,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
%alloca = alloca i32, align 4, addrspace(5)
- store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 15, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
%masked = and i32 %toint, 131071
store volatile i32 %masked, ptr addrspace(1) undef
@@ -36,7 +36,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: v_and_b32
; SCRATCH256K-NOT: v_and_b32
; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
@@ -44,7 +44,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
%alloca = alloca i32, align 4, addrspace(5)
- store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 15, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
%masked = and i32 %toint, 262143
store volatile i32 %masked, ptr addrspace(1) undef
@@ -52,7 +52,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; SCRATCH128K-NOT: v_and_b32
; SCRATCH256K-NOT: v_and_b32
; SCRATCH1024K-NOT: v_and_b32
@@ -60,7 +60,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
%alloca = alloca i32, align 4, addrspace(5)
- store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 15, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
%masked = and i32 %toint, 1048575
store volatile i32 %masked, ptr addrspace(1) undef
@@ -68,12 +68,12 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN-NOT: v_and_b32
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
%alloca = alloca i32, align 4, addrspace(5)
- store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 15, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
%masked = and i32 %toint, 2097151
store volatile i32 %masked, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index 833aba9b26af..9e336a714ca6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -23,8 +23,8 @@ define fastcc i32 @foo() {
; CHECK-NEXT: $sgpr17 = S_ADDC_U32 internal $sgpr17, target-flags(amdgpu-gotprel32-hi) @bar + 12, implicit-def $scc, implicit internal $scc
; CHECK-NEXT: }
; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
- ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: BUFFER_GL1_INV implicit $exec
+ ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr40
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr40
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 823c444f8302..f736ca7cd625 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -969,7 +969,7 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
; GCN-NEXT: s_add_u32 s4, s4, s3
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_addc_u32 s5, s5, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 4
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s3, 3
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -980,16 +980,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_and_b32_e32 v3, 3, v3
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:7
-; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:6
-; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:5
+; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0
+; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3
+; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2
+; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen
-; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:5
-; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6
-; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:7
+; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1
+; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index 5873e9c85763..6f611791af66 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -4,7 +4,7 @@
; alignment of the stack
; CHECK-LABEL: {{^}}no_args:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
define amdgpu_kernel void @no_args() {
%alloca = alloca i8, addrspace(5)
store volatile i8 0, ptr addrspace(5) %alloca
@@ -12,7 +12,7 @@ define amdgpu_kernel void @no_args() {
}
; CHECK-LABEL: {{^}}force_align32:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
define amdgpu_kernel void @force_align32(<8 x i32>) {
%alloca = alloca i8, addrspace(5)
store volatile i8 0, ptr addrspace(5) %alloca
@@ -20,7 +20,7 @@ define amdgpu_kernel void @force_align32(<8 x i32>) {
}
; CHECK-LABEL: {{^}}force_align64:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
define amdgpu_kernel void @force_align64(<16 x i32>) {
%alloca = alloca i8, addrspace(5)
store volatile i8 0, ptr addrspace(5) %alloca
@@ -28,7 +28,7 @@ define amdgpu_kernel void @force_align64(<16 x i32>) {
}
; CHECK-LABEL: {{^}}force_align128:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
define amdgpu_kernel void @force_align128(<32 x i32>) {
%alloca = alloca i8, addrspace(5)
store volatile i8 0, ptr addrspace(5) %alloca
@@ -36,7 +36,7 @@ define amdgpu_kernel void @force_align128(<32 x i32>) {
}
; CHECK-LABEL: {{^}}force_align256:
-; CHECK: ScratchSize: 5{{$}}
+; CHECK: ScratchSize: 8{{$}}
define amdgpu_kernel void @force_align256(<64 x i32>) {
%alloca = alloca i8, addrspace(5)
store volatile i8 0, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 13a8033bcc57..a209dcfe3a7a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -444,7 +444,7 @@ main_body:
; for stack access.
; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
-; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
%alloca = alloca i32, addrspace(5)
@@ -455,7 +455,7 @@ define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
}
; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
-; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]]
define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
index 91284d383867..cf324d62e1de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll
@@ -1,8 +1,11 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx10.1-generic --amdhsa-code-object-version=6 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 7dc139ef9697..10e1ae3ecfcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1,8 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx10.1-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
index e9d9b669408a..8598b78deccf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -3,7 +3,7 @@
; FIXME: Requires stack object to not assert
; GCN-LABEL: {{^}}test_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GCN: buffer_store_dword v0, off, s[4:7], 0{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: ; return
@@ -17,7 +17,7 @@ define amdgpu_ps i32 @test_ps() #1 {
; GCN-LABEL: {{^}}test_cs:
; GCN: s_mov_b64 s[4:5], s[0:1]
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0{{$}}
; GCN: s_load_dword s0, s[0:1], 0x0
define amdgpu_cs i32 @test_cs() #1 {
%alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index e789db19a673..0284f44f5f14 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -58,7 +58,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %o
; Make sure this doesn't crash.
; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
%alloca = alloca i32, addrspace(5)
%int = ptrtoint ptr addrspace(5) %alloca to i32
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index b288535f7087..21e27bfa7553 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3478,19 +3478,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8
@@ -3562,17 +3562,17 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -3801,20 +3801,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
@@ -3885,16 +3885,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -4289,11 +4289,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16
@@ -4370,10 +4370,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
@@ -4602,20 +4602,20 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16
@@ -4686,16 +4686,16 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -7270,11 +7270,11 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v15
@@ -7334,16 +7334,16 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v39
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39
@@ -7363,10 +7363,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index af96165b7d44..0f9cc33d731f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3031,11 +3031,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
-; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
@@ -3090,10 +3090,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
-; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
@@ -3611,11 +3611,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
@@ -3654,11 +3654,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 8018e0cbf1cc..0f4e790a6976 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -189,8 +189,8 @@ define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1)
define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -225,8 +225,8 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad
define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -261,8 +261,8 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad
define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
@@ -286,8 +286,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT: post-loop-memcpy-expansion1:
-; OPT-NEXT: [[TMP16:%.*]] = udiv i64 [[M:%.*]], 16
-; OPT-NEXT: [[TMP17:%.*]] = urem i64 [[M]], 16
+; OPT-NEXT: [[TMP16:%.*]] = lshr i64 [[M:%.*]], 4
+; OPT-NEXT: [[TMP17:%.*]] = and i64 [[M]], 15
; OPT-NEXT: [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]]
; OPT-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], 0
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -326,8 +326,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -363,8 +363,8 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(
; One of the uses in the function should be expanded, the other left alone.
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
-; MAX1024-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; MAX1024-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; MAX1024-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; MAX1024-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -395,8 +395,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
-; ALL-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; ALL-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; ALL-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; ALL-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -909,8 +909,8 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -945,8 +945,8 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 2
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 2
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 1
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -981,8 +981,8 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1017,8 +1017,8 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1053,8 +1053,8 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 2
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 2
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 1
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1089,8 +1089,8 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1125,8 +1125,8 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1161,8 +1161,8 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1488,9 +1488,9 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope !0
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !0
+; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1511,9 +1511,9 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !3
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias !3
+; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1561,9 +1561,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !6
+; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias !6
+; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1576,17 +1576,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; MAX1024-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !0
+; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope [[META0:![0-9]+]]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !0
+; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias [[META0]]
; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1594,9 +1594,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
-; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !0
+; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]]
; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !0
+; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]]
; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1607,17 +1607,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; ALL-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !9
+; ALL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope [[META9:![0-9]+]]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !9
+; ALL-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias [[META9]]
; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1625,9 +1625,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
-; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !9
+; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META9]]
; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !9
+; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META9]]
; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1651,9 +1651,9 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope !12
+; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !12
+; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1666,17 +1666,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; MAX1024-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; MAX1024-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope !3
+; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope [[META3:![0-9]+]]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias !3
+; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias [[META3]]
; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1684,9 +1684,9 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
-; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope !3
+; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META3]]
; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
-; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias !3
+; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META3]]
; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1697,17 +1697,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; ALL-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; ALL-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope !15
+; ALL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope [[META15:![0-9]+]]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias !15
+; ALL-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias [[META15]]
; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1715,9 +1715,9 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
-; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope !15
+; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META15]]
; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
-; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias !15
+; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META15]]
; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1731,5 +1731,49 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
ret void
}
+define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
+; OPT-LABEL: @test_umin(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[ARRAYIDX:%.*]] = getelementptr [32 x [8 x i64]], ptr [[Y:%.*]], i64 0, i64 [[IDXPROM:%.*]]
+; OPT-NEXT: [[SPEC_SELECT:%.*]] = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56)
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SPEC_SELECT]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SPEC_SELECT]], 15
+; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SPEC_SELECT]], [[TMP2]]
+; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; OPT: loop-memcpy-expansion:
+; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr [[X:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
+; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ARRAYIDX]], i64 [[LOOP_INDEX]]
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 1
+; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; OPT: loop-memcpy-residual:
+; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP10]]
+; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1
+; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX]], i64 [[TMP10]]
+; OPT-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
+; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; OPT: post-loop-memcpy-expansion:
+; OPT-NEXT: ret void
+; OPT: loop-memcpy-residual-header:
+; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
+; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+entry:
+ %arrayidx = getelementptr [32 x [8 x i64]], ptr %y, i64 0, i64 %idxprom
+ %spec.select = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56)
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx, ptr %x, i64 %spec.select, i1 false)
+ ret void
+}
+
+declare i64 @llvm.umin.i64(i64, i64)
+
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index e8b9526774d8..b520dd1060ec 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -2,12 +2,14 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic -verify-machineinstrs --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX9GEN,SDAG-GFX9GEN %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GEN,GISEL-GFX9GEN %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s
@@ -30,6 +32,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,6 +90,15 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -134,6 +154,15 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -193,6 +222,19 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v3
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v6, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-GFX9GEN-NEXT: v_mac_f32_e32 v1, v3, v5
+; SDAG-GFX9GEN-NEXT: v_mac_f32_e32 v0, v4, v6
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_v2f32:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -242,6 +284,19 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v5
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v1, v4, v6
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -300,6 +355,19 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
; GFX906-NEXT: v_mov_b32_e32 v0, v3
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_v2f32_shuffle:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_mad_f32 v0, v3, v0, v2
+; GFX9GEN-NEXT: v_mac_f32_e32 v2, v4, v1
+; GFX9GEN-NEXT: v_mov_b32_e32 v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_v2f32_shuffle:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,6 +446,15 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s
; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, -v0, v1, v2
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -393,6 +470,15 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s
; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e64 v3, -v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -437,6 +523,15 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GEN-NEXT: v_mad_f32 v0, |v0|, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -487,6 +582,15 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half
; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GEN-NEXT: v_mad_f32 v0, -|v0|, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,6 +642,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2)
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -584,6 +696,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, -v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -631,6 +751,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, |v2|
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -678,6 +806,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, -|v2|
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -734,6 +870,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, 1.0
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -806,6 +950,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, 0.15915494
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -885,6 +1037,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -921,6 +1081,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_mov_b32_e32 v0, 0x3e230000
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v2, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -969,6 +1138,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1005,6 +1182,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_mov_b32_e32 v0, 0x367c0000
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v2, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1058,6 +1244,17 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imm1:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, v0, v3, 1.0
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v1, v2, v1, 1.0
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_v2f32_f32imm1:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1112,6 +1309,17 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imm1:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_mad_f32 v0, v2, v0, 1.0
+; GISEL-GFX9GEN-NEXT: v_mad_f32 v1, v3, v1, 1.0
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32_f32imm1:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1168,6 +1376,18 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_mov_b32_e32 v1, 0x3e230000
+; SDAG-GFX9GEN-NEXT: v_madak_f32 v0, v0, v3, 0x3e230000
+; SDAG-GFX9GEN-NEXT: v_mac_f32_e32 v1, v2, v4
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1224,6 +1444,18 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_mov_b32_e32 v1, 0x3e230000
+; GISEL-GFX9GEN-NEXT: v_madak_f32 v0, v2, v0, 0x3e230000
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v1, v3, v4
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1283,6 +1515,17 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, v0, v3, 0.15915494
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v1, v2, v1, 0.15915494
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_v2f32_f32imminv2pi:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1338,6 +1581,17 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_mad_f32 v0, v2, v0, 0.15915494
+; GISEL-GFX9GEN-NEXT: v_mad_f32 v1, v3, v1, 0.15915494
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32_f32imminv2pi:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1386,6 +1640,15 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1440,6 +1703,12 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
; GFX906-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: no_mix_simple:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: no_mix_simple:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1474,6 +1743,12 @@ define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: no_mix_simple_fabs:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_mad_f32 v0, |v0|, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: no_mix_simple_fabs:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1516,6 +1791,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GEN-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1568,6 +1852,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1628,6 +1920,16 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0,
; GFX906-NEXT: v_add_f32_e32 v0, v0, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1691,6 +1993,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half
; GFX906-NEXT: v_add_f32_e32 v0, v0, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1741,6 +2052,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1791,6 +2111,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1838,6 +2166,15 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1
; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, -v0, v1, v2
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1854,6 +2191,15 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1
; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e64 v3, -v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1910,6 +2256,15 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GEN-NEXT: v_mad_f32 v0, |v0|, v1, v2
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1966,6 +2321,15 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9GEN-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; GFX9GEN: ; %bb.0:
+; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; VI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2021,6 +2385,15 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, -v0, v1, v2
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2038,6 +2411,16 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2087,6 +2470,15 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2104,6 +2496,16 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2153,6 +2555,15 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg,
; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX9GEN: ; %bb.0:
+; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-GFX9GEN-NEXT: v_mad_f32 v0, -v0, v1, v2
+; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2170,6 +2581,16 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg,
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; GISEL-GFX9GEN: ; %bb.0:
+; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9GEN-NEXT: v_or_b32_e32 v0, 0x80008000, v0
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GISEL-GFX9GEN-NEXT: v_mac_f32_e32 v0, v3, v1
+; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 8cf70eaf29e8..1dd18b4228fe 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -59,8 +59,9 @@ define void @issue63986(i64 %0, i64 %idxprom) {
; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12
; CHECK-NEXT: s_cbranch_vccz .LBB0_2
; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header
-; CHECK-NEXT: s_and_b32 s4, 32, 15
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB0_5
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 89ccf97c3f4b..77962fadcacf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1224,14 +1224,14 @@ define amdgpu_kernel void @agent_acquire_fence() {
;
; GFX10-WGP-LABEL: agent_acquire_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: agent_acquire_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: agent_acquire_fence:
@@ -1260,14 +1260,14 @@ define amdgpu_kernel void @agent_acquire_fence() {
;
; GFX11-WGP-LABEL: agent_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire
@@ -1338,14 +1338,14 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX10-WGP-LABEL: agent_acq_rel_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: agent_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence:
@@ -1376,14 +1376,14 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX11-WGP-LABEL: agent_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
@@ -1403,14 +1403,14 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX10-WGP-LABEL: agent_seq_cst_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: agent_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence:
@@ -1441,14 +1441,14 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX11-WGP-LABEL: agent_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
@@ -1468,14 +1468,14 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
;
; GFX10-WGP-LABEL: agent_one_as_acquire_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: agent_one_as_acquire_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence:
@@ -1504,14 +1504,14 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
;
; GFX11-WGP-LABEL: agent_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire
@@ -1582,14 +1582,14 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
;
; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: agent_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence:
@@ -1620,14 +1620,14 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
;
; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
@@ -1647,14 +1647,14 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
;
; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: agent_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence:
@@ -1685,14 +1685,14 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
;
; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: agent_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
@@ -1712,14 +1712,14 @@ define amdgpu_kernel void @system_acquire_fence() {
;
; GFX10-WGP-LABEL: system_acquire_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: system_acquire_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: system_acquire_fence:
@@ -1750,14 +1750,14 @@ define amdgpu_kernel void @system_acquire_fence() {
;
; GFX11-WGP-LABEL: system_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence acquire
@@ -1830,14 +1830,14 @@ define amdgpu_kernel void @system_acq_rel_fence() {
;
; GFX10-WGP-LABEL: system_acq_rel_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: system_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: system_acq_rel_fence:
@@ -1872,14 +1872,14 @@ define amdgpu_kernel void @system_acq_rel_fence() {
;
; GFX11-WGP-LABEL: system_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence acq_rel
@@ -1899,14 +1899,14 @@ define amdgpu_kernel void @system_seq_cst_fence() {
;
; GFX10-WGP-LABEL: system_seq_cst_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: system_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: system_seq_cst_fence:
@@ -1941,14 +1941,14 @@ define amdgpu_kernel void @system_seq_cst_fence() {
;
; GFX11-WGP-LABEL: system_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence seq_cst
@@ -1968,14 +1968,14 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
;
; GFX10-WGP-LABEL: system_one_as_acquire_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: system_one_as_acquire_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence:
@@ -2006,14 +2006,14 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
;
; GFX11-WGP-LABEL: system_one_as_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire
@@ -2086,14 +2086,14 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
;
; GFX10-WGP-LABEL: system_one_as_acq_rel_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: system_one_as_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence:
@@ -2128,14 +2128,14 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
;
; GFX11-WGP-LABEL: system_one_as_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
@@ -2155,14 +2155,14 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
;
; GFX10-WGP-LABEL: system_one_as_seq_cst_fence:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: system_one_as_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence:
@@ -2197,14 +2197,14 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
;
; GFX11-WGP-LABEL: system_one_as_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: system_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index d6c759f733b4..d9f5e64eae92 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -299,8 +299,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -314,8 +314,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -397,8 +397,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -410,8 +410,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -445,8 +445,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -460,8 +460,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -543,8 +543,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -556,8 +556,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -1168,8 +1168,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
@@ -1184,8 +1184,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acquire_atomicrmw:
@@ -1259,8 +1259,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_atomicrmw:
@@ -1274,8 +1274,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -1425,8 +1425,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
@@ -1441,8 +1441,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_atomicrmw:
@@ -1518,8 +1518,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_atomicrmw:
@@ -1533,8 +1533,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -1568,8 +1568,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
@@ -1584,8 +1584,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_atomicrmw:
@@ -1661,8 +1661,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_atomicrmw:
@@ -1676,8 +1676,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -1711,8 +1711,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1727,8 +1727,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1807,8 +1807,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -1822,8 +1822,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
@@ -1859,8 +1859,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1875,8 +1875,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1957,8 +1957,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -1972,8 +1972,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
@@ -2009,8 +2009,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2025,8 +2025,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2107,8 +2107,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -2122,8 +2122,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
@@ -2275,8 +2275,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
@@ -2292,8 +2292,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg:
@@ -2363,8 +2363,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
@@ -2376,8 +2376,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2530,8 +2530,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
@@ -2547,8 +2547,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
@@ -2620,8 +2620,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
@@ -2633,8 +2633,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2672,8 +2672,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
@@ -2689,8 +2689,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
@@ -2762,8 +2762,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
@@ -2775,8 +2775,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2814,8 +2814,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
@@ -2831,8 +2831,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_cmpxchg:
@@ -2902,8 +2902,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
@@ -2915,8 +2915,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2954,8 +2954,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
@@ -2971,8 +2971,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg:
@@ -3042,8 +3042,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
@@ -3055,8 +3055,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3094,8 +3094,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
@@ -3111,8 +3111,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg:
@@ -3184,8 +3184,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_acquire_cmpxchg:
@@ -3197,8 +3197,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3236,8 +3236,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
@@ -3253,8 +3253,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
@@ -3326,8 +3326,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
@@ -3339,8 +3339,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3378,8 +3378,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
@@ -3395,8 +3395,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
@@ -3468,8 +3468,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
@@ -3481,8 +3481,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3520,8 +3520,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
@@ -3537,8 +3537,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
@@ -3610,8 +3610,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
@@ -3623,8 +3623,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3662,8 +3662,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
@@ -3679,8 +3679,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
@@ -3752,8 +3752,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
@@ -3765,8 +3765,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3804,8 +3804,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
@@ -3821,8 +3821,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_cmpxchg:
@@ -3894,8 +3894,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
@@ -3907,8 +3907,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3946,8 +3946,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
@@ -3963,8 +3963,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
@@ -4036,8 +4036,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
@@ -4049,8 +4049,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -4088,8 +4088,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
@@ -4105,8 +4105,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
@@ -4178,8 +4178,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
@@ -4191,8 +4191,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -4375,8 +4375,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -4394,8 +4394,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -4474,8 +4474,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -4487,8 +4487,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -4676,8 +4676,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -4695,8 +4695,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -4777,8 +4777,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -4790,8 +4790,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -4834,8 +4834,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -4853,8 +4853,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -4935,8 +4935,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -4948,8 +4948,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -4992,8 +4992,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5011,8 +5011,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5091,8 +5091,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5104,8 +5104,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5148,8 +5148,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5167,8 +5167,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5247,8 +5247,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5260,8 +5260,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5304,8 +5304,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5323,8 +5323,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5405,8 +5405,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5418,8 +5418,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5462,8 +5462,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5481,8 +5481,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5563,8 +5563,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5576,8 +5576,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5620,8 +5620,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5639,8 +5639,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5721,8 +5721,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5734,8 +5734,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5778,8 +5778,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5797,8 +5797,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5879,8 +5879,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5892,8 +5892,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5936,8 +5936,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5955,8 +5955,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6037,8 +6037,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6050,8 +6050,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6094,8 +6094,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6113,8 +6113,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6195,8 +6195,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6208,8 +6208,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6252,8 +6252,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6271,8 +6271,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6353,8 +6353,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6366,8 +6366,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6410,8 +6410,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6429,8 +6429,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6511,8 +6511,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6524,8 +6524,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6827,8 +6827,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6843,8 +6843,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -6930,8 +6930,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
@@ -6944,8 +6944,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
@@ -6981,8 +6981,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6997,8 +6997,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -7084,8 +7084,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
@@ -7098,8 +7098,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
@@ -7710,8 +7710,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
@@ -7725,8 +7725,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw:
@@ -7799,8 +7799,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
@@ -7813,8 +7813,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -7963,8 +7963,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
@@ -7978,8 +7978,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
@@ -8054,8 +8054,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
@@ -8068,8 +8068,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -8102,8 +8102,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
@@ -8117,8 +8117,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
@@ -8193,8 +8193,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
@@ -8207,8 +8207,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -8243,8 +8243,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
@@ -8260,8 +8260,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
@@ -8343,8 +8343,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -8359,8 +8359,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -8398,8 +8398,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
@@ -8415,8 +8415,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
@@ -8500,8 +8500,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -8516,8 +8516,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -8555,8 +8555,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
@@ -8572,8 +8572,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
@@ -8657,8 +8657,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -8673,8 +8673,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -8826,8 +8826,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
@@ -8842,8 +8842,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
@@ -8912,8 +8912,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
@@ -8924,8 +8924,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9077,8 +9077,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
@@ -9093,8 +9093,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
@@ -9165,8 +9165,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
@@ -9177,8 +9177,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9215,8 +9215,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
@@ -9231,8 +9231,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
@@ -9303,8 +9303,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
@@ -9315,8 +9315,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9353,8 +9353,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
@@ -9369,8 +9369,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
@@ -9439,8 +9439,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
@@ -9451,8 +9451,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9489,8 +9489,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
@@ -9505,8 +9505,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
@@ -9575,8 +9575,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
@@ -9587,8 +9587,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9625,8 +9625,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
@@ -9641,8 +9641,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
@@ -9713,8 +9713,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
@@ -9725,8 +9725,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9763,8 +9763,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
@@ -9779,8 +9779,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
@@ -9851,8 +9851,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
@@ -9863,8 +9863,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9901,8 +9901,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
@@ -9917,8 +9917,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
@@ -9989,8 +9989,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
@@ -10001,8 +10001,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10039,8 +10039,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
@@ -10055,8 +10055,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
@@ -10127,8 +10127,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
@@ -10139,8 +10139,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10177,8 +10177,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
@@ -10193,8 +10193,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
@@ -10265,8 +10265,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
@@ -10277,8 +10277,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10315,8 +10315,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
@@ -10331,8 +10331,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
@@ -10403,8 +10403,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
@@ -10415,8 +10415,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10453,8 +10453,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10469,8 +10469,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10541,8 +10541,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10553,8 +10553,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10591,8 +10591,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10607,8 +10607,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10679,8 +10679,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10691,8 +10691,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10876,8 +10876,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10896,8 +10896,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10980,8 +10980,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -10994,8 +10994,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11185,8 +11185,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11205,8 +11205,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11291,8 +11291,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11305,8 +11305,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11351,8 +11351,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11371,8 +11371,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11457,8 +11457,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11471,8 +11471,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11517,8 +11517,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11537,8 +11537,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11621,8 +11621,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11635,8 +11635,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11681,8 +11681,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11701,8 +11701,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11785,8 +11785,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11799,8 +11799,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11845,8 +11845,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11865,8 +11865,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11951,8 +11951,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11965,8 +11965,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12011,8 +12011,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12031,8 +12031,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12117,8 +12117,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12131,8 +12131,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12177,8 +12177,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12197,8 +12197,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12283,8 +12283,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12297,8 +12297,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12343,8 +12343,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12363,8 +12363,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12449,8 +12449,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12463,8 +12463,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12509,8 +12509,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12529,8 +12529,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12615,8 +12615,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12629,8 +12629,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12675,8 +12675,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12695,8 +12695,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12781,8 +12781,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12795,8 +12795,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12841,8 +12841,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12861,8 +12861,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12947,8 +12947,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12961,8 +12961,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -13007,8 +13007,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13027,8 +13027,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13113,8 +13113,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -13127,8 +13127,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 461fe8b1296a..372f9ada5839 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -299,8 +299,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -314,8 +314,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -399,8 +399,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -412,8 +412,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -447,8 +447,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -462,8 +462,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -547,8 +547,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -560,8 +560,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -1176,8 +1176,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
@@ -1192,8 +1192,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw:
@@ -1269,8 +1269,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_atomicrmw:
@@ -1284,8 +1284,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -1437,8 +1437,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
@@ -1453,8 +1453,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw:
@@ -1534,8 +1534,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw:
@@ -1549,8 +1549,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -1584,8 +1584,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
@@ -1600,8 +1600,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw:
@@ -1681,8 +1681,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw:
@@ -1696,8 +1696,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -1731,8 +1731,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1747,8 +1747,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1829,8 +1829,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -1844,8 +1844,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
@@ -1881,8 +1881,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1897,8 +1897,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -1983,8 +1983,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -1998,8 +1998,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
@@ -2035,8 +2035,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2051,8 +2051,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@@ -2137,8 +2137,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -2152,8 +2152,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
@@ -2305,8 +2305,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
@@ -2322,8 +2322,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg:
@@ -2395,8 +2395,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
@@ -2408,8 +2408,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2564,8 +2564,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
@@ -2581,8 +2581,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
@@ -2658,8 +2658,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
@@ -2671,8 +2671,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2710,8 +2710,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
@@ -2727,8 +2727,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
@@ -2804,8 +2804,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
@@ -2817,8 +2817,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2856,8 +2856,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
@@ -2873,8 +2873,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg:
@@ -2946,8 +2946,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
@@ -2959,8 +2959,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -2998,8 +2998,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
@@ -3015,8 +3015,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg:
@@ -3088,8 +3088,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
@@ -3101,8 +3101,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3140,8 +3140,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
@@ -3157,8 +3157,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg:
@@ -3234,8 +3234,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg:
@@ -3247,8 +3247,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3286,8 +3286,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
@@ -3303,8 +3303,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg:
@@ -3380,8 +3380,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
@@ -3393,8 +3393,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3432,8 +3432,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
@@ -3449,8 +3449,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg:
@@ -3526,8 +3526,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
@@ -3539,8 +3539,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3578,8 +3578,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
@@ -3595,8 +3595,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
@@ -3672,8 +3672,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
@@ -3685,8 +3685,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3724,8 +3724,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
@@ -3741,8 +3741,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg:
@@ -3818,8 +3818,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
@@ -3831,8 +3831,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -3870,8 +3870,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
@@ -3887,8 +3887,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg:
@@ -3964,8 +3964,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
@@ -3977,8 +3977,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -4016,8 +4016,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
@@ -4033,8 +4033,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
@@ -4110,8 +4110,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
@@ -4123,8 +4123,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -4162,8 +4162,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
@@ -4179,8 +4179,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
@@ -4256,8 +4256,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
@@ -4269,8 +4269,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -4453,8 +4453,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -4472,8 +4472,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -4554,8 +4554,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -4567,8 +4567,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -4758,8 +4758,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -4777,8 +4777,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -4863,8 +4863,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -4876,8 +4876,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -4920,8 +4920,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -4939,8 +4939,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5025,8 +5025,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5082,8 +5082,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5101,8 +5101,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5183,8 +5183,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5196,8 +5196,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5240,8 +5240,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5259,8 +5259,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5341,8 +5341,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5354,8 +5354,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5398,8 +5398,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5417,8 +5417,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5503,8 +5503,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5516,8 +5516,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5560,8 +5560,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5579,8 +5579,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5665,8 +5665,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5678,8 +5678,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5722,8 +5722,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5741,8 +5741,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5827,8 +5827,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -5840,8 +5840,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -5884,8 +5884,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -5903,8 +5903,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -5989,8 +5989,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6002,8 +6002,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6046,8 +6046,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6065,8 +6065,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6151,8 +6151,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6164,8 +6164,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6208,8 +6208,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6227,8 +6227,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6313,8 +6313,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6326,8 +6326,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6370,8 +6370,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6389,8 +6389,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6475,8 +6475,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6488,8 +6488,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6532,8 +6532,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
@@ -6551,8 +6551,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
@@ -6637,8 +6637,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
@@ -6650,8 +6650,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
@@ -6953,8 +6953,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6969,8 +6969,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -7058,8 +7058,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
@@ -7072,8 +7072,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
@@ -7109,8 +7109,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7125,8 +7125,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -7214,8 +7214,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
@@ -7228,8 +7228,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
@@ -7844,8 +7844,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
@@ -7859,8 +7859,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw:
@@ -7935,8 +7935,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
@@ -7949,8 +7949,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -8101,8 +8101,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
@@ -8116,8 +8116,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw:
@@ -8196,8 +8196,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
@@ -8210,8 +8210,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -8244,8 +8244,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
@@ -8259,8 +8259,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw:
@@ -8339,8 +8339,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
@@ -8353,8 +8353,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
@@ -8389,8 +8389,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
@@ -8406,8 +8406,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
@@ -8491,8 +8491,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -8507,8 +8507,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -8546,8 +8546,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
@@ -8563,8 +8563,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
@@ -8652,8 +8652,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -8668,8 +8668,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -8707,8 +8707,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
@@ -8724,8 +8724,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
@@ -8813,8 +8813,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -8829,8 +8829,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -8982,8 +8982,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
@@ -8998,8 +8998,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
@@ -9070,8 +9070,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
@@ -9082,8 +9082,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9237,8 +9237,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
@@ -9253,8 +9253,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
@@ -9329,8 +9329,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
@@ -9341,8 +9341,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9379,8 +9379,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
@@ -9395,8 +9395,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
@@ -9471,8 +9471,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
@@ -9483,8 +9483,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9521,8 +9521,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
@@ -9537,8 +9537,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
@@ -9609,8 +9609,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
@@ -9621,8 +9621,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9659,8 +9659,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
@@ -9675,8 +9675,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
@@ -9747,8 +9747,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
@@ -9759,8 +9759,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9797,8 +9797,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
@@ -9813,8 +9813,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg:
@@ -9889,8 +9889,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
@@ -9901,8 +9901,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -9939,8 +9939,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
@@ -9955,8 +9955,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
@@ -10031,8 +10031,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
@@ -10043,8 +10043,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10081,8 +10081,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
@@ -10097,8 +10097,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
@@ -10173,8 +10173,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
@@ -10185,8 +10185,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10223,8 +10223,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
@@ -10239,8 +10239,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
@@ -10315,8 +10315,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
@@ -10327,8 +10327,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10365,8 +10365,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
@@ -10381,8 +10381,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
@@ -10457,8 +10457,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
@@ -10469,8 +10469,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10507,8 +10507,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
@@ -10523,8 +10523,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
@@ -10599,8 +10599,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
@@ -10611,8 +10611,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10649,8 +10649,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10665,8 +10665,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10741,8 +10741,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10753,8 +10753,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -10791,8 +10791,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10807,8 +10807,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10883,8 +10883,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10895,8 +10895,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
@@ -11080,8 +11080,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11100,8 +11100,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11186,8 +11186,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11200,8 +11200,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11393,8 +11393,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11413,8 +11413,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11503,8 +11503,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11517,8 +11517,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11563,8 +11563,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11583,8 +11583,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11673,8 +11673,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11687,8 +11687,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11733,8 +11733,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,8 +11753,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11839,8 +11839,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -11853,8 +11853,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -11899,8 +11899,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,8 +11919,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12005,8 +12005,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12019,8 +12019,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12065,8 +12065,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12085,8 +12085,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12175,8 +12175,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12189,8 +12189,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12235,8 +12235,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12255,8 +12255,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12345,8 +12345,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12359,8 +12359,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12405,8 +12405,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12425,8 +12425,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12515,8 +12515,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12529,8 +12529,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12575,8 +12575,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12595,8 +12595,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12685,8 +12685,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12699,8 +12699,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12745,8 +12745,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12765,8 +12765,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12855,8 +12855,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -12869,8 +12869,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -12915,8 +12915,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12935,8 +12935,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13025,8 +13025,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -13039,8 +13039,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -13085,8 +13085,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13105,8 +13105,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13195,8 +13195,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -13209,8 +13209,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
@@ -13255,8 +13255,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13275,8 +13275,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13365,8 +13365,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
@@ -13379,8 +13379,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 0a7a07e36144..0449e4168296 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -317,8 +317,8 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -329,8 +329,8 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -400,8 +400,8 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -414,8 +414,8 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -465,8 +465,8 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -477,8 +477,8 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -548,8 +548,8 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -562,8 +562,8 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1241,8 +1241,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acquire_atomicrmw:
@@ -1255,8 +1255,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acquire_atomicrmw:
@@ -1328,8 +1328,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_atomicrmw:
@@ -1341,8 +1341,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -1514,8 +1514,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acq_rel_atomicrmw:
@@ -1528,8 +1528,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_atomicrmw:
@@ -1603,8 +1603,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_atomicrmw:
@@ -1616,8 +1616,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -1662,8 +1662,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_seq_cst_atomicrmw:
@@ -1676,8 +1676,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_atomicrmw:
@@ -1751,8 +1751,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_atomicrmw:
@@ -1764,8 +1764,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -1812,8 +1812,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1827,8 +1827,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1906,8 +1906,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1922,8 +1922,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1974,8 +1974,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1989,8 +1989,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2070,8 +2070,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2086,8 +2086,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2138,8 +2138,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2153,8 +2153,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2234,8 +2234,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2250,8 +2250,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2427,8 +2427,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
@@ -2440,8 +2440,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_cmpxchg:
@@ -2510,8 +2510,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
@@ -2522,8 +2522,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -2698,8 +2698,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
@@ -2711,8 +2711,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
@@ -2783,8 +2783,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
@@ -2795,8 +2795,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -2845,8 +2845,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
@@ -2858,8 +2858,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
@@ -2930,8 +2930,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
@@ -2942,8 +2942,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -2992,8 +2992,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
@@ -3005,8 +3005,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_cmpxchg:
@@ -3075,8 +3075,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
@@ -3087,8 +3087,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3137,8 +3137,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
@@ -3150,8 +3150,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_cmpxchg:
@@ -3220,8 +3220,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
@@ -3232,8 +3232,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3282,8 +3282,8 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg:
@@ -3295,8 +3295,8 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_cmpxchg:
@@ -3367,8 +3367,8 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg:
@@ -3379,8 +3379,8 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3429,8 +3429,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
@@ -3442,8 +3442,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_cmpxchg:
@@ -3514,8 +3514,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
@@ -3526,8 +3526,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3576,8 +3576,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
@@ -3589,8 +3589,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_cmpxchg:
@@ -3661,8 +3661,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
@@ -3673,8 +3673,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3723,8 +3723,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
@@ -3736,8 +3736,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
@@ -3808,8 +3808,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
@@ -3820,8 +3820,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3870,8 +3870,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
@@ -3883,8 +3883,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_cmpxchg:
@@ -3955,8 +3955,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
@@ -3967,8 +3967,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4017,8 +4017,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
@@ -4030,8 +4030,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_cmpxchg:
@@ -4102,8 +4102,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
@@ -4114,8 +4114,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4164,8 +4164,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
@@ -4177,8 +4177,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
@@ -4261,8 +4261,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4311,8 +4311,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
@@ -4324,8 +4324,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
@@ -4396,8 +4396,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
@@ -4408,8 +4408,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4612,8 +4612,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4626,8 +4626,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -4702,8 +4702,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4717,8 +4717,8 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4928,8 +4928,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4942,8 +4942,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5020,8 +5020,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5035,8 +5035,8 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5094,8 +5094,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5108,8 +5108,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5186,8 +5186,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5201,8 +5201,8 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5260,8 +5260,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5274,8 +5274,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5350,8 +5350,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5365,8 +5365,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5424,8 +5424,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5438,8 +5438,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5514,8 +5514,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5529,8 +5529,8 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5588,8 +5588,8 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5602,8 +5602,8 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5680,8 +5680,8 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5695,8 +5695,8 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5754,8 +5754,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5768,8 +5768,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5846,8 +5846,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5861,8 +5861,8 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5920,8 +5920,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5934,8 +5934,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6012,8 +6012,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6027,8 +6027,8 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6086,8 +6086,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6100,8 +6100,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6178,8 +6178,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6193,8 +6193,8 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6252,8 +6252,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6266,8 +6266,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6344,8 +6344,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6359,8 +6359,8 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6418,8 +6418,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6432,8 +6432,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6510,8 +6510,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6525,8 +6525,8 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6584,8 +6584,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6598,8 +6598,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6676,8 +6676,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6691,8 +6691,8 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6750,8 +6750,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6764,8 +6764,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6842,8 +6842,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6857,8 +6857,8 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7178,8 +7178,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -7190,8 +7190,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -7261,8 +7261,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7275,8 +7275,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7326,8 +7326,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -7338,8 +7338,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -7409,8 +7409,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7423,8 +7423,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8102,8 +8102,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
@@ -8116,8 +8116,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw:
@@ -8189,8 +8189,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
@@ -8202,8 +8202,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -8375,8 +8375,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
@@ -8389,8 +8389,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw:
@@ -8464,8 +8464,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
@@ -8477,8 +8477,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -8523,8 +8523,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
@@ -8537,8 +8537,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw:
@@ -8612,8 +8612,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
@@ -8625,8 +8625,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -8673,8 +8673,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -8688,8 +8688,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -8767,8 +8767,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8783,8 +8783,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8835,8 +8835,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -8850,8 +8850,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -8931,8 +8931,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8947,8 +8947,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8999,8 +8999,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -9014,8 +9014,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -9095,8 +9095,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9111,8 +9111,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9288,8 +9288,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
@@ -9301,8 +9301,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
@@ -9371,8 +9371,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
@@ -9383,8 +9383,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9559,8 +9559,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
@@ -9572,8 +9572,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
@@ -9644,8 +9644,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
@@ -9656,8 +9656,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9706,8 +9706,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
@@ -9719,8 +9719,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
@@ -9791,8 +9791,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
@@ -9803,8 +9803,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9853,8 +9853,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
@@ -9866,8 +9866,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
@@ -9936,8 +9936,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
@@ -9948,8 +9948,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9998,8 +9998,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
@@ -10011,8 +10011,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
@@ -10081,8 +10081,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
@@ -10093,8 +10093,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10143,8 +10143,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
@@ -10156,8 +10156,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg:
@@ -10228,8 +10228,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
@@ -10240,8 +10240,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10290,8 +10290,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
@@ -10303,8 +10303,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
@@ -10375,8 +10375,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
@@ -10387,8 +10387,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10437,8 +10437,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
@@ -10450,8 +10450,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
@@ -10522,8 +10522,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
@@ -10534,8 +10534,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10584,8 +10584,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
@@ -10597,8 +10597,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
@@ -10669,8 +10669,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
@@ -10681,8 +10681,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10731,8 +10731,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
@@ -10744,8 +10744,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
@@ -10816,8 +10816,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
@@ -10828,8 +10828,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10878,8 +10878,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
@@ -10891,8 +10891,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
@@ -10963,8 +10963,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
@@ -10975,8 +10975,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -11025,8 +11025,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
@@ -11038,8 +11038,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
@@ -11110,8 +11110,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
@@ -11122,8 +11122,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -11172,8 +11172,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
@@ -11185,8 +11185,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
@@ -11257,8 +11257,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
@@ -11269,8 +11269,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -11473,8 +11473,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11487,8 +11487,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11563,8 +11563,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11578,8 +11578,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11637,8 +11637,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11651,8 +11651,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11729,8 +11729,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11744,8 +11744,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11803,8 +11803,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11817,8 +11817,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11895,8 +11895,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11910,8 +11910,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11969,8 +11969,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11983,8 +11983,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12059,8 +12059,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12074,8 +12074,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12133,8 +12133,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12147,8 +12147,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12223,8 +12223,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12238,8 +12238,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12297,8 +12297,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12311,8 +12311,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12389,8 +12389,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12404,8 +12404,8 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12463,8 +12463,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12477,8 +12477,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12555,8 +12555,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12570,8 +12570,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12629,8 +12629,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12643,8 +12643,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12721,8 +12721,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12736,8 +12736,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12795,8 +12795,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12809,8 +12809,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12887,8 +12887,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12902,8 +12902,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12961,8 +12961,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12975,8 +12975,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13053,8 +13053,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13068,8 +13068,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13127,8 +13127,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -13141,8 +13141,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13219,8 +13219,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13234,8 +13234,8 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13293,8 +13293,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -13307,8 +13307,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13385,8 +13385,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13400,8 +13400,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13459,8 +13459,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -13473,8 +13473,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13551,8 +13551,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13566,8 +13566,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 97b2f0813e5e..f611003a7ba5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -317,8 +317,8 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -329,8 +329,8 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -402,8 +402,8 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -416,8 +416,8 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -467,8 +467,8 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -479,8 +479,8 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -552,8 +552,8 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -566,8 +566,8 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1249,8 +1249,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_atomicrmw:
@@ -1263,8 +1263,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw:
@@ -1338,8 +1338,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_atomicrmw:
@@ -1351,8 +1351,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -1526,8 +1526,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw:
@@ -1540,8 +1540,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw:
@@ -1619,8 +1619,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw:
@@ -1632,8 +1632,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -1678,8 +1678,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw:
@@ -1692,8 +1692,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw:
@@ -1771,8 +1771,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw:
@@ -1784,8 +1784,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -1832,8 +1832,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1847,8 +1847,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -1928,8 +1928,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1944,8 +1944,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1996,8 +1996,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2011,8 +2011,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2096,8 +2096,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2112,8 +2112,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2164,8 +2164,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2179,8 +2179,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -2264,8 +2264,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2280,8 +2280,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2457,8 +2457,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
@@ -2470,8 +2470,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg:
@@ -2542,8 +2542,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
@@ -2554,8 +2554,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -2732,8 +2732,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
@@ -2745,8 +2745,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg:
@@ -2821,8 +2821,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
@@ -2833,8 +2833,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -2883,8 +2883,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
@@ -2896,8 +2896,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg:
@@ -2972,8 +2972,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
@@ -2984,8 +2984,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3034,8 +3034,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
@@ -3047,8 +3047,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg:
@@ -3119,8 +3119,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
@@ -3131,8 +3131,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3181,8 +3181,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg:
@@ -3194,8 +3194,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg:
@@ -3266,8 +3266,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg:
@@ -3278,8 +3278,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3328,8 +3328,8 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg:
@@ -3341,8 +3341,8 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg:
@@ -3417,8 +3417,8 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg:
@@ -3429,8 +3429,8 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3479,8 +3479,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
@@ -3492,8 +3492,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg:
@@ -3568,8 +3568,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
@@ -3580,8 +3580,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3630,8 +3630,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
@@ -3643,8 +3643,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg:
@@ -3719,8 +3719,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
@@ -3731,8 +3731,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -3781,8 +3781,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
@@ -3794,8 +3794,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
@@ -3870,8 +3870,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
@@ -3882,8 +3882,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -4086,8 +4086,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4100,8 +4100,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -4178,8 +4178,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4193,8 +4193,8 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4252,8 +4252,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4266,8 +4266,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -4348,8 +4348,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4363,8 +4363,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4422,8 +4422,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4436,8 +4436,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -4518,8 +4518,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4533,8 +4533,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4592,8 +4592,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4606,8 +4606,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -4684,8 +4684,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4699,8 +4699,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4758,8 +4758,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4772,8 +4772,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -4850,8 +4850,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4865,8 +4865,8 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4924,8 +4924,8 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -4938,8 +4938,8 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5020,8 +5020,8 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5035,8 +5035,8 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5094,8 +5094,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5108,8 +5108,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5190,8 +5190,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5205,8 +5205,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5264,8 +5264,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5278,8 +5278,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5360,8 +5360,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5375,8 +5375,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5434,8 +5434,8 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5448,8 +5448,8 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5530,8 +5530,8 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5545,8 +5545,8 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5604,8 +5604,8 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5618,8 +5618,8 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5700,8 +5700,8 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5715,8 +5715,8 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5774,8 +5774,8 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5788,8 +5788,8 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -5870,8 +5870,8 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5885,8 +5885,8 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5944,8 +5944,8 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -5958,8 +5958,8 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6040,8 +6040,8 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6055,8 +6055,8 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6114,8 +6114,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6128,8 +6128,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6210,8 +6210,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6225,8 +6225,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6546,8 +6546,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6558,8 +6558,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6631,8 +6631,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6645,8 +6645,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6696,8 +6696,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -6708,8 +6708,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT: s_endpgm
;
@@ -6781,8 +6781,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6795,8 +6795,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7478,8 +7478,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw:
@@ -7492,8 +7492,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw:
@@ -7567,8 +7567,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw:
@@ -7580,8 +7580,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -7755,8 +7755,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
@@ -7769,8 +7769,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw:
@@ -7848,8 +7848,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
@@ -7861,8 +7861,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -7907,8 +7907,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
@@ -7921,8 +7921,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw:
@@ -8000,8 +8000,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
@@ -8013,8 +8013,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
@@ -8061,8 +8061,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -8076,8 +8076,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -8157,8 +8157,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8173,8 +8173,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8225,8 +8225,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -8240,8 +8240,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -8325,8 +8325,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8341,8 +8341,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8393,8 +8393,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -8408,8 +8408,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -8493,8 +8493,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8509,8 +8509,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8686,8 +8686,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
@@ -8699,8 +8699,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
@@ -8771,8 +8771,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
@@ -8783,8 +8783,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -8961,8 +8961,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
@@ -8974,8 +8974,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
@@ -9050,8 +9050,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
@@ -9062,8 +9062,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9112,8 +9112,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
@@ -9125,8 +9125,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
@@ -9201,8 +9201,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
@@ -9213,8 +9213,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9263,8 +9263,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
@@ -9276,8 +9276,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
@@ -9348,8 +9348,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
@@ -9360,8 +9360,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9410,8 +9410,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
@@ -9423,8 +9423,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
@@ -9495,8 +9495,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
@@ -9507,8 +9507,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9557,8 +9557,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
@@ -9570,8 +9570,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg:
@@ -9646,8 +9646,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
@@ -9658,8 +9658,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9708,8 +9708,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
@@ -9721,8 +9721,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
@@ -9797,8 +9797,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
@@ -9809,8 +9809,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -9859,8 +9859,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
@@ -9872,8 +9872,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
@@ -9948,8 +9948,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
@@ -9960,8 +9960,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10010,8 +10010,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
@@ -10023,8 +10023,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
@@ -10099,8 +10099,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
@@ -10111,8 +10111,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10161,8 +10161,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
@@ -10174,8 +10174,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
@@ -10250,8 +10250,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
@@ -10262,8 +10262,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10312,8 +10312,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
@@ -10325,8 +10325,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
@@ -10401,8 +10401,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
@@ -10413,8 +10413,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10463,8 +10463,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10476,8 +10476,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10552,8 +10552,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
@@ -10564,8 +10564,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10614,8 +10614,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10627,8 +10627,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10703,8 +10703,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
@@ -10715,8 +10715,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
@@ -10919,8 +10919,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -10933,8 +10933,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11011,8 +11011,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11026,8 +11026,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11239,8 +11239,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11253,8 +11253,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11335,8 +11335,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11350,8 +11350,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11409,8 +11409,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11423,8 +11423,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11505,8 +11505,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11520,8 +11520,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11579,8 +11579,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11593,8 +11593,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11671,8 +11671,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11686,8 +11686,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11745,8 +11745,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11759,8 +11759,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -11837,8 +11837,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11852,8 +11852,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -11911,8 +11911,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -11925,8 +11925,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12007,8 +12007,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12022,8 +12022,8 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12081,8 +12081,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12095,8 +12095,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12177,8 +12177,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12192,8 +12192,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12251,8 +12251,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12265,8 +12265,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12347,8 +12347,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12362,8 +12362,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12421,8 +12421,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12435,8 +12435,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12517,8 +12517,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12532,8 +12532,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12591,8 +12591,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12605,8 +12605,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12687,8 +12687,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12702,8 +12702,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12761,8 +12761,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12775,8 +12775,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -12857,8 +12857,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12872,8 +12872,8 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -12931,8 +12931,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -12945,8 +12945,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13027,8 +13027,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13042,8 +13042,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13101,8 +13101,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
@@ -13115,8 +13115,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
@@ -13197,8 +13197,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_nop 0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -13212,8 +13212,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_nop 0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index ee2c59092fb6..940287d44d8d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -390,7 +390,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-NEXT: s_add_u32 s16, s16, s3
; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4
+; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_brev_b32 s0, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -401,7 +401,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
+; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -421,11 +421,11 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
-; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
+; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
-; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
+; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
@@ -460,15 +460,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32
; GCN-NEXT: s_add_u32 s4, s4, s3
; GCN-NEXT: s_addc_u32 s5, s5, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:8
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: exp mrt0 v0, off, off, off done vm
@@ -482,15 +482,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000
-; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
+; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:8
+; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:4
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
; GCN-SCRATCH-NEXT: s_clause 0x1
-; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off offset:4
-; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:8
+; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off
+; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:4
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
index 301f971a6fb7..4ba5f3abcb24 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -137,11 +137,11 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
; so a possibly negative base index can't be used for the vgpr offset.
; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr:
-; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4
+; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 0
; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]]
; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
-; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4,
+; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 0,
; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32
define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 {
%alloca = alloca [16 x i32], align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 75da11bf9a09..45fbaaabc65b 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -39,7 +39,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -86,7 +86,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index d6d559bd8119..d898a13cc619 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -110,7 +110,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_writelane_b32 v2, s10, 62
; GCN-NEXT: v_writelane_b32 v2, s11, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
@@ -201,7 +201,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_writelane_b32 v1, s10, 62
; GCN-NEXT: v_writelane_b32 v1, s11, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
@@ -215,7 +215,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_writelane_b32 v0, s10, 6
; GCN-NEXT: v_writelane_b32 v0, s11, 7
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -223,10 +223,10 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_readlane_b32 s8, v2, 56
@@ -319,7 +319,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_readlane_b32 s6, v1, 6
; GCN-NEXT: v_readlane_b32 s7, v1, 7
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[0:7]
@@ -423,13 +423,13 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ; kill: killed $vgpr2
; GCN-NEXT: ; kill: killed $vgpr1
@@ -570,7 +570,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v1, s18, 62
; GCN-NEXT: v_writelane_b32 v1, s19, 63
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
@@ -589,7 +589,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v0, s2, 8
; GCN-NEXT: v_writelane_b32 v0, s3, 9
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -597,10 +597,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_readlane_b32 s16, v1, 8
@@ -698,10 +698,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB1_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
@@ -747,10 +747,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
@@ -840,7 +840,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v1, s18, 62
; GCN-NEXT: v_writelane_b32 v1, s19, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[2:3]
@@ -849,7 +849,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v0, s2, 0
; GCN-NEXT: v_writelane_b32 v0, s3, 1
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -857,7 +857,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s36, v1, 32
@@ -909,7 +909,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_readlane_b32 s30, v1, 14
; GCN-NEXT: v_readlane_b32 s31, v1, 15
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[16:31]
@@ -947,10 +947,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB2_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
@@ -999,10 +999,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
@@ -1092,7 +1092,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: v_writelane_b32 v1, s18, 62
; GCN-NEXT: v_writelane_b32 v1, s19, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[2:3]
@@ -1101,7 +1101,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: v_writelane_b32 v0, s2, 0
; GCN-NEXT: v_writelane_b32 v0, s3, 1
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -1109,7 +1109,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s36, v2, 32
@@ -1161,7 +1161,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: v_readlane_b32 s30, v2, 14
; GCN-NEXT: v_readlane_b32 s31, v2, 15
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def v0
@@ -1205,10 +1205,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB3_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
index d3493391863b..8e2a56b463c4 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
@@ -36,12 +36,12 @@ body: |
; GCN-LABEL: name: preserve_active_lanes_above_args
; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
- ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
$vgpr8 = COPY renamable killed $vgpr10
@@ -70,8 +70,8 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
@@ -81,8 +81,8 @@ body: |
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
- ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
@@ -142,8 +142,8 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
@@ -152,8 +152,8 @@ body: |
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
- ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
index ae920f9a5755..765597fecd20 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
@@ -38,14 +38,14 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
- ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
@@ -72,8 +72,8 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
@@ -86,8 +86,8 @@ body: |
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
- ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 75feac35dacd..a70488a00db7 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,8 +1,11 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA %s
; GCN: preload_kernarg_header
-; GCN-COUNT-64: s_nop 0
+; HSA: s_trap 2
+; NON-HSA: s_endpgm
+; GCN-COUNT-63: s_nop 0
define amdgpu_kernel void @preload_kernarg_header(ptr %arg) {
store ptr %arg, ptr %arg
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 57980214e58e..d20c3a4007ff 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,1856 +1,3681 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-8 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i8:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i8:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i8:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i8:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i8:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i8:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i8:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i8:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i8:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xffff
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
-; Check alignment on the second preloaded arg.
define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
-; NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10
-; NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: s_add_i32 s0, s5, s4
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s3, s[0:1], 0x10
-; PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: s_add_i32 s0, s2, s3
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT: s_add_i32 s0, s2, s0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_add_i32 s0, s2, s6
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_add_i32 s0, s2, s6
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_add_i32 s0, s5, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s3, s[0:1], 0x10
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s2, s3
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s2, s6
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
-; NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: s_lshr_b32 s0, s4, 16
-; NO-PRELOAD-NEXT: s_and_b32 s1, s4, 0xffff
-; NO-PRELOAD-NEXT: s_add_i32 s0, s1, s0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: s_lshr_b32 s1, s0, 16
-; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff
-; PRELOAD-1-NEXT: s_add_i32 s0, s0, s1
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16
-; PRELOAD-2-NEXT: s_add_i32 s0, s1, s0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff
-; PRELOAD-4-NEXT: s_add_i32 s0, s1, s0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff
-; PRELOAD-8-NEXT: s_add_i32 s0, s1, s0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT: s_add_i32 s0, s1, s0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s1, s0, 16
+; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s0, s1
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16
+; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_lshr_b32 s3, s2, 16
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT: s_add_i32 s2, s2, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s1, s0, 16
+; GFX90a-PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX90a-PRELOAD-1-NEXT: s_add_i32 s0, s0, s1
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16
+; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-4-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
@@ -1859,3563 +3684,7068 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
}
define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
-; NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
-; NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
-; PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0
-; PRELOAD-4-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
-; PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-4-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-4-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
-; Don't try to preload byref args.
define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
-; NO-PRELOAD-LABEL: byref_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3
-; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
-; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: byref_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1
-; PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: byref_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: byref_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: byref_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
ret void
}
-; TODO: Should do partial preload in cases like these where only part of the arg
-; can be preloaded.
define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
-; NO-PRELOAD-LABEL: v8i32_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
-; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; NO-PRELOAD-NEXT: s_nop 1
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s7
-; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v8i32_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-1-NEXT: s_nop 1
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v8i32_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-2-NEXT: s_nop 1
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v8i32_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-4-NEXT: s_nop 1
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v8i32_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-8-NEXT: s_nop 1
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_nop 1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v8i32_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_nop 1
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v8i32_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_nop 1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v8i32_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_nop 1
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v8i32_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_nop 1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX90a-NO-PRELOAD-NEXT: s_nop 0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
-; NO-PRELOAD-LABEL: v3i16_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v3i16_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0
-; PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v3i16_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v3i16_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v3i16_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
-; PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
-; NO-PRELOAD-LABEL: v3i32_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v3i32_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v3i32_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
-; PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v3i32_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s8
-; PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v3i32_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
-; PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
-; NO-PRELOAD-LABEL: v3f32_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v3f32_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v3f32_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
-; PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v3f32_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s8
-; PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v3f32_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
-; PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
-; NO-PRELOAD-LABEL: v5i8_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v5i8_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0
-; PRELOAD-1-NEXT: global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v5i8_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5
-; PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
-; PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v5i8_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 24
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s5
-; PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0
-; PRELOAD-4-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; PRELOAD-4-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v5i8_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5
-; PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
-; PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-4-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-4-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0
+; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-4-NEXT: global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-4-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
-; NO-PRELOAD-LABEL: v5f64_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; NO-PRELOAD-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
-; NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
-; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; NO-PRELOAD-NEXT: s_nop 1
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s7
-; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v5f64_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-1-NEXT: s_nop 1
-; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v5f64_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-2-NEXT: s_nop 1
-; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v5f64_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-4-NEXT: s_nop 1
-; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v5f64_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
-; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-8-NEXT: s_nop 1
-; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
-; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
-; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_nop 1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v5f64_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_nop 1
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v5f64_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_nop 1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v5f64_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_nop 1
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v5f64_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_nop 1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX90a-NO-PRELOAD-NEXT: s_nop 0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
-; NO-PRELOAD-LABEL: v8i8_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: v8i8_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: v8i8_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16
-; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
-; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0
-; PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: v8i8_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 8
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 24
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 16
-; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-4-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 24
-; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v2, 8, s0
-; PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: v8i8_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16
-; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
-; PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
-; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0
-; PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
-; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) {
-; NO-PRELOAD-LABEL: i64_kernel_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) {
-; NO-PRELOAD-LABEL: f64_kernel_preload_arg:
-; NO-PRELOAD: ; %bb.0:
-; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
-; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT: s_endpgm
-;
-; PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; PRELOAD-1: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: s_nop 0
-; PRELOAD-1-NEXT: ; %bb.0:
-; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT: s_endpgm
-;
-; PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; PRELOAD-2: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: s_nop 0
-; PRELOAD-2-NEXT: ; %bb.0:
-; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT: s_endpgm
-;
-; PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; PRELOAD-4: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: s_nop 0
-; PRELOAD-4-NEXT: ; %bb.0:
-; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT: s_endpgm
-;
-; PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; PRELOAD-8: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: s_nop 0
-; PRELOAD-8-NEXT: ; %bb.0:
-; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT: s_endpgm
+; GFX940-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1-NEXT: ; %bb.0:
+; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4-NEXT: ; %bb.0:
+; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-1-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-4-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
new file mode 100644
index 000000000000..15f664c98182
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
@@ -0,0 +1,24 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX700
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+
+declare i64 @llvm.readsteadycounter() #0
+
+; GCN-LABEL: {{^}}test_readsteadycounter:
+; GFX700: s_mov_b32 s[[REG:[0-9]+]], 0
+; GFX900: s_memrealtime s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; GFX900: s_memrealtime s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; GFX1100: s_sendmsg_rtn_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX1100: s_sendmsg_rtn_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], sendmsg(MSG_RTN_GET_REALTIME)
+define amdgpu_kernel void @test_readsteadycounter(ptr addrspace(1) %out) #0 {
+ %cycle0 = call i64 @llvm.readsteadycounter()
+ store volatile i64 %cycle0, ptr addrspace(1) %out
+
+ %cycle1 = call i64 @llvm.readsteadycounter()
+ store volatile i64 %cycle1, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir
index 23c6afdca6fb..e4cbae66d47f 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir
@@ -39,7 +39,7 @@ body: |
; GFX908-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr7, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -148,74 +148,74 @@ body: |
; GFX908-NEXT: $vgpr35 = V_MOV_B32_e32 killed $sgpr4, implicit $exec
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr35, implicit $exec, implicit $exec
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr10, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr12, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr13, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr16, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr17, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr20, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr21, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr22, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr23, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr24, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr25, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr26, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr27, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr28, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr29, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr30, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr31, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr34, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr35, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr36, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr37, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr38, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr39, implicit $exec, implicit $exec
- ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5)
+ ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5)
; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr40, implicit $exec, implicit $exec
; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35
- ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+ ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec
; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
@@ -287,39 +287,39 @@ body: |
; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec, implicit $exec
; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
- ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
- ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
- ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
- ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
- ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
- ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
- ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
- ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
- ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
- ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
- ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
- ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
- ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
- ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
- ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
- ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
- ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
- ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
- ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5)
- ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5)
- ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5)
- ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
- ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
- ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
- ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
- ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
- ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
- ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
- ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
- ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
- ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
- ; GFX908-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
- ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
+ ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5)
+ ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5)
+ ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5)
+ ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
+ ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
+ ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
+ ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
+ ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
+ ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
+ ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
+ ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
+ ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
+ ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
+ ; GFX908-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
+ ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
; GFX908-NEXT: S_NOP 0, implicit renamable $agpr0, implicit killed renamable $vgpr1, implicit killed renamable $vgpr2, implicit killed renamable $vgpr3, implicit killed renamable $vgpr4, implicit killed renamable $vgpr5, implicit killed renamable $vgpr6, implicit killed renamable $vgpr7, implicit killed renamable $vgpr8, implicit killed renamable $vgpr9, implicit killed renamable $vgpr10, implicit killed renamable $vgpr11, implicit killed renamable $vgpr12, implicit killed renamable $vgpr13, implicit killed renamable $vgpr14, implicit killed renamable $vgpr15, implicit killed renamable $vgpr16, implicit killed renamable $vgpr17, implicit killed renamable $vgpr18, implicit killed renamable $vgpr19, implicit killed renamable $vgpr20, implicit killed renamable $vgpr21, implicit killed renamable $vgpr22, implicit killed renamable $vgpr23, implicit killed renamable $vgpr24, implicit killed renamable $vgpr25, implicit killed renamable $vgpr26, implicit killed renamable $vgpr27, implicit killed renamable $vgpr28, implicit killed renamable $vgpr29, implicit killed renamable $vgpr30, implicit killed renamable $vgpr31, implicit killed renamable $vgpr32, implicit killed renamable $vgpr33, implicit killed renamable $vgpr34
; GFX908-NEXT: S_ENDPGM 0, implicit killed renamable $agpr0
%v0:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 272daac3e03c..4cc469b9b49a 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -73,14 +73,14 @@
; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
-; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
-; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
-; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
+; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x200, [[CLAMP_IDX]]
+; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
+; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
-; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128
+; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
define amdgpu_ps float @ps_main(i32 %idx) {
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 573fa7ac0e84..242ecd89b5c7 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -15,10 +15,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
@@ -106,7 +106,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: v_writelane_b32 v1, s22, 62
; GCN-NEXT: v_writelane_b32 v1, s23, 63
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[6:7]
@@ -115,7 +115,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: v_writelane_b32 v0, s6, 0
; GCN-NEXT: v_writelane_b32 v0, s7, 1
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -123,7 +123,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v1, 0
@@ -143,7 +143,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: v_readlane_b32 s18, v1, 14
; GCN-NEXT: v_readlane_b32 s19, v1, 15
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
@@ -213,10 +213,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index 059eb6dad2c3..f19b0a5f53e9 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -68,74 +68,74 @@ body: |
; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 28, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -145,12 +145,12 @@ body: |
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -168,12 +168,12 @@ body: |
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 96, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -207,16 +207,16 @@ body: |
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
; GCN64-MUBUF-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 262144, implicit-def dead $scc
; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, killed $sgpr2, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
;
; GCN32-MUBUF-LABEL: name: check_spill
@@ -232,74 +232,74 @@ body: |
; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 28, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -309,12 +309,12 @@ body: |
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -332,12 +332,12 @@ body: |
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 96, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -371,16 +371,16 @@ body: |
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
; GCN32-MUBUF-NEXT: $sgpr1 = S_ADD_I32 $sgpr33, 131072, implicit-def dead $scc
; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, killed $sgpr1, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
;
; GCN64-FLATSCR-LABEL: name: check_spill
@@ -392,74 +392,74 @@ body: |
; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 0, undef $vgpr0
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
@@ -469,12 +469,12 @@ body: |
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
@@ -492,12 +492,12 @@ body: |
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr26, 14, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr27, 15, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 96, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
@@ -531,16 +531,16 @@ body: |
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr12, 0, undef $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 4096, implicit-def dead $scc
; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, killed $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, align 4096, addrspace 5)
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
renamable $sgpr12 = IMPLICIT_DEF
SI_SPILL_S32_SAVE killed $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
@@ -626,52 +626,52 @@ body: |
; GCN64-MUBUF-NEXT: $sgpr29 = S_ADDC_U32 $sgpr29, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13
; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14
; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 28, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2
; GCN64-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN64-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2
; GCN64-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3
; GCN64-MUBUF-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -681,11 +681,11 @@ body: |
; GCN64-MUBUF-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
; GCN64-MUBUF-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6
; GCN64-MUBUF-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 96, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -703,11 +703,11 @@ body: |
; GCN64-MUBUF-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13
; GCN64-MUBUF-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14
; GCN64-MUBUF-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-MUBUF-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -741,15 +741,15 @@ body: |
; GCN64-MUBUF-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29
; GCN64-MUBUF-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30
; GCN64-MUBUF-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 262144, implicit-def dead $scc
; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, killed $sgpr2, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, align 4096, addrspace 5)
; GCN64-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
- ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
;
; GCN32-MUBUF-LABEL: name: check_reload
@@ -764,52 +764,52 @@ body: |
; GCN32-MUBUF-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13
; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14
; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 28, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2
; GCN32-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN32-MUBUF-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2
; GCN32-MUBUF-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3
; GCN32-MUBUF-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -819,11 +819,11 @@ body: |
; GCN32-MUBUF-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
; GCN32-MUBUF-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6
; GCN32-MUBUF-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 96, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN32-MUBUF-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -841,11 +841,11 @@ body: |
; GCN32-MUBUF-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13
; GCN32-MUBUF-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14
; GCN32-MUBUF-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN32-MUBUF-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -879,15 +879,15 @@ body: |
; GCN32-MUBUF-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29
; GCN32-MUBUF-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30
; GCN32-MUBUF-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
- ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr1 = S_ADD_I32 $sgpr33, 131072, implicit-def dead $scc
; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, killed $sgpr1, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, align 4096, addrspace 5)
; GCN32-MUBUF-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
- ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
;
; GCN64-FLATSCR-LABEL: name: check_reload
@@ -898,52 +898,52 @@ body: |
; GCN64-FLATSCR-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13
; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14
; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 2
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2
; GCN64-FLATSCR-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 3
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
; GCN64-FLATSCR-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 2
; GCN64-FLATSCR-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 3
; GCN64-FLATSCR-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 4
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -953,11 +953,11 @@ body: |
; GCN64-FLATSCR-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 5
; GCN64-FLATSCR-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 6
; GCN64-FLATSCR-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 7
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
; GCN64-FLATSCR-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -975,11 +975,11 @@ body: |
; GCN64-FLATSCR-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 13
; GCN64-FLATSCR-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 14
; GCN64-FLATSCR-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 15
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
; GCN64-FLATSCR-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
@@ -1013,15 +1013,15 @@ body: |
; GCN64-FLATSCR-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 29
; GCN64-FLATSCR-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 30
; GCN64-FLATSCR-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 31
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
- ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr2 = S_ADD_I32 $sgpr33, 4096, implicit-def dead $scc
; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, align 4096, addrspace 5)
; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
- ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index e67b5e4a1f3c..c5a5a5209f54 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -8,7 +8,7 @@
; Make sure we are handling hazards correctly.
; SGPR: v_mov_b32_e32 v0, vcc_lo
; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1
-; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload
+; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; 4-byte Folded Reload
; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]]
; SGPR-NEXT: s_waitcnt vmcnt(0)
; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 5871a787c983..c9413b61758d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -75,31 +75,31 @@ use:
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
+; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
+; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
+; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
+; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
+; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
+; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
+; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
+; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
+; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload
+; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
+; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
+; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
-; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:16 ; 4-byte Folded Reload
+; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off
; GCN: ScratchSize: 20
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 4a13a746805e..f192f25b2388 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -17,7 +17,7 @@
; TOVMEM: s_mov_b64 [[COPY_EXEC:s\[[0-9]+:[0-9]+\]]], exec
; TOVMEM: s_mov_b64 exec, 1
; TOVMEM: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
-; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Spill
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Spill
; TOVMEM: s_mov_b64 exec, [[COPY_EXEC]]
; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]
@@ -26,7 +26,7 @@
; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], [[M0_LANE]]
; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
-; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Reload
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Reload
; TOVMEM: s_waitcnt vmcnt(0)
; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0
; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 7ad3520376ac..baca66a287cb 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -11,14 +11,14 @@ define amdgpu_kernel void @test_inst_offset_kernel() {
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
@@ -27,16 +27,16 @@ define amdgpu_kernel void @test_inst_offset_kernel() {
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
-; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
+; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
+; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8
+; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
@@ -277,19 +277,19 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
+; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; v[0:1]
@@ -301,16 +301,16 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
+; FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
+; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
+; FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 1458a937037b..bea2e6d4b45a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10315,8 +10315,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
-; GFX6-NEXT: s_mov_b32 s34, 0x84800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s34, 0x84800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10351,8 +10351,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
-; GFX6-NEXT: s_mov_b32 s34, 0x85000
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s34, 0x85000
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10387,8 +10387,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
-; GFX6-NEXT: s_mov_b32 s34, 0x85800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s34, 0x85800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10431,8 +10431,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
-; GFX6-NEXT: s_mov_b32 s36, 0x86000
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s36, 0x86000
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10449,8 +10449,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 15
-; GFX6-NEXT: s_mov_b32 s44, 0x86800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s44, 0x86800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10463,8 +10463,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 s[44:45], exec
; GFX6-NEXT: s_mov_b64 exec, 3
-; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10494,8 +10494,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_or_b64 exec, exec, vcc
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b64 exec, 15
-; GFX6-NEXT: s_mov_b32 s6, 0x80400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s6, 0x80400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10509,8 +10509,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1]
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b64 exec, 15
-; GFX6-NEXT: s_mov_b32 s6, 0x80800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_mov_b32 s6, 0x80800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 3892ceb41895..537aca12bdb7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -50,28 +50,28 @@ body: |
; GFX9-NEXT: $vcc = IMPLICIT_DEF
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc
- ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX9-NEXT: $vcc = IMPLICIT_DEF
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc
- ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
- ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX9-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc
; GFX9-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
- ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
;
; GFX10-LABEL: name: check_vcc
@@ -87,28 +87,28 @@ body: |
; GFX10-NEXT: $vcc = IMPLICIT_DEF
; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc
- ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX10-NEXT: $vcc = IMPLICIT_DEF
; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc
- ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
- ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
- ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX10-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc
; GFX10-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
- ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
;
; GFX11-LABEL: name: check_vcc
@@ -118,28 +118,28 @@ body: |
; GFX11-NEXT: $vcc = IMPLICIT_DEF
; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc
- ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX11-NEXT: $vcc = IMPLICIT_DEF
; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+ ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc
- ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
- ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
- ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
- ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GFX11-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc
; GFX11-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
- ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+ ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
$vcc = IMPLICIT_DEF
SI_SPILL_S64_SAVE $vcc, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 37209335fc74..eb211f779fb3 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -8,6 +8,9 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, 3
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -56,6 +59,9 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 3
+; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -101,6 +107,8 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+ %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
+ store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 128, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
ret void
@@ -111,6 +119,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, 3
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -119,7 +130,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel stackrealign_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 8
+; VI-NEXT: .amdhsa_private_segment_fixed_size 12
; VI-NEXT: .amdhsa_kernarg_size 0
; VI-NEXT: .amdhsa_user_sgpr_count 6
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -159,6 +170,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 3
+; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -167,7 +181,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel stackrealign_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 8
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12
; GFX9-NEXT: .amdhsa_kernarg_size 0
; GFX9-NEXT: .amdhsa_user_sgpr_count 6
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -204,6 +218,8 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+ %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
+ store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
ret void
@@ -214,6 +230,9 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, 3
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -262,6 +281,9 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 3
+; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -307,6 +329,8 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+ %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
+ store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
index b1a939d7aa99..e378a83cff50 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
@@ -3,8 +3,8 @@
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1
-; ERROR: error: <unknown>:0:0: stack frame size (131061) exceeds limit (131056) in function 'stack_size_limit_wave64'
-; GCN: ; ScratchSize: 131061
+; ERROR: error: <unknown>:0:0: stack frame size (131064) exceeds limit (131056) in function 'stack_size_limit_wave64'
+; GCN: ; ScratchSize: 131064
define amdgpu_kernel void @stack_size_limit_wave64() #0 {
entry:
%alloca = alloca [131057 x i8], align 1, addrspace(5)
@@ -12,8 +12,8 @@ entry:
ret void
}
-; ERROR: error: <unknown>:0:0: stack frame size (262117) exceeds limit (262112) in function 'stack_size_limit_wave32'
-; GCN: ; ScratchSize: 262117
+; ERROR: error: <unknown>:0:0: stack frame size (262120) exceeds limit (262112) in function 'stack_size_limit_wave32'
+; GCN: ; ScratchSize: 262120
define amdgpu_kernel void @stack_size_limit_wave32() #1 {
entry:
%alloca = alloca [262113 x i8], align 1, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index d8db2d531986..8c5b89429bcc 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -878,7 +878,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5
; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9]
; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11]
-; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0
; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4
; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -904,7 +904,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6
; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9]
; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11]
-; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0
; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4
; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -935,10 +935,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5
; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1
; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1
-; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill
+; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19
; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42
-; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4
+; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0
; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21]
; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23]
@@ -1020,7 +1020,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18
; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1
-; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:132 ; 4-byte Folded Reload
+; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1
@@ -1053,10 +1053,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6
; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 1
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill
+; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill
; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21]
; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 42
-; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:4
+; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0
; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE64-O0-NEXT: s_mov_b64 s[0:1], s[24:25]
; WAVE64-O0-NEXT: s_mov_b64 s[2:3], s[26:27]
@@ -1138,7 +1138,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18
; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload
+; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload
; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21]
; WAVE64-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE64-O0-NEXT: v_readlane_b32 s1, v0, 1
@@ -1172,7 +1172,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5
; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42
-; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4
+; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21]
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll
index 4faaf60ef113..89d89a7e9f60 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll
index 2079db73c1e4..bc57c9986592 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll
index 5fa49c53eb9f..51351c32d29a 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-'
; ASM: amdhsa.version:
-; ASM: - 1
+; ASM: - 1
; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll
index 0d0a8d80dfdd..f408cbefaca1 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+'
; ASM: amdhsa.version:
-; ASM: - 1
+; ASM: - 1
; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll
index c29fb1f0adf6..78b337640202 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-'
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll
index 8f6a4ff8639f..d1c98c7da357 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-'
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll
index f24e0b23f52c..adf84db400c1 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+'
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll
index 1493004cd4fb..210b2e84af10 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+'
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll
index f0af6ca86452..44e77a23fba0 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll
index 5501ce92e078..3205dbeb9e66 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-'
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll
index 4cec639436df..6e7c575f1a96 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll
@@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@@ -13,10 +13,9 @@
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+'
; ASM: amdhsa.version:
-; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
-; ASM6: - 3
+; ASM: - 1
+; ASM4: - 1
+; ASM56: - 2
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll
new file mode 100644
index 000000000000..0cb408676552
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll
@@ -0,0 +1,219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s
+
+define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
+; GFX90A-LABEL: test_insert_extract:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT: s_mov_b32 s2, 0
+; GFX90A-NEXT: s_and_b64 vcc, exec, -1
+; GFX90A-NEXT: s_mov_b32 s3, 0
+; GFX90A-NEXT: s_mov_b32 s4, 0
+; GFX90A-NEXT: s_mov_b32 s5, 0
+; GFX90A-NEXT: s_mov_b32 s6, 0
+; GFX90A-NEXT: .LBB0_1: ; %for.body
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
+; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX90A-NEXT: s_cselect_b32 s7, s4, s3
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
+; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
+; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX90A-NEXT: s_cselect_b32 s7, s6, s7
+; GFX90A-NEXT: s_or_b32 s7, s7, s0
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
+; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec
+; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
+; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec
+; GFX90A-NEXT: s_cselect_b32 s6, s7, s6
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
+; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec
+; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
+; GFX90A-NEXT: s_cmp_eq_u32 s1, 0
+; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
+; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
+; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX90A-NEXT: s_cselect_b32 s2, 0, s2
+; GFX90A-NEXT: s_mov_b64 vcc, vcc
+; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1
+; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock
+; GFX90A-NEXT: s_endpgm
+;
+; GFX940-LABEL: test_insert_extract:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NEXT: s_mov_b32 s2, 0
+; GFX940-NEXT: s_and_b64 vcc, exec, -1
+; GFX940-NEXT: s_mov_b32 s3, 0
+; GFX940-NEXT: s_mov_b32 s4, 0
+; GFX940-NEXT: s_mov_b32 s5, 0
+; GFX940-NEXT: s_mov_b32 s6, 0
+; GFX940-NEXT: .LBB0_1: ; %for.body
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_cmp_eq_u32 s1, 1
+; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX940-NEXT: s_cselect_b32 s7, s4, s3
+; GFX940-NEXT: s_cmp_eq_u32 s1, 2
+; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX940-NEXT: s_cselect_b32 s7, s5, s7
+; GFX940-NEXT: s_cmp_eq_u32 s1, 3
+; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX940-NEXT: s_cselect_b32 s7, s6, s7
+; GFX940-NEXT: s_or_b32 s7, s7, s0
+; GFX940-NEXT: s_cmp_eq_u32 s1, 1
+; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec
+; GFX940-NEXT: s_cselect_b32 s4, s7, s4
+; GFX940-NEXT: s_cmp_eq_u32 s1, 3
+; GFX940-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec
+; GFX940-NEXT: s_cselect_b32 s6, s7, s6
+; GFX940-NEXT: s_cmp_eq_u32 s1, 2
+; GFX940-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec
+; GFX940-NEXT: s_cselect_b32 s5, s7, s5
+; GFX940-NEXT: s_cmp_eq_u32 s1, 0
+; GFX940-NEXT: s_cselect_b32 s3, s7, s3
+; GFX940-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
+; GFX940-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX940-NEXT: s_cselect_b32 s2, 0, s2
+; GFX940-NEXT: s_mov_b64 vcc, vcc
+; GFX940-NEXT: s_cbranch_vccnz .LBB0_1
+; GFX940-NEXT: ; %bb.2: ; %DummyReturnBlock
+; GFX940-NEXT: s_endpgm
+;
+; GFX1030-LABEL: test_insert_extract:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1030-NEXT: s_mov_b32 s2, 0
+; GFX1030-NEXT: s_mov_b32 s3, 0
+; GFX1030-NEXT: s_mov_b32 s4, 0
+; GFX1030-NEXT: s_mov_b32 s5, 0
+; GFX1030-NEXT: s_mov_b32 s6, 0
+; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GFX1030-NEXT: .p2align 6
+; GFX1030-NEXT: .LBB0_1: ; %for.body
+; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
+; GFX1030-NEXT: s_cselect_b32 s7, -1, 0
+; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s7, s4, s3
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
+; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
+; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
+; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
+; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s7, s6, s7
+; GFX1030-NEXT: s_or_b32 s7, s7, s0
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
+; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
+; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
+; GFX1030-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s6, s7, s6
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
+; GFX1030-NEXT: s_cselect_b32 s10, -1, 0
+; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
+; GFX1030-NEXT: s_cmp_eq_u32 s1, 0
+; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
+; GFX1030-NEXT: s_or_b32 s7, s10, s8
+; GFX1030-NEXT: s_or_b32 s7, s9, s7
+; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1030-NEXT: s_cselect_b32 s2, 0, s2
+; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1
+; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock
+; GFX1030-NEXT: s_endpgm
+;
+; GFX1100-LABEL: test_insert_extract:
+; GFX1100: ; %bb.0: ; %entry
+; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1100-NEXT: s_mov_b32 s2, 0
+; GFX1100-NEXT: s_mov_b32 s3, 0
+; GFX1100-NEXT: s_mov_b32 s4, 0
+; GFX1100-NEXT: s_mov_b32 s5, 0
+; GFX1100-NEXT: s_mov_b32 s6, 0
+; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GFX1100-NEXT: .p2align 6
+; GFX1100-NEXT: .LBB0_1: ; %for.body
+; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
+; GFX1100-NEXT: s_cselect_b32 s7, -1, 0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s7, s4, s3
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
+; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
+; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
+; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s7, s6, s7
+; GFX1100-NEXT: s_or_b32 s7, s7, s0
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
+; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
+; GFX1100-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s6, s7, s6
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
+; GFX1100-NEXT: s_cselect_b32 s10, -1, 0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
+; GFX1100-NEXT: s_cmp_eq_u32 s1, 0
+; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
+; GFX1100-NEXT: s_or_b32 s7, s10, s8
+; GFX1100-NEXT: s_or_b32 s7, s9, s7
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-NEXT: s_cselect_b32 s2, 0, s2
+; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
+; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock
+; GFX1100-NEXT: s_endpgm
+entry:
+ %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ]
+ %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ]
+ %idxprom = zext i32 %q to i64
+ %e1 = extractelement <4 x i32> %x2, i64 %idxprom
+ %add = or i32 %e1, %p
+ %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom
+ %e3 = extractelement <4 x i32> %x1, i64 %idxprom
+ %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0
+ br label %for.body
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll
index 0878fc689cd4..b08586efe2f2 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-sample.ll
@@ -1,15 +1,13 @@
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: not --crash llc -O0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX90A %s
; RUN: not --crash llc -O0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX940 %s
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100 %s
-; GFX906-LABEL: image_sample_test:
-; GFX906: image_sample_lz
-
-; GFX908-LABEL: image_sample_test:
-; GFX908: image_sample_lz
+; GFX9-LABEL: image_sample_test:
+; GFX9: image_sample_lz
; GFX90A: LLVM ERROR: requested image instruction is not supported on this GPU
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 808f006e7675..137bd0f5d9f1 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -16,33 +16,33 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v0, off offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
@@ -83,16 +83,16 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v3, off offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v3, off ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2
; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
; CHECK-NEXT: s_cbranch_execz .LBB1_2
@@ -101,23 +101,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
@@ -134,7 +138,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
@@ -142,10 +146,6 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
-; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr0
@@ -159,23 +159,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
@@ -192,7 +196,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
@@ -200,17 +204,13 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
-; CHECK-NEXT: s_wait_loadcnt 0x0
-; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
-; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: .LBB1_4: ; %.exit
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 7aaf9452e956..0cabfa9aea0e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -25,7 +25,7 @@
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
-; GCN: ScratchSize: 768
+; GCN: ScratchSize: 640
define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 3a3860d18079..a1d3e2a093fa 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -16,12 +16,12 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4
@@ -32,7 +32,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: v_writelane_b32 v0, s4, 0
; CHECK-NEXT: v_writelane_b32 v0, s5, 1
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
@@ -40,20 +40,20 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: ; %bb.1: ; %bb193
; CHECK-NEXT: .LBB0_2: ; %bb194
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_4
; CHECK-NEXT: ; %bb.3: ; %bb201
-; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, V2@rel32@hi+12
@@ -66,7 +66,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: ; divergent unreachable
; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 78e8ab1b1676..f78b408d7825 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -433,416 +433,416 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_mov_b32_e32 v4, 0
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
@@ -853,494 +853,494 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(12)
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
; GFX906-NEXT: .LBB6_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58
; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1349,42 +1349,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54
; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1393,42 +1393,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50
; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1437,42 +1437,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46
; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1481,42 +1481,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42
; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1525,42 +1525,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38
; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1569,42 +1569,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1613,42 +1613,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1657,42 +1657,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1701,42 +1701,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22
; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1745,42 +1745,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18
; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1789,36 +1789,36 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13
; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -1826,27 +1826,27 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -1854,9 +1854,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1864,8 +1864,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -1874,21 +1874,21 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(3)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -1896,9 +1896,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1906,8 +1906,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -1916,15 +1916,15 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(7)
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: s_waitcnt vmcnt(3)
@@ -1934,9 +1934,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1944,9 +1944,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(2)
; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -1954,8 +1954,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 46ff9a99f29a..95dfb12c8dba 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2035,9 +2035,9 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
-; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4
+; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
@@ -2059,11 +2059,11 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4
+; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 0
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4
+; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 6003d036c65b..47c976d2a5c3 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -124,7 +124,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0
@@ -150,9 +150,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_nop 2
; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: s_not_b64 exec, exec
@@ -165,22 +165,22 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec
; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5
; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6
; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O0-NEXT: ; %bb.1: ; %if
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
@@ -195,10 +195,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: .LBB1_2: ; %merge
; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5
@@ -208,8 +208,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2
; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3
; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -349,7 +349,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2
; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2
@@ -358,7 +358,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
@@ -388,7 +388,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
@@ -415,7 +415,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
@@ -427,7 +427,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
@@ -584,7 +584,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2
; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2
@@ -593,7 +593,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
@@ -624,7 +624,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
@@ -659,7 +659,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
@@ -670,7 +670,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
@@ -994,7 +994,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0
@@ -1020,9 +1020,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_nop 2
; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: s_not_b64 exec, exec
@@ -1035,22 +1035,22 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec
; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5
; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6
; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT: s_cbranch_execz .LBB8_2
; GFX9-O0-NEXT: ; %bb.1: ; %if
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
@@ -1065,10 +1065,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: .LBB8_2: ; %merge
; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5
@@ -1078,8 +1078,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2
; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3
; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1219,7 +1219,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2
; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2
@@ -1228,7 +1228,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
@@ -1258,7 +1258,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
@@ -1285,7 +1285,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
@@ -1297,7 +1297,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
@@ -1454,7 +1454,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2
; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2
@@ -1463,7 +1463,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
@@ -1494,7 +1494,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
@@ -1529,7 +1529,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
@@ -1540,7 +1540,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/fpenv.ll b/llvm/test/CodeGen/ARM/GlobalISel/fpenv.ll
index 3d18a65bd434..f5aea62d4be7 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/fpenv.ll
+++ b/llvm/test/CodeGen/ARM/GlobalISel/fpenv.ll
@@ -74,5 +74,19 @@ entry:
ret void
}
+define void @func_reset_soft() #0 {
+; CHECK-LABEL: func_reset_soft:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: mvn r0, #0
+; CHECK-NEXT: bl fesetenv
+; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: mov pc, lr
+entry:
+ call void @llvm.reset.fpenv()
+ ret void
+}
+
attributes #0 = { nounwind "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll
index 6000b9bc56b9..9ca1b1790a8c 100644
--- a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll
+++ b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll
@@ -42,7 +42,7 @@ define dso_local void @f2() #0 {
; CHECK-NEXT: } // 8-byte Folded Spill
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vsplat(r16)
-; CHECK-NEXT: vmem(r29+#6) = v0.new
+; CHECK-NEXT: vmem(r29+#2) = v0.new
; CHECK-NEXT: } // 128-byte Folded Spill
; CHECK-NEXT: {
; CHECK-NEXT: q0 = vand(v0,r0)
@@ -56,7 +56,7 @@ define dso_local void @f2() #0 {
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vand(q0,r17)
; CHECK-NEXT: r19 = ##g0+128
-; CHECK-NEXT: vmem(r29+#7) = v0.new
+; CHECK-NEXT: vmem(r29+#1) = v0.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r20 = ##g0
@@ -78,15 +78,15 @@ define dso_local void @f2() #0 {
; CHECK-NEXT: vmem(r20+#0) = v30
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmem(r29+#6)
+; CHECK-NEXT: v0 = vmem(r29+#2)
; CHECK-NEXT: } // 128-byte Folded Reload
; CHECK-NEXT: {
; CHECK-NEXT: v1:0.h = vadd(v0.ub,v0.ub)
; CHECK-NEXT: r0 = ##g2
-; CHECK-NEXT: vmem(r29+#1) = v0.new
+; CHECK-NEXT: vmem(r29+#6) = v0.new
; CHECK-NEXT: } // 256-byte Folded Spill
; CHECK-NEXT: {
-; CHECK-NEXT: vmem(r29+#2) = v1
+; CHECK-NEXT: vmem(r29+#7) = v1
; CHECK-NEXT: } // 256-byte Folded Spill
; CHECK-NEXT: {
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r17.ub,#0)
@@ -98,10 +98,10 @@ define dso_local void @f2() #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = ##2147483647
-; CHECK-NEXT: v0 = vmem(r29+#1)
+; CHECK-NEXT: v0 = vmem(r29+#6)
; CHECK-NEXT: } // 256-byte Folded Reload
; CHECK-NEXT: {
-; CHECK-NEXT: v1 = vmem(r29+#2)
+; CHECK-NEXT: v1 = vmem(r29+#7)
; CHECK-NEXT: } // 256-byte Folded Reload
; CHECK-NEXT: {
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r0.ub,#1)
@@ -142,7 +142,7 @@ define dso_local void @f2() #0 {
; CHECK-NEXT: vmem(r20+#0) = v0
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmem(r29+#6)
+; CHECK-NEXT: v0 = vmem(r29+#2)
; CHECK-NEXT: } // 128-byte Folded Reload
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmem(r29+#3)
diff --git a/llvm/test/CodeGen/Hexagon/order-stack-object.ll b/llvm/test/CodeGen/Hexagon/order-stack-object.ll
new file mode 100644
index 000000000000..bdc16e928eae
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/order-stack-object.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon -mattr=+hvxv68,+hvx-length128b < %s | FileCheck %s
+
+; Check that ordering objects on the stack from the largest to the smallest has
+; decreased the space allocated on the stack by 512 Bytes.
+
+; CHECK: allocframe(r29,#2432):raw
+
+define void @test(ptr nocapture readonly %arg, ptr nocapture writeonly %arg1, i32 %arg2) local_unnamed_addr {
+bb:
+ %shl = shl i32 %arg2, 5
+ br label %bb3
+
+bb3:
+ %phi = phi i32 [ 0, %bb ], [ %add13, %bb3 ]
+ %add = add i32 %phi, %shl
+ %sext = sext i32 %add to i64
+ %getelementptr = getelementptr float, ptr %arg, i64 %sext
+ %load = load <32 x float>, ptr %getelementptr, align 4
+ %fmul = fmul <32 x float> %load, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+ %fmul4 = fmul <32 x float> %load, <float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000, float 0x3FE9884620000000>
+ %fmul5 = fmul <32 x float> %load, <float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000, float 0x3FA2444180000000>
+ %fmul6 = fmul <32 x float> %load, %fmul5
+ %fmul7 = fmul <32 x float> %load, %fmul6
+ %fadd = fadd <32 x float> %fmul4, %fmul7
+ %fmul8 = fmul <32 x float> %fadd, <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00>
+ %call = tail call <32 x float> @llvm.exp.v32f32(<32 x float> %fmul8)
+ %fsub = fsub <32 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %call
+ %fadd9 = fadd <32 x float> %call, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %fdiv = fdiv <32 x float> %fsub, %fadd9
+ %fadd10 = fadd <32 x float> %fdiv, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %fmul11 = fmul <32 x float> %fmul, %fadd10
+ %getelementptr12 = getelementptr float, ptr %arg1, i64 %sext
+ store <32 x float> %fmul11, ptr %getelementptr12, align 128
+ %add13 = add nuw nsw i32 %phi, 128
+ %icmp = icmp ult i32 %phi, 8064
+ br i1 %icmp, label %bb3, label %bb14
+
+bb14:
+ ret void
+}
+
+declare <32 x float> @llvm.exp.v32f32(<32 x float>)
diff --git a/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll b/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll
index 8c5b11d5f155..bb9f7cf7e6bc 100644
--- a/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll
+++ b/llvm/test/CodeGen/Hexagon/store-imm-stack-object.ll
@@ -3,10 +3,10 @@
target triple = "hexagon"
; CHECK-LABEL: test1:
-; CHECK-DAG: memw(r29+#4) = ##875770417
+; CHECK-DAG: memw(r29+#12) = ##875770417
; CHECK-DAG: memw(r29+#8) = #51
-; CHECK-DAG: memh(r29+#12) = #50
-; CHECK-DAG: memb(r29+#15) = #49
+; CHECK-DAG: memh(r29+#6) = #50
+; CHECK-DAG: memb(r29+#5) = #49
define void @test1() {
b0:
%v1 = alloca [1 x i8], align 1
@@ -30,9 +30,9 @@ b0:
}
; CHECK-LABEL: test2:
-; CHECK-DAG: memw(r29+#208) = #51
-; CHECK-DAG: memh(r29+#212) = r{{[0-9]+}}
-; CHECK-DAG: memb(r29+#215) = r{{[0-9]+}}
+; CHECK-DAG: memw(r29+#8) = #51
+; CHECK-DAG: memh(r29+#6) = r{{[0-9]+}}
+; CHECK-DAG: memb(r29+#5) = r{{[0-9]+}}
define void @test2() {
b0:
%v1 = alloca [1 x i8], align 1
diff --git a/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir
new file mode 100644
index 000000000000..b3c44affb785
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir
@@ -0,0 +1,212 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -o - %s -mtriple=loongarch64 \
+# RUN: -run-pass=register-coalescer -join-liveintervals=1 -join-splitedges=0 | FileCheck %s
+
+---
+name: foo
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: foo
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $r4, $r5, $r6, $r7, $r8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r7
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r6
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $r5
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $r4
+ ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1
+ ; CHECK-NEXT: [[ORI:%[0-9]+]]:gpr = ORI $r0, 1
+ ; CHECK-NEXT: [[ANDI1:%[0-9]+]]:gpr = ANDI [[COPY2]], 1
+ ; CHECK-NEXT: [[ANDI2:%[0-9]+]]:gpr = ANDI [[COPY1]], 1
+ ; CHECK-NEXT: [[ANDI3:%[0-9]+]]:gpr = ANDI [[COPY]], 1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = COPY $r0
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr = COPY [[COPY5]]
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BEQZ [[ANDI]], %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.9(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: PseudoBR %bb.9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.7(0x7c000000), %bb.6(0x04000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[LD_D:%[0-9]+]]:gpr = LD_D $r0, 8
+ ; CHECK-NEXT: dead [[LD_D1:%[0-9]+]]:gpr = LD_D $r0, 0
+ ; CHECK-NEXT: BNEZ [[ANDI1]], %bb.7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.6:
+ ; CHECK-NEXT: successors: %bb.11(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY $r0
+ ; CHECK-NEXT: PseudoBR %bb.11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.7:
+ ; CHECK-NEXT: successors: %bb.8(0x7c000000), %bb.10(0x04000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BEQZ [[ANDI2]], %bb.10
+ ; CHECK-NEXT: PseudoBR %bb.8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.8:
+ ; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1
+ ; CHECK-NEXT: BEQZ [[ANDI3]], %bb.5
+ ; CHECK-NEXT: PseudoBR %bb.9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.9:
+ ; CHECK-NEXT: successors: %bb.12(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ST_B $r0, [[COPY4]], 0
+ ; CHECK-NEXT: PseudoBR %bb.12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.10:
+ ; CHECK-NEXT: successors: %bb.11(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr = ADDI_D [[COPY6]], 1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr = COPY [[ORI]]
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.11:
+ ; CHECK-NEXT: successors: %bb.12(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ST_D $r0, [[COPY4]], 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.12:
+ ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.1(0x04000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BEQ [[COPY7]], [[ORI]], %bb.2
+ ; CHECK-NEXT: PseudoBR %bb.1
+ bb.0:
+ liveins: $r4, $r5, $r6, $r7, $r8
+
+ %0:gpr = COPY killed $r8
+ %1:gpr = COPY killed $r7
+ %2:gpr = COPY killed $r6
+ %3:gpr = COPY killed $r5
+ %4:gpr = COPY killed $r4
+ %5:gpr = COPY $r0
+ %6:gpr = COPY killed %5
+ %7:gpr = ANDI killed %3, 1
+ %8:gpr = ORI $r0, 1
+ %9:gpr = ANDI killed %2, 1
+ %10:gpr = ANDI killed %1, 1
+ %11:gpr = ANDI killed %0, 1
+ %12:gpr = COPY %6
+ %13:gpr = COPY killed %6
+ %14:gpr = IMPLICIT_DEF
+
+ bb.1:
+ %15:gpr = COPY killed %14
+ %16:gpr = COPY killed %13
+ %17:gpr = COPY killed %12
+ %18:gpr = COPY %17
+ %19:gpr = COPY %16
+ %20:gpr = COPY killed %16
+ %21:gpr = COPY killed %15
+
+ bb.2:
+ successors: %bb.3, %bb.4
+
+ %22:gpr = COPY killed %21
+ %23:gpr = COPY killed %20
+ %24:gpr = COPY killed %19
+ %25:gpr = COPY killed %18
+ BEQZ %7, %bb.4
+
+ bb.3:
+ %26:gpr = COPY killed %24
+ %27:gpr = COPY killed %23
+ PseudoBR %bb.9
+
+ bb.4:
+ %28:gpr = COPY killed %23
+
+ bb.5:
+ successors: %bb.7(0x7c000000), %bb.6(0x04000000)
+
+ %29:gpr = COPY killed %28
+ dead %30:gpr = LD_D $r0, 8
+ dead %31:gpr = LD_D $r0, 0
+ BNEZ %9, %bb.7
+
+ bb.6:
+ %32:gpr = COPY $r0
+ %33:gpr = COPY killed %32
+ %34:gpr = COPY killed %33
+ %35:gpr = COPY killed %22
+ PseudoBR %bb.11
+
+ bb.7:
+ successors: %bb.8(0x7c000000), %bb.10(0x04000000)
+
+ BEQZ %10, %bb.10
+ PseudoBR %bb.8
+
+ bb.8:
+ successors: %bb.9(0x04000000), %bb.5(0x7c000000)
+
+ %36:gpr = ADDI_D killed %29, 1
+ %28:gpr = COPY %36
+ %26:gpr = COPY %36
+ %27:gpr = COPY killed %36
+ BEQZ %11, %bb.5
+ PseudoBR %bb.9
+
+ bb.9:
+ %37:gpr = COPY killed %27
+ %38:gpr = COPY killed %26
+ %39:gpr = COPY $r0
+ ST_B killed %39, %4, 0
+ %40:gpr = COPY killed %25
+ %41:gpr = COPY killed %38
+ %42:gpr = COPY killed %37
+ %43:gpr = COPY killed %22
+ PseudoBR %bb.12
+
+ bb.10:
+ %44:gpr = ADDI_D killed %29, 1
+ %34:gpr = COPY %8
+ %35:gpr = COPY killed %44
+
+ bb.11:
+ %45:gpr = COPY killed %35
+ %46:gpr = COPY killed %34
+ %47:gpr = COPY $r0
+ ST_D killed %47, %4, 0
+ %40:gpr = COPY %45
+ %41:gpr = COPY %46
+ %42:gpr = COPY killed %46
+ %43:gpr = COPY killed %45
+
+ bb.12:
+ successors: %bb.2(0x7c000000), %bb.1(0x04000000)
+
+ %48:gpr = COPY killed %43
+ %49:gpr = COPY killed %42
+ %50:gpr = COPY killed %41
+ %51:gpr = COPY killed %40
+ %12:gpr = COPY %51
+ %13:gpr = COPY %50
+ %14:gpr = COPY %48
+ %18:gpr = COPY killed %51
+ %19:gpr = COPY killed %50
+ %20:gpr = COPY killed %49
+ %21:gpr = COPY killed %48
+ BEQ %17, %8, %bb.2
+ PseudoBR %bb.1
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index abb980661fc1..2588d88b002b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -38,7 +38,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 5
-; CHECK-NEXT: scavengeFI: '%fixed-stack.0'
+; CHECK-NEXT: scavengeFI: '%stack.0'
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
@@ -303,7 +303,7 @@
; CHECK-NEXT: fp64-fp16-output-denormals: true
; CHECK-NEXT: highBitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 5
-; CHECK-NEXT: scavengeFI: '%fixed-stack.0'
+; CHECK-NEXT: scavengeFI: '%stack.0'
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index f2144b8f6fc4..9939366e855c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -38,7 +38,7 @@
; AFTER-PEI-NEXT: fp64-fp16-output-denormals: true
; AFTER-PEI-NEXT: highBitsOf32BitAddress: 0
; AFTER-PEI-NEXT: occupancy: 5
-; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0'
+; AFTER-PEI-NEXT: scavengeFI: '%stack.3'
; AFTER-PEI-NEXT: vgprForAGPRCopy: ''
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
; AFTER-PEI-NEXT: longBranchReservedReg: ''
diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir
index 2de4096764e7..13116329fe1a 100644
--- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir
+++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/branch.mir
@@ -77,7 +77,7 @@ body: |
; MIPS32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
; MIPS32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2
; MIPS32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32-NEXT: J %bb.2, implicit-def dead $at
; MIPS32-NEXT: {{ $}}
; MIPS32-NEXT: bb.1.if.then:
diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir
index b8450fffdb98..6022e7ac4828 100644
--- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir
+++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/jump_table_and_brjt.mir
@@ -73,139 +73,174 @@ jumpTable:
body: |
; MIPS32-LABEL: name: mod4_0_to_11
; MIPS32: bb.0.entry:
- ; MIPS32: successors: %bb.6(0x40000000), %bb.1(0x40000000)
- ; MIPS32: liveins: $a0
- ; MIPS32: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
- ; MIPS32: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7
- ; MIPS32: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3
- ; MIPS32: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2
- ; MIPS32: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1
- ; MIPS32: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0
- ; MIPS32: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535
- ; MIPS32: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0
- ; MIPS32: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]]
- ; MIPS32: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]]
- ; MIPS32: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1
- ; MIPS32: BNE [[ANDi]], $zero, %bb.6, implicit-def $at
- ; MIPS32: bb.1.entry:
- ; MIPS32: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000)
- ; MIPS32: [[LUi:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.0
- ; MIPS32: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2
- ; MIPS32: [[ADDu:%[0-9]+]]:gpr32 = ADDu [[LUi]], [[SLL]]
- ; MIPS32: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32))
- ; MIPS32: PseudoIndirectBranch [[LW]]
- ; MIPS32: bb.2.sw.bb:
- ; MIPS32: $v0 = COPY [[ORi4]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.3.sw.bb1:
- ; MIPS32: $v0 = COPY [[ORi3]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.4.sw.bb2:
- ; MIPS32: $v0 = COPY [[ORi2]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.5.sw.bb3:
- ; MIPS32: $v0 = COPY [[ORi1]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.6.sw.default:
- ; MIPS32: successors: %bb.7(0x80000000)
- ; MIPS32: bb.7.sw.epilog:
- ; MIPS32: successors: %bb.13(0x40000000), %bb.8(0x40000000)
- ; MIPS32: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8
- ; MIPS32: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]]
- ; MIPS32: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]]
- ; MIPS32: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1
- ; MIPS32: BNE [[ANDi1]], $zero, %bb.13, implicit-def $at
- ; MIPS32: bb.8.sw.epilog:
- ; MIPS32: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
- ; MIPS32: [[LUi1:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.1
- ; MIPS32: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2
- ; MIPS32: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LUi1]], [[SLL1]]
- ; MIPS32: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32))
- ; MIPS32: PseudoIndirectBranch [[LW1]]
- ; MIPS32: bb.9.sw.bb4:
- ; MIPS32: $v0 = COPY [[ORi4]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.10.sw.bb5:
- ; MIPS32: $v0 = COPY [[ORi3]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.11.sw.bb6:
- ; MIPS32: $v0 = COPY [[ORi2]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.12.sw.bb7:
- ; MIPS32: $v0 = COPY [[ORi1]]
- ; MIPS32: RetRA implicit $v0
- ; MIPS32: bb.13.sw.default8:
- ; MIPS32: $v0 = COPY [[ADDiu]]
- ; MIPS32: RetRA implicit $v0
+ ; MIPS32-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000)
+ ; MIPS32-NEXT: liveins: $a0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+ ; MIPS32-NEXT: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7
+ ; MIPS32-NEXT: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3
+ ; MIPS32-NEXT: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2
+ ; MIPS32-NEXT: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1
+ ; MIPS32-NEXT: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0
+ ; MIPS32-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535
+ ; MIPS32-NEXT: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0
+ ; MIPS32-NEXT: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]]
+ ; MIPS32-NEXT: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]]
+ ; MIPS32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1
+ ; MIPS32-NEXT: BNE [[ANDi]], $zero, %bb.6, implicit-def dead $at
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.1.entry:
+ ; MIPS32-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000)
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: [[LUi:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.0
+ ; MIPS32-NEXT: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2
+ ; MIPS32-NEXT: [[ADDu:%[0-9]+]]:gpr32 = ADDu [[LUi]], [[SLL]]
+ ; MIPS32-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32))
+ ; MIPS32-NEXT: PseudoIndirectBranch [[LW]]
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.2.sw.bb:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi4]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.3.sw.bb1:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi3]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.4.sw.bb2:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi2]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.5.sw.bb3:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi1]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.6.sw.default:
+ ; MIPS32-NEXT: successors: %bb.7(0x80000000)
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.7.sw.epilog:
+ ; MIPS32-NEXT: successors: %bb.13(0x40000000), %bb.8(0x40000000)
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8
+ ; MIPS32-NEXT: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]]
+ ; MIPS32-NEXT: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]]
+ ; MIPS32-NEXT: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1
+ ; MIPS32-NEXT: BNE [[ANDi1]], $zero, %bb.13, implicit-def dead $at
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.8.sw.epilog:
+ ; MIPS32-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: [[LUi1:%[0-9]+]]:gpr32 = LUi target-flags(mips-abs-hi) %jump-table.1
+ ; MIPS32-NEXT: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2
+ ; MIPS32-NEXT: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LUi1]], [[SLL1]]
+ ; MIPS32-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32))
+ ; MIPS32-NEXT: PseudoIndirectBranch [[LW1]]
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.9.sw.bb4:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi4]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.10.sw.bb5:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi3]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.11.sw.bb6:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi2]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.12.sw.bb7:
+ ; MIPS32-NEXT: $v0 = COPY [[ORi1]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ; MIPS32-NEXT: {{ $}}
+ ; MIPS32-NEXT: bb.13.sw.default8:
+ ; MIPS32-NEXT: $v0 = COPY [[ADDiu]]
+ ; MIPS32-NEXT: RetRA implicit $v0
+ ;
; MIPS32_PIC-LABEL: name: mod4_0_to_11
; MIPS32_PIC: bb.0.entry:
- ; MIPS32_PIC: successors: %bb.6(0x40000000), %bb.1(0x40000000)
- ; MIPS32_PIC: liveins: $a0, $t9, $v0
- ; MIPS32_PIC: [[ADDu:%[0-9]+]]:gpr32 = ADDu $v0, $t9
- ; MIPS32_PIC: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
- ; MIPS32_PIC: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7
- ; MIPS32_PIC: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3
- ; MIPS32_PIC: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2
- ; MIPS32_PIC: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1
- ; MIPS32_PIC: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0
- ; MIPS32_PIC: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535
- ; MIPS32_PIC: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0
- ; MIPS32_PIC: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]]
- ; MIPS32_PIC: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]]
- ; MIPS32_PIC: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1
- ; MIPS32_PIC: BNE [[ANDi]], $zero, %bb.6, implicit-def $at
- ; MIPS32_PIC: bb.1.entry:
- ; MIPS32_PIC: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000)
- ; MIPS32_PIC: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.0 :: (load (s32) from got)
- ; MIPS32_PIC: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2
- ; MIPS32_PIC: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LW]], [[SLL]]
- ; MIPS32_PIC: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32))
- ; MIPS32_PIC: [[ADDu2:%[0-9]+]]:gpr32 = ADDu [[LW1]], [[ADDu]]
- ; MIPS32_PIC: PseudoIndirectBranch [[ADDu2]]
- ; MIPS32_PIC: bb.2.sw.bb:
- ; MIPS32_PIC: $v0 = COPY [[ORi4]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.3.sw.bb1:
- ; MIPS32_PIC: $v0 = COPY [[ORi3]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.4.sw.bb2:
- ; MIPS32_PIC: $v0 = COPY [[ORi2]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.5.sw.bb3:
- ; MIPS32_PIC: $v0 = COPY [[ORi1]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.6.sw.default:
- ; MIPS32_PIC: successors: %bb.7(0x80000000)
- ; MIPS32_PIC: bb.7.sw.epilog:
- ; MIPS32_PIC: successors: %bb.13(0x40000000), %bb.8(0x40000000)
- ; MIPS32_PIC: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8
- ; MIPS32_PIC: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]]
- ; MIPS32_PIC: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]]
- ; MIPS32_PIC: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1
- ; MIPS32_PIC: BNE [[ANDi1]], $zero, %bb.13, implicit-def $at
- ; MIPS32_PIC: bb.8.sw.epilog:
- ; MIPS32_PIC: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
- ; MIPS32_PIC: [[LW2:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.1 :: (load (s32) from got)
- ; MIPS32_PIC: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2
- ; MIPS32_PIC: [[ADDu3:%[0-9]+]]:gpr32 = ADDu [[LW2]], [[SLL1]]
- ; MIPS32_PIC: [[LW3:%[0-9]+]]:gpr32 = LW [[ADDu3]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32))
- ; MIPS32_PIC: [[ADDu4:%[0-9]+]]:gpr32 = ADDu [[LW3]], [[ADDu]]
- ; MIPS32_PIC: PseudoIndirectBranch [[ADDu4]]
- ; MIPS32_PIC: bb.9.sw.bb4:
- ; MIPS32_PIC: $v0 = COPY [[ORi4]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.10.sw.bb5:
- ; MIPS32_PIC: $v0 = COPY [[ORi3]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.11.sw.bb6:
- ; MIPS32_PIC: $v0 = COPY [[ORi2]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.12.sw.bb7:
- ; MIPS32_PIC: $v0 = COPY [[ORi1]]
- ; MIPS32_PIC: RetRA implicit $v0
- ; MIPS32_PIC: bb.13.sw.default8:
- ; MIPS32_PIC: $v0 = COPY [[ADDiu]]
- ; MIPS32_PIC: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: successors: %bb.6(0x40000000), %bb.1(0x40000000)
+ ; MIPS32_PIC-NEXT: liveins: $a0, $t9, $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: [[ADDu:%[0-9]+]]:gpr32 = ADDu $v0, $t9
+ ; MIPS32_PIC-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $a0
+ ; MIPS32_PIC-NEXT: [[ORi:%[0-9]+]]:gpr32 = ORi $zero, 7
+ ; MIPS32_PIC-NEXT: [[ORi1:%[0-9]+]]:gpr32 = ORi $zero, 3
+ ; MIPS32_PIC-NEXT: [[ORi2:%[0-9]+]]:gpr32 = ORi $zero, 2
+ ; MIPS32_PIC-NEXT: [[ORi3:%[0-9]+]]:gpr32 = ORi $zero, 1
+ ; MIPS32_PIC-NEXT: [[ORi4:%[0-9]+]]:gpr32 = ORi $zero, 0
+ ; MIPS32_PIC-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu $zero, 65535
+ ; MIPS32_PIC-NEXT: [[ORi5:%[0-9]+]]:gpr32 = ORi $zero, 0
+ ; MIPS32_PIC-NEXT: [[SUBu:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi5]]
+ ; MIPS32_PIC-NEXT: [[SLTu:%[0-9]+]]:gpr32 = SLTu [[ORi]], [[SUBu]]
+ ; MIPS32_PIC-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[SLTu]], 1
+ ; MIPS32_PIC-NEXT: BNE [[ANDi]], $zero, %bb.6, implicit-def dead $at
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.1.entry:
+ ; MIPS32_PIC-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000)
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.0 :: (load (s32) from got)
+ ; MIPS32_PIC-NEXT: [[SLL:%[0-9]+]]:gpr32 = SLL [[SUBu]], 2
+ ; MIPS32_PIC-NEXT: [[ADDu1:%[0-9]+]]:gpr32 = ADDu [[LW]], [[SLL]]
+ ; MIPS32_PIC-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDu1]], target-flags(mips-abs-lo) %jump-table.0 :: (load (s32))
+ ; MIPS32_PIC-NEXT: [[ADDu2:%[0-9]+]]:gpr32 = ADDu [[LW1]], [[ADDu]]
+ ; MIPS32_PIC-NEXT: PseudoIndirectBranch [[ADDu2]]
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.2.sw.bb:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi4]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.3.sw.bb1:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi3]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.4.sw.bb2:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi2]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.5.sw.bb3:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi1]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.6.sw.default:
+ ; MIPS32_PIC-NEXT: successors: %bb.7(0x80000000)
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.7.sw.epilog:
+ ; MIPS32_PIC-NEXT: successors: %bb.13(0x40000000), %bb.8(0x40000000)
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: [[ORi6:%[0-9]+]]:gpr32 = ORi $zero, 8
+ ; MIPS32_PIC-NEXT: [[SUBu1:%[0-9]+]]:gpr32 = SUBu [[COPY]], [[ORi6]]
+ ; MIPS32_PIC-NEXT: [[SLTu1:%[0-9]+]]:gpr32 = SLTu [[ORi1]], [[SUBu1]]
+ ; MIPS32_PIC-NEXT: [[ANDi1:%[0-9]+]]:gpr32 = ANDi [[SLTu1]], 1
+ ; MIPS32_PIC-NEXT: BNE [[ANDi1]], $zero, %bb.13, implicit-def dead $at
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.8.sw.epilog:
+ ; MIPS32_PIC-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000)
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: [[LW2:%[0-9]+]]:gpr32 = LW [[ADDu]], target-flags(mips-got) %jump-table.1 :: (load (s32) from got)
+ ; MIPS32_PIC-NEXT: [[SLL1:%[0-9]+]]:gpr32 = SLL [[SUBu1]], 2
+ ; MIPS32_PIC-NEXT: [[ADDu3:%[0-9]+]]:gpr32 = ADDu [[LW2]], [[SLL1]]
+ ; MIPS32_PIC-NEXT: [[LW3:%[0-9]+]]:gpr32 = LW [[ADDu3]], target-flags(mips-abs-lo) %jump-table.1 :: (load (s32))
+ ; MIPS32_PIC-NEXT: [[ADDu4:%[0-9]+]]:gpr32 = ADDu [[LW3]], [[ADDu]]
+ ; MIPS32_PIC-NEXT: PseudoIndirectBranch [[ADDu4]]
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.9.sw.bb4:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi4]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.10.sw.bb5:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi3]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.11.sw.bb6:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi2]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.12.sw.bb7:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ORi1]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
+ ; MIPS32_PIC-NEXT: {{ $}}
+ ; MIPS32_PIC-NEXT: bb.13.sw.default8:
+ ; MIPS32_PIC-NEXT: $v0 = COPY [[ADDiu]]
+ ; MIPS32_PIC-NEXT: RetRA implicit $v0
bb.1.entry:
liveins: $a0
diff --git a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir
index 77e5ee2fdcbc..44d31d99efca 100644
--- a/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir
+++ b/llvm/test/CodeGen/Mips/GlobalISel/instruction-select/phi.mir
@@ -80,7 +80,7 @@ body: |
; MIPS32FP32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
; MIPS32FP32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2
; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP32-NEXT: {{ $}}
; MIPS32FP32-NEXT: bb.1.cond.true:
@@ -105,7 +105,7 @@ body: |
; MIPS32FP64-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
; MIPS32FP64-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2
; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP64-NEXT: {{ $}}
; MIPS32FP64-NEXT: bb.1.cond.true:
@@ -166,7 +166,7 @@ body: |
; MIPS32FP32-NEXT: [[ADDiu1:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.1, 0
; MIPS32FP32-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDiu1]], 0 :: (load (s32) from %fixed-stack.1)
; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP32-NEXT: {{ $}}
; MIPS32FP32-NEXT: bb.1.cond.true:
@@ -197,7 +197,7 @@ body: |
; MIPS32FP64-NEXT: [[ADDiu1:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.1, 0
; MIPS32FP64-NEXT: [[LW1:%[0-9]+]]:gpr32 = LW [[ADDiu1]], 0 :: (load (s32) from %fixed-stack.1)
; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP64-NEXT: {{ $}}
; MIPS32FP64-NEXT: bb.1.cond.true:
@@ -259,7 +259,7 @@ body: |
; MIPS32FP32-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
; MIPS32FP32-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2
; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP32-NEXT: {{ $}}
; MIPS32FP32-NEXT: bb.1.cond.true:
@@ -284,7 +284,7 @@ body: |
; MIPS32FP64-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $a1
; MIPS32FP64-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $a2
; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[COPY]], 1
- ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP64-NEXT: {{ $}}
; MIPS32FP64-NEXT: bb.1.cond.true:
@@ -341,7 +341,7 @@ body: |
; MIPS32FP32-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.0, 0
; MIPS32FP32-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDiu]], 0 :: (load (s32) from %fixed-stack.0, align 8)
; MIPS32FP32-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[LW]], 1
- ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP32-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP32-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP32-NEXT: {{ $}}
; MIPS32FP32-NEXT: bb.1.cond.true:
@@ -367,7 +367,7 @@ body: |
; MIPS32FP64-NEXT: [[ADDiu:%[0-9]+]]:gpr32 = ADDiu %fixed-stack.0, 0
; MIPS32FP64-NEXT: [[LW:%[0-9]+]]:gpr32 = LW [[ADDiu]], 0 :: (load (s32) from %fixed-stack.0, align 8)
; MIPS32FP64-NEXT: [[ANDi:%[0-9]+]]:gpr32 = ANDi [[LW]], 1
- ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def $at
+ ; MIPS32FP64-NEXT: BNE [[ANDi]], $zero, %bb.1, implicit-def dead $at
; MIPS32FP64-NEXT: J %bb.2, implicit-def dead $at
; MIPS32FP64-NEXT: {{ $}}
; MIPS32FP64-NEXT: bb.1.cond.true:
diff --git a/llvm/test/CodeGen/NVPTX/alias.ll b/llvm/test/CodeGen/NVPTX/alias.ll
index d5dc3a12029a..cb592ddf7c95 100644
--- a/llvm/test/CodeGen/NVPTX/alias.ll
+++ b/llvm/test/CodeGen/NVPTX/alias.ll
@@ -1,8 +1,10 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %}
define i32 @a() { ret i32 0 }
@b = internal alias i32 (), ptr @a
@c = internal alias i32 (), ptr @a
+@d = internal alias i32 (), ptr @c
define void @foo(i32 %0, ptr %1) { ret void }
@bar = alias i32 (), ptr @foo
@@ -12,8 +14,37 @@ define void @noreturn() #0 {
}
@noreturn_alias = alias i32 (), ptr @noreturn
+define i32 @z() {
+ %val = call i32 @b()
+ ret i32 %val
+}
+
+
attributes #0 = { noreturn }
+; CHECK: .visible .func (.param .b32 func_retval0) b
+; CHECK-NEXT: ()
+; CHECK-NEXT: ;
+
+; CHECK: .visible .func (.param .b32 func_retval0) c
+; CHECK-NEXT: ()
+; CHECK-NEXT: ;
+
+; CHECK: .visible .func (.param .b32 func_retval0) d
+; CHECK-NEXT: ()
+; CHECK-NEXT: ;
+
+; CHECK: .visible .func bar
+; CHECK-NEXT: (
+; CHECK-NEXT: .param .b32 foo_param_0,
+; CHECK-NEXT: .param .b64 foo_param_1
+; CHECK-NEXT: )
+; CHECK-NEXT: ;
+
+; CHECK: .visible .func noreturn_alias
+; CHECK-NEXT: ()
+; CHECK-NEXT: .noreturn;
+
; CHECK: .visible .func (.param .b32 func_retval0) a()
; CHECK: .visible .func foo(
@@ -24,18 +55,13 @@ attributes #0 = { noreturn }
; CHECK: .visible .func noreturn()
; CHECK-NEXT: .noreturn
-; CHECK: .visible .func (.param .b32 func_retval0) b();
-; CHECK-NEXT: .alias b, a;
+; CHECK: .visible .func (.param .b32 func_retval0) z()
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: b,
-; CHECK: .visible .func (.param .b32 func_retval0) c();
-; CHECK-NEXT: .alias c, a;
-; CHECK: .visible .func bar(
-; CHECK-NEXT: .param .b32 foo_param_0,
-; CHECK-NEXT: .param .b64 foo_param_1
-; CHECK-NEXT: );
-; CHECK-NEXT: .alias bar, foo;
-
-; CHECK: .visible .func noreturn_alias()
-; CHECK-NEXT: .noreturn;
-; CHECK-NEXT: .alias noreturn_alias, noreturn;
+; CHECK: .alias b, a;
+; CHECK: .alias c, a;
+; CHECK: .alias d, a;
+; CHECK: .alias bar, foo;
+; CHECK: .alias noreturn_alias, noreturn;
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index ed7dd45ab7b4..92d00f881cc2 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -3,9 +3,9 @@
; CHECK-LABEL: test_v2i8
-; CHECK-DAG: ld.param.u16 [[A:%rs[0-9+]]], [test_v2i8_param_0];
-; CHECK-DAG: cvt.s16.s8 [[E0:%rs[0-9+]]], [[A]];
-; CHECK-DAG: shr.s16 [[E1:%rs[0-9+]]], [[A]], 8;
+; CHECK-DAG: ld.param.u16 [[A:%rs[0-9]+]], [test_v2i8_param_0];
+; CHECK-DAG: cvt.s16.s8 [[E0:%rs[0-9]+]], [[A]];
+; CHECK-DAG: shr.s16 [[E1:%rs[0-9]+]], [[A]], 8;
define i16 @test_v2i8(i16 %a) {
%v = bitcast i16 %a to <2 x i8>
%r0 = extractelement <2 x i8> %v, i64 0
@@ -17,15 +17,15 @@ define i16 @test_v2i8(i16 %a) {
}
; CHECK-LABEL: test_v4i8
-; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_param_0];
-; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
-; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R0]];
-; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
-; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9+]]], [[R1]];
-; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9+]]], [[R2]];
-; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9+]]], [[R3]];
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9]+]], [[R0]];
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
+; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
+; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
+; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
define i16 @test_v4i8(i32 %a) {
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -43,14 +43,14 @@ define i16 @test_v4i8(i32 %a) {
}
; CHECK-LABEL: test_v4i8_s32
-; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_s32_param_0];
-; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
-; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
-; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
-; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
-; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_s32_param_0];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
define i32 @test_v4i8_s32(i32 %a) {
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -68,14 +68,14 @@ define i32 @test_v4i8_s32(i32 %a) {
}
; CHECK-LABEL: test_v4i8_u32
-; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_u32_param_0];
-; CHECK-DAG: bfe.u32 [[R0:%r[0-9+]]], [[R]], 0, 8;
-; CHECK-DAG: bfe.u32 [[R1:%r[0-9+]]], [[R]], 8, 8;
-; CHECK-DAG: bfe.u32 [[R2:%r[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: bfe.u32 [[R3:%r[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
-; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
-; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_u32_param_0];
+; CHECK-DAG: bfe.u32 [[R0:%r[0-9]+]], [[R]], 0, 8;
+; CHECK-DAG: bfe.u32 [[R1:%r[0-9]+]], [[R]], 8, 8;
+; CHECK-DAG: bfe.u32 [[R2:%r[0-9]+]], [[R]], 16, 8;
+; CHECK-DAG: bfe.u32 [[R3:%r[0-9]+]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
define i32 @test_v4i8_u32(i32 %a) {
%v = bitcast i32 %a to <4 x i8>
%r0 = extractelement <4 x i8> %v, i64 0
@@ -95,23 +95,25 @@ define i32 @test_v4i8_u32(i32 %a) {
; CHECK-LABEL: test_v8i8
-; CHECK: ld.param.u64 [[R:%rd[0-9+]]], [test_v8i8_param_0];
-; CHECK-DAG: cvt.s8.s64 [[E0:%rs[0-9+]]], [[R]];
-; Element 1 is still extracted by trunc, shr 8, not sure why.
-; CHECK-DAG: cvt.u16.u64 [[R01:%rs[0-9+]]], [[R]];
-; CHECK-DAG: shr.s16 [[E1:%rs[0-9+]]], [[R01]], 8;
-; CHECK-DAG: bfe.s64 [[RD2:%rd[0-9+]]], [[R]], 16, 8;
-; CHECK-DAG: cvt.s8.s64 [[E2:%rs[0-9+]]], [[RD2]];
-; CHECK-DAG: bfe.s64 [[RD3:%rd[0-9+]]], [[R]], 24, 8;
-; CHECK-DAG: cvt.s8.s64 [[E3:%rs[0-9+]]], [[RD3]];
-; CHECK-DAG: bfe.s64 [[RD4:%rd[0-9+]]], [[R]], 32, 8;
-; CHECK-DAG: cvt.s8.s64 [[E4:%rs[0-9+]]], [[RD4]];
-; CHECK-DAG: bfe.s64 [[RD5:%rd[0-9+]]], [[R]], 40, 8;
-; CHECK-DAG: cvt.s8.s64 [[E5:%rs[0-9+]]], [[RD5]];
-; CHECK-DAG: bfe.s64 [[RD6:%rd[0-9+]]], [[R]], 48, 8;
-; CHECK-DAG: cvt.s8.s64 [[E6:%rs[0-9+]]], [[RD6]];
-; CHECK-DAG: bfe.s64 [[RD7:%rd[0-9+]]], [[R]], 56, 8;
-; CHECK-DAG: cvt.s8.s64 [[E7:%rs[0-9+]]], [[RD7]];
+; CHECK: ld.param.u64 [[R:%rd[0-9]+]], [test_v8i8_param_0];
+; CHECK-DAG: cvt.u32.u64 [[R00:%r[0-9]+]], [[R]];
+; CHECK-DAG: { .reg .b32 tmp; mov.b64 {tmp, [[R01:%r[0-9]+]]}, [[R]]; }
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R00]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R00]], 8, 8;
+; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R00]], 16, 8;
+; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
+; CHECK-DAG: bfe.s32 [[R4:%r[0-9]+]], [[R00]], 24, 8;
+; CHECK-DAG: cvt.s8.s32 [[E4:%rs[0-9]+]], [[R4]];
+; CHECK-DAG: bfe.s32 [[R5:%r[0-9]+]], [[R01]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E5:%rs[0-9]+]], [[R5]];
+; CHECK-DAG: bfe.s32 [[R6:%r[0-9]+]], [[R01]], 8, 8;
+; CHECK-DAG: cvt.s8.s32 [[E6:%rs[0-9]+]], [[R6]];
+; CHECK-DAG: bfe.s32 [[R7:%r[0-9]+]], [[R01]], 16, 8;
+; CHECK-DAG: cvt.s8.s32 [[E7:%rs[0-9]+]], [[R7]];
+; CHECK-DAG: bfe.s32 [[R8:%r[0-9]+]], [[R01]], 24, 8;
+; CHECK-DAG: cvt.s8.s32 [[E8:%rs[0-9]+]], [[R8]];
define i16 @test_v8i8(i64 %a) {
%v = bitcast i64 %a to <8 x i8>
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 1ec68b4a271b..6895699a1dfe 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -106,29 +106,29 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
@@ -147,21 +147,21 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
@@ -180,21 +180,21 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
@@ -214,29 +214,29 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
@@ -251,31 +251,39 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smax(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<27>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: setp.gt.s32 %p1, %r3, %r4;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: setp.gt.s32 %p2, %r5, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: setp.gt.s32 %p3, %r7, %r8;
-; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: setp.gt.s32 %p4, %r9, %r10;
-; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
-; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3;
+; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5;
+; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7;
+; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4;
+; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2;
+; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8;
+; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1;
+; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
; CHECK-NEXT: ret;
%cmp = icmp sgt <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -291,24 +299,24 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: setp.hi.u32 %p1, %r3, %r4;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: setp.hi.u32 %p2, %r5, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: setp.hi.u32 %p3, %r7, %r8;
-; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: setp.hi.u32 %p4, %r9, %r10;
-; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
+; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9;
+; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1;
; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
; CHECK-NEXT: ret;
@@ -321,31 +329,39 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smin(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<27>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: setp.le.s32 %p1, %r3, %r4;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: setp.le.s32 %p2, %r5, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: setp.le.s32 %p3, %r7, %r8;
-; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: setp.le.s32 %p4, %r9, %r10;
-; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
-; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3;
+; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5;
+; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7;
+; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4;
+; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2;
+; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8;
+; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1;
+; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
; CHECK-NEXT: ret;
%cmp = icmp sle <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -361,24 +377,24 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: setp.ls.u32 %p1, %r3, %r4;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: setp.ls.u32 %p2, %r5, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: setp.ls.u32 %p3, %r7, %r8;
-; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: setp.ls.u32 %p4, %r9, %r10;
-; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
+; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9;
+; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1;
; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
; CHECK-NEXT: ret;
@@ -397,27 +413,27 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2];
; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0];
-; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8;
; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8;
; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8;
; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10;
-; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8;
; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8;
; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
-; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8;
; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
-; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8;
+; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8;
; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
@@ -437,27 +453,27 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2];
; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0];
-; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8;
; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8;
; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8;
; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10;
-; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8;
; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8;
; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
-; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8;
; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
-; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8;
+; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8;
; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
@@ -476,29 +492,29 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
@@ -732,7 +748,7 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) {
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
; CHECK-NEXT: ld.u32 %r1, [%rd1];
; CHECK-NEXT: st.u16 [%rd2], %r1;
-; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
; CHECK-NEXT: st.u8 [%rd2+2], %r3;
; CHECK-NEXT: ret;
%t1 = load <3 x i8>, ptr %a
@@ -920,31 +936,31 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2];
; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0];
-; CHECK-NEXT: bfe.s32 %r5, %r4, 24, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r3, 24, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8;
; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r4, 16, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r3, 16, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8;
; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r4, 8, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8;
; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9;
-; CHECK-NEXT: bfe.s32 %r11, %r4, 0, 8;
-; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8;
; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11;
-; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4;
-; CHECK-NEXT: bfe.s32 %r16, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r17, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8;
; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3;
; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8;
-; CHECK-NEXT: bfe.s32 %r20, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r21, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8;
; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8;
-; CHECK-NEXT: bfe.s32 %r24, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r25, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8;
; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1;
; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27;
@@ -965,17 +981,17 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
-; CHECK-NEXT: bfe.s32 %r11, %r10, 0, 8;
-; CHECK-NEXT: bfe.s32 %r12, %r9, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r10, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r9, 0, 8;
; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11;
-; CHECK-NEXT: bfe.s32 %r13, %r10, 8, 8;
-; CHECK-NEXT: bfe.s32 %r14, %r9, 8, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r10, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r9, 8, 8;
; CHECK-NEXT: setp.ne.u32 %p2, %r14, %r13;
-; CHECK-NEXT: bfe.s32 %r15, %r10, 16, 8;
-; CHECK-NEXT: bfe.s32 %r16, %r9, 16, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r10, 16, 8;
+; CHECK-NEXT: bfe.u32 %r16, %r9, 16, 8;
; CHECK-NEXT: setp.ne.u32 %p3, %r16, %r15;
-; CHECK-NEXT: bfe.s32 %r17, %r10, 24, 8;
-; CHECK-NEXT: bfe.s32 %r18, %r9, 24, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r10, 24, 8;
+; CHECK-NEXT: bfe.u32 %r18, %r9, 24, 8;
; CHECK-NEXT: setp.ne.u32 %p4, %r18, %r17;
; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4;
; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3;
@@ -1004,19 +1020,19 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9;
; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8;
; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7;
-; CHECK-NEXT: bfe.s32 %r11, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r12, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8;
; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4;
-; CHECK-NEXT: bfe.s32 %r14, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r15, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8;
; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3;
; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8;
-; CHECK-NEXT: bfe.s32 %r18, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r19, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8;
; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2;
; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8;
-; CHECK-NEXT: bfe.s32 %r22, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r23, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8;
; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1;
; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
@@ -1091,16 +1107,16 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0];
-; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
; CHECK-NEXT: and.b64 %rd2, %rd1, 255;
-; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
; CHECK-NEXT: and.b64 %rd4, %rd3, 255;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT: cvt.u64.u32 %rd5, %r4;
; CHECK-NEXT: and.b64 %rd6, %rd5, 255;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
; CHECK-NEXT: cvt.u64.u32 %rd7, %r5;
; CHECK-NEXT: and.b64 %rd8, %rd7, 255;
; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd6};
@@ -1424,17 +1440,17 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
; CHECK-NEXT: ld.u32 %r1, [%rd1];
; CHECK-NEXT: ld.u32 %r2, [%rd2];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4;
; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 7e45b1feb7fd..e16028074f41 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
; CHECK-LABEL: test_fabsf(
define float @test_fabsf(float %f) {
@@ -151,6 +151,28 @@ define i64 @test_globaltimer() {
ret i64 %ret
}
+; CHECK-LABEL: test_cyclecounter
+define i64 @test_cyclecounter() {
+; CHECK: mov.u64 %r{{.*}}, %clock64;
+ %a = tail call i64 @llvm.readcyclecounter()
+; CHECK: mov.u64 %r{{.*}}, %clock64;
+ %b = tail call i64 @llvm.readcyclecounter()
+ %ret = add i64 %a, %b
+; CHECK: ret
+ ret i64 %ret
+}
+
+; CHECK-LABEL: test_steadycounter
+define i64 @test_steadycounter() {
+; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+ %a = tail call i64 @llvm.readsteadycounter()
+; CHECK: mov.u64 %r{{.*}}, %globaltimer;
+ %b = tail call i64 @llvm.readsteadycounter()
+ %ret = add i64 %a, %b
+; CHECK: ret
+ ret i64 %ret
+}
+
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.nvvm.sqrt.f(float)
@@ -166,3 +188,5 @@ declare i32 @llvm.nvvm.read.ptx.sreg.clock()
declare i64 @llvm.nvvm.read.ptx.sreg.clock64()
declare void @llvm.nvvm.exit()
declare i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
+declare i64 @llvm.readcyclecounter()
+declare i64 @llvm.readsteadycounter()
diff --git a/llvm/test/CodeGen/NVPTX/nanosleep.ll b/llvm/test/CodeGen/NVPTX/nanosleep.ll
index 1b2a7bf9476c..a28dabe771b0 100644
--- a/llvm/test/CodeGen/NVPTX/nanosleep.ll
+++ b/llvm/test/CodeGen/NVPTX/nanosleep.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=nvptx64 -O2 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
declare void @llvm.nvvm.nanosleep(i32)
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
new file mode 100644
index 000000000000..0088d6c64205
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_52
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 -O0 | FileCheck %s --check-prefix=SM_70
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx72 -O0 | FileCheck %s --check-prefix=SM_90
+
+@.str = private unnamed_addr constant [12 x i8] c"__CUDA_ARCH\00"
+@.str1 = constant [11 x i8] c"__CUDA_FTZ\00"
+
+declare i32 @__nvvm_reflect(ptr)
+
+; SM_52: .visible .func (.param .b32 func_retval0) foo()
+; SM_52: mov.b32 %[[REG:.+]], 3;
+; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: ret;
+;
+; SM_70: .visible .func (.param .b32 func_retval0) foo()
+; SM_70: mov.b32 %[[REG:.+]], 2;
+; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: ret;
+;
+; SM_90: .visible .func (.param .b32 func_retval0) foo()
+; SM_90: mov.b32 %[[REG:.+]], 1;
+; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: ret;
+define i32 @foo() {
+entry:
+ %call = call i32 @__nvvm_reflect(ptr @.str)
+ %cmp = icmp uge i32 %call, 900
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %return
+
+if.else:
+ %call1 = call i32 @__nvvm_reflect(ptr @.str)
+ %cmp2 = icmp uge i32 %call1, 700
+ br i1 %cmp2, label %if.then3, label %if.else4
+
+if.then3:
+ br label %return
+
+if.else4:
+ %call5 = call i32 @__nvvm_reflect(ptr @.str)
+ %cmp6 = icmp uge i32 %call5, 520
+ br i1 %cmp6, label %if.then7, label %if.else8
+
+if.then7:
+ br label %return
+
+if.else8:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 1, %if.then ], [ 2, %if.then3 ], [ 3, %if.then7 ], [ 4, %if.else8 ]
+ ret i32 %retval.0
+}
+
+; SM_52: .visible .func (.param .b32 func_retval0) bar()
+; SM_52: mov.b32 %[[REG:.+]], 2;
+; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: ret;
+;
+; SM_70: .visible .func (.param .b32 func_retval0) bar()
+; SM_70: mov.b32 %[[REG:.+]], 1;
+; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: ret;
+;
+; SM_90: .visible .func (.param .b32 func_retval0) bar()
+; SM_90: mov.b32 %[[REG:.+]], 1;
+; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: ret;
+define i32 @bar() {
+entry:
+ %call = call i32 @__nvvm_reflect(ptr @.str)
+ %cmp = icmp uge i32 %call, 700
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %if.end
+
+if.else:
+ br label %if.end
+
+if.end:
+ %x = phi i32 [ 1, %if.then ], [ 2, %if.else ]
+ ret i32 %x
+}
+
+; SM_52-NOT: valid;
+; SM_70: valid;
+; SM_90: valid;
+define void @baz() {
+entry:
+ %call = call i32 @__nvvm_reflect(ptr @.str)
+ %cmp = icmp uge i32 %call, 700
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ call void asm sideeffect "valid;\0A", ""()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; SM_52: .visible .func (.param .b32 func_retval0) qux()
+; SM_52: mov.b32 %[[REG:.+]], 3;
+; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: ret;
+;
+; SM_70: .visible .func (.param .b32 func_retval0) qux()
+; SM_70: mov.b32 %[[REG:.+]], 2;
+; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: ret;
+;
+; SM_90: .visible .func (.param .b32 func_retval0) qux()
+; SM_90: mov.b32 %[[REG:.+]], 1;
+; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: ret;
+define i32 @qux() {
+entry:
+ %call = call i32 @__nvvm_reflect(ptr noundef @.str)
+ switch i32 %call, label %sw.default [
+ i32 900, label %sw.bb
+ i32 700, label %sw.bb1
+ i32 520, label %sw.bb2
+ ]
+
+sw.bb:
+ br label %return
+
+sw.bb1:
+ br label %return
+
+sw.bb2:
+ br label %return
+
+sw.default:
+ br label %return
+
+return:
+ %retval = phi i32 [ 4, %sw.default ], [ 3, %sw.bb2 ], [ 2, %sw.bb1 ], [ 1, %sw.bb ]
+ ret i32 %retval
+}
+
+; SM_52: .visible .func (.param .b32 func_retval0) phi()
+; SM_52: mov.f32 %[[REG:.+]], 0f00000000;
+; SM_52-NEXT: st.param.f32 [func_retval0+0], %[[REG]];
+; SM_52-NEXT: ret;
+; SM_70: .visible .func (.param .b32 func_retval0) phi()
+; SM_70: mov.f32 %[[REG:.+]], 0f00000000;
+; SM_70-NEXT: st.param.f32 [func_retval0+0], %[[REG]];
+; SM_70-NEXT: ret;
+; SM_90: .visible .func (.param .b32 func_retval0) phi()
+; SM_90: mov.f32 %[[REG:.+]], 0f00000000;
+; SM_90-NEXT: st.param.f32 [func_retval0+0], %[[REG]];
+; SM_90-NEXT: ret;
+define float @phi() {
+entry:
+ %0 = call i32 @__nvvm_reflect(ptr @.str)
+ %1 = icmp eq i32 %0, 0
+ br i1 %1, label %if.then, label %if.else
+
+if.then:
+ br label %if.else
+
+if.else:
+ %.08 = phi float [ 0.000000e+00, %if.then ], [ 1.000000e+00, %entry ]
+ %4 = fcmp ogt float %.08, 0.000000e+00
+ br i1 %4, label %exit, label %if.exit
+
+if.exit:
+ br label %exit
+
+exit:
+ ret float 0.000000e+00
+}
+
+; SM_52: .visible .func (.param .b32 func_retval0) prop()
+; SM_52: mov.b32 %[[REG:.+]], 3;
+; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: ret;
+;
+; SM_70: .visible .func (.param .b32 func_retval0) prop()
+; SM_70: mov.b32 %[[REG:.+]], 2;
+; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: ret;
+;
+; SM_90: .visible .func (.param .b32 func_retval0) prop()
+; SM_90: mov.b32 %[[REG:.+]], 1;
+; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: ret;
+define i32 @prop() {
+entry:
+ %call = call i32 @__nvvm_reflect(ptr @.str)
+ %conv = zext i32 %call to i64
+ %div = udiv i64 %conv, 100
+ %cmp = icmp eq i64 %div, 9
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ br label %return
+
+if.else:
+ %div2 = udiv i64 %conv, 100
+ %cmp3 = icmp eq i64 %div2, 7
+ br i1 %cmp3, label %if.then5, label %if.else6
+
+if.then5:
+ br label %return
+
+if.else6:
+ %div7 = udiv i64 %conv, 100
+ %cmp8 = icmp eq i64 %div7, 5
+ br i1 %cmp8, label %if.then10, label %if.else11
+
+if.then10:
+ br label %return
+
+if.else11:
+ br label %return
+
+return:
+ %retval = phi i32 [ 1, %if.then ], [ 2, %if.then5 ], [ 3, %if.then10 ], [ 4, %if.else11 ]
+ ret i32 %retval
+}
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
index e8c554c9ed52..ac5875c6ab10 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -18,4 +18,3 @@ define i32 @foo(float %a, float %b) {
; SM35: ret i32 350
ret i32 %reflect
}
-
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 1cb5c87fae82..46ab79d9858c 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -41,7 +41,7 @@ exit:
ret float %ret
}
-declare i32 @llvm.nvvm.reflect.p0(ptr)
+declare i32 @llvm.nvvm.reflect(ptr)
; CHECK-LABEL: define noundef i32 @intrinsic
define i32 @intrinsic() {
@@ -49,7 +49,7 @@ define i32 @intrinsic() {
; USE_FTZ_0: ret i32 0
; USE_FTZ_1: ret i32 1
%ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str)
- %reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr)
+ %reflect = tail call i32 @llvm.nvvm.reflect(ptr %ptr)
ret i32 %reflect
}
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index 9b1939f37208..2ed9f7c11bcf 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -41,7 +41,7 @@ exit:
ret float %ret
}
-declare i32 @llvm.nvvm.reflect.p0(ptr)
+declare i32 @llvm.nvvm.reflect(ptr)
; CHECK-LABEL: define noundef i32 @intrinsic
define i32 @intrinsic() {
@@ -49,7 +49,7 @@ define i32 @intrinsic() {
; USE_FTZ_0: ret i32 0
; USE_FTZ_1: ret i32 1
%ptr = tail call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) @str)
- %reflect = tail call i32 @llvm.nvvm.reflect.p0(ptr %ptr)
+ %reflect = tail call i32 @llvm.nvvm.reflect(ptr %ptr)
ret i32 %reflect
}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
index f4a098dd28c4..d14735046516 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
@@ -220,25 +220,28 @@ body: |
; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[LSHR3]], [[C5]]
; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[OR5]], [[AND5]]
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1085102592571150096
- ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[OR6]], [[C8]]
+ ; CHECK-NEXT: [[CONSTANT_POOL:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.2
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL]](p0) :: (load (s64) from constant-pool)
+ ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[OR6]], [[LOAD]]
; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s64) = G_LSHR [[AND6]], [[C7]](s64)
; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR6]], [[C7]](s64)
- ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SHL4]], [[C8]]
+ ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SHL4]], [[LOAD]]
; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s64) = G_OR [[LSHR4]], [[AND7]]
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 -3689348814741910324
- ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[OR7]], [[C10]]
- ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s64) = G_LSHR [[AND8]], [[C9]](s64)
- ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C9]](s64)
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SHL5]], [[C10]]
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CHECK-NEXT: [[CONSTANT_POOL1:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.1
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL1]](p0) :: (load (s64) from constant-pool)
+ ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[OR7]], [[LOAD1]]
+ ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s64) = G_LSHR [[AND8]], [[C8]](s64)
+ ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C8]](s64)
+ ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SHL5]], [[LOAD1]]
; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s64) = G_OR [[LSHR5]], [[AND9]]
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 -6148914691236517206
- ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[OR8]], [[C12]]
- ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND10]], [[C11]](s64)
- ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C11]](s64)
- ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s64) = G_AND [[SHL6]], [[C12]]
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[CONSTANT_POOL2:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.0
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL2]](p0) :: (load (s64) from constant-pool)
+ ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[OR8]], [[LOAD2]]
+ ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND10]], [[C9]](s64)
+ ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C9]](s64)
+ ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s64) = G_AND [[SHL6]], [[LOAD2]]
; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s64) = G_OR [[LSHR6]], [[AND11]]
; CHECK-NEXT: $x10 = COPY [[OR9]](s64)
; CHECK-NEXT: PseudoRET implicit $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
index fa57295d3a54..6389fd6fcd07 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
@@ -6,8 +6,9 @@ name: const_i8
body: |
bb.0.entry:
; CHECK-LABEL: name: const_i8
- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -127
- ; CHECK-NEXT: $x10 = COPY [[C]](s64)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -127
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
; CHECK-NEXT: PseudoRET implicit $x10
%0:_(s8) = G_CONSTANT i8 129
%1:_(s64) = G_ANYEXT %0(s8)
@@ -20,8 +21,9 @@ name: const_i15
body: |
bb.0.entry:
; CHECK-LABEL: name: const_i15
- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
- ; CHECK-NEXT: $x10 = COPY [[C]](s64)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
; CHECK-NEXT: PseudoRET implicit $x10
%0:_(s15) = G_CONSTANT i15 15
%1:_(s64) = G_ANYEXT %0(s15)
@@ -34,8 +36,9 @@ name: const_i16
body: |
bb.0.entry:
; CHECK-LABEL: name: const_i16
- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 767
- ; CHECK-NEXT: $x10 = COPY [[C]](s64)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 767
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
; CHECK-NEXT: PseudoRET implicit $x10
%0:_(s16) = G_CONSTANT i16 -64769
%1:_(s64) = G_ANYEXT %0(s16)
@@ -48,8 +51,9 @@ name: const_i32
body: |
bb.0.entry:
; CHECK-LABEL: name: const_i32
- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -64769
- ; CHECK-NEXT: $x10 = COPY [[C]](s64)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -64769
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
; CHECK-NEXT: PseudoRET implicit $x10
%0:_(s32) = G_CONSTANT i32 -64769
%1:_(s64) = G_ANYEXT %0(s32)
@@ -180,3 +184,19 @@ body: |
PseudoRET implicit $x10
...
+
+...
+---
+name: constant_pool_i64
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constant_pool_i64
+ ; CHECK: [[CONSTANT_POOL:%[0-9]+]]:_(p0) = G_CONSTANT_POOL %const.0
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[CONSTANT_POOL]](p0) :: (load (s64) from constant-pool)
+ ; CHECK-NEXT: $x10 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = G_CONSTANT i64 -1085102592571150096
+ $x10 = COPY %0(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index e7db8ef9d5af..364c1e430b91 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -68,6 +68,7 @@
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
+; CHECK-NEXT: Type Promotion
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
index 710b602df6d0..5e8ed4509b53 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
@@ -150,24 +150,24 @@ define void @callee() nounwind {
;
; RV32I-ILP32E-LABEL: callee:
; RV32I-ILP32E: # %bb.0:
-; RV32I-ILP32E-NEXT: addi sp, sp, -48
-; RV32I-ILP32E-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32I-ILP32E-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32I-ILP32E-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: addi sp, sp, -36
+; RV32I-ILP32E-NEXT: sw ra, 32(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: lui a6, %hi(var)
; RV32I-ILP32E-NEXT: lw a0, %lo(var)(a6)
-; RV32I-ILP32E-NEXT: sw a0, 32(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: lw a0, %lo(var+4)(a6)
-; RV32I-ILP32E-NEXT: sw a0, 28(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: lw a0, %lo(var+8)(a6)
-; RV32I-ILP32E-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: lw a0, %lo(var+12)(a6)
-; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: addi a5, a6, %lo(var)
; RV32I-ILP32E-NEXT: lw a0, 16(a5)
-; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw a0, 4(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: lw a0, 20(a5)
-; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: sw a0, 0(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: lw t0, 24(a5)
; RV32I-ILP32E-NEXT: lw t1, 28(a5)
; RV32I-ILP32E-NEXT: lw t2, 32(a5)
@@ -220,22 +220,22 @@ define void @callee() nounwind {
; RV32I-ILP32E-NEXT: sw t2, 32(a5)
; RV32I-ILP32E-NEXT: sw t1, 28(a5)
; RV32I-ILP32E-NEXT: sw t0, 24(a5)
-; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw a0, 0(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: sw a0, 20(a5)
-; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: sw a0, 16(a5)
-; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: sw a0, %lo(var+12)(a6)
-; RV32I-ILP32E-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: sw a0, %lo(var+8)(a6)
-; RV32I-ILP32E-NEXT: lw a0, 28(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: sw a0, %lo(var+4)(a6)
-; RV32I-ILP32E-NEXT: lw a0, 32(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: sw a0, %lo(var)(a6)
-; RV32I-ILP32E-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT: addi sp, sp, 48
+; RV32I-ILP32E-NEXT: lw ra, 32(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: addi sp, sp, 36
; RV32I-ILP32E-NEXT: ret
;
; RV32I-WITH-FP-LABEL: callee:
@@ -659,24 +659,24 @@ define void @callee() nounwind {
;
; RV64I-LP64E-LABEL: callee:
; RV64I-LP64E: # %bb.0:
-; RV64I-LP64E-NEXT: addi sp, sp, -80
-; RV64I-LP64E-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT: sd s1, 56(sp) # 8-byte Folded Spill
+; RV64I-LP64E-NEXT: addi sp, sp, -72
+; RV64I-LP64E-NEXT: sd ra, 64(sp) # 8-byte Folded Spill
+; RV64I-LP64E-NEXT: sd s0, 56(sp) # 8-byte Folded Spill
+; RV64I-LP64E-NEXT: sd s1, 48(sp) # 8-byte Folded Spill
; RV64I-LP64E-NEXT: lui a6, %hi(var)
; RV64I-LP64E-NEXT: lw a0, %lo(var)(a6)
-; RV64I-LP64E-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a6)
; RV64I-LP64E-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a6)
+; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a6)
; RV64I-LP64E-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a6)
+; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a6)
; RV64I-LP64E-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a6)
+; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
; RV64I-LP64E-NEXT: addi a5, a6, %lo(var)
; RV64I-LP64E-NEXT: lw a0, 16(a5)
-; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT: lw a0, 20(a5)
; RV64I-LP64E-NEXT: sd a0, 8(sp) # 8-byte Folded Spill
+; RV64I-LP64E-NEXT: lw a0, 20(a5)
+; RV64I-LP64E-NEXT: sd a0, 0(sp) # 8-byte Folded Spill
; RV64I-LP64E-NEXT: lw t0, 24(a5)
; RV64I-LP64E-NEXT: lw t1, 28(a5)
; RV64I-LP64E-NEXT: lw t2, 32(a5)
@@ -729,22 +729,22 @@ define void @callee() nounwind {
; RV64I-LP64E-NEXT: sw t2, 32(a5)
; RV64I-LP64E-NEXT: sw t1, 28(a5)
; RV64I-LP64E-NEXT: sw t0, 24(a5)
-; RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld a0, 0(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: sw a0, 20(a5)
-; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: sw a0, 16(a5)
-; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: sw a0, %lo(var+12)(a6)
-; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: sw a0, %lo(var+8)(a6)
-; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: sw a0, %lo(var+4)(a6)
-; RV64I-LP64E-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: sw a0, %lo(var)(a6)
-; RV64I-LP64E-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT: ld s1, 56(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT: addi sp, sp, 80
+; RV64I-LP64E-NEXT: ld ra, 64(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld s0, 56(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: ld s1, 48(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: addi sp, sp, 72
; RV64I-LP64E-NEXT: ret
;
; RV64I-WITH-FP-LABEL: callee:
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
index 0eb63913be88..5c55113fc974 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
@@ -151,20 +151,20 @@ define i32 @callee_float_on_stack(i64 %a, i64 %b, i64 %c, i64 %d, float %e) {
;
; ILP32E-WITHFP-LABEL: callee_float_on_stack:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lw a0, 8(s0)
; ILP32E-WITHFP-NEXT: lw a1, 0(s0)
; ILP32E-WITHFP-NEXT: add a0, a1, a0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_float_on_stack:
@@ -298,18 +298,18 @@ define float @callee_tiny_scalar_ret() {
;
; ILP32E-WITHFP-LABEL: callee_tiny_scalar_ret:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lui a0, 260096
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_tiny_scalar_ret:
@@ -543,13 +543,13 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
;
; ILP32E-WITHFP-LABEL: callee_aligned_stack:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lw a0, 0(a2)
; ILP32E-WITHFP-NEXT: lw a1, 12(s0)
@@ -562,9 +562,9 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
; ILP32E-WITHFP-NEXT: add a0, a0, a1
; ILP32E-WITHFP-NEXT: add a4, a5, a4
; ILP32E-WITHFP-NEXT: add a0, a0, a4
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_aligned_stack:
@@ -847,19 +847,19 @@ define double @callee_small_scalar_ret() {
;
; ILP32E-WITHFP-LABEL: callee_small_scalar_ret:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lui a1, 261888
; ILP32E-WITHFP-NEXT: li a0, 0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_scalar_ret:
@@ -944,18 +944,18 @@ define i32 @callee_i64_in_regs(i32 %a, i64 %b) {
;
; ILP32E-WITHFP-LABEL: callee_i64_in_regs:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: add a0, a0, a1
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_i64_in_regs:
@@ -1066,13 +1066,13 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
;
; ILP32E-WITHFP-LABEL: callee_many_scalars:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lw a6, 12(s0)
; ILP32E-WITHFP-NEXT: lw a7, 0(s0)
@@ -1091,9 +1091,9 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
; ILP32E-WITHFP-NEXT: add a0, a0, a7
; ILP32E-WITHFP-NEXT: add a0, a0, a6
; ILP32E-WITHFP-NEXT: add a0, a1, a0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_many_scalars:
@@ -1287,13 +1287,13 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
;
; ILP32E-WITHFP-LABEL: callee_large_scalars:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lw a2, 0(a1)
; ILP32E-WITHFP-NEXT: lw a3, 0(a0)
@@ -1311,9 +1311,9 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
; ILP32E-WITHFP-NEXT: or a0, a2, a0
; ILP32E-WITHFP-NEXT: or a0, a0, a4
; ILP32E-WITHFP-NEXT: seqz a0, a0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars:
@@ -1514,13 +1514,13 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
;
; ILP32E-WITHFP-LABEL: callee_large_scalars_exhausted_regs:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lw a0, 12(s0)
; ILP32E-WITHFP-NEXT: lw a1, 4(s0)
@@ -1540,9 +1540,9 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
; ILP32E-WITHFP-NEXT: or a0, a2, a0
; ILP32E-WITHFP-NEXT: or a0, a0, a4
; ILP32E-WITHFP-NEXT: seqz a0, a0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars_exhausted_regs:
@@ -1872,19 +1872,19 @@ define i32 @callee_small_coerced_struct([2 x i32] %a.coerce) {
;
; ILP32E-WITHFP-LABEL: callee_small_coerced_struct:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: xor a0, a0, a1
; ILP32E-WITHFP-NEXT: seqz a0, a0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_coerced_struct:
@@ -1983,20 +1983,20 @@ define i32 @callee_large_struct(ptr byval(%struct.large) align 4 %a) {
;
; ILP32E-WITHFP-LABEL: callee_large_struct:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lw a1, 0(a0)
; ILP32E-WITHFP-NEXT: lw a0, 12(a0)
; ILP32E-WITHFP-NEXT: add a0, a1, a0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_struct:
@@ -2153,19 +2153,19 @@ define %struct.small @callee_small_struct_ret() {
;
; ILP32E-WITHFP-LABEL: callee_small_struct_ret:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: li a0, 1
; ILP32E-WITHFP-NEXT: li a1, 0
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_small_struct_ret:
@@ -2260,22 +2260,22 @@ define fp128 @callee_large_scalar_ret() {
;
; ILP32E-WITHFP-LABEL: callee_large_scalar_ret:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: lui a1, 524272
; ILP32E-WITHFP-NEXT: sw a1, 12(a0)
; ILP32E-WITHFP-NEXT: sw zero, 8(a0)
; ILP32E-WITHFP-NEXT: sw zero, 4(a0)
; ILP32E-WITHFP-NEXT: sw zero, 0(a0)
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalar_ret:
@@ -2395,13 +2395,13 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result
;
; ILP32E-WITHFP-LABEL: callee_large_struct_ret:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -16
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 16
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -8
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 8
+; ILP32E-WITHFP-NEXT: sw ra, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 0(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -4
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -8
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi s0, sp, 8
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0
; ILP32E-WITHFP-NEXT: li a1, 1
; ILP32E-WITHFP-NEXT: sw a1, 0(a0)
@@ -2411,9 +2411,9 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result
; ILP32E-WITHFP-NEXT: sw a1, 8(a0)
; ILP32E-WITHFP-NEXT: li a1, 4
; ILP32E-WITHFP-NEXT: sw a1, 12(a0)
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 16
+; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 8
; ILP32E-WITHFP-NEXT: ret
;
; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_struct_ret:
diff --git a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
index 9e7f2e9525d3..6e3a50542939 100644
--- a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
+++ b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
@@ -254,21 +254,39 @@ define i1 @shifts_necmp_i64_i8(i64 %x) nounwind {
; ---------------------------------------------------------------------------- ;
define i1 @add_ultcmp_i16_i8(i16 %x) nounwind {
-; RV32-LABEL: add_ultcmp_i16_i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, a0, -128
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 24
-; RV32-NEXT: sltiu a0, a0, 255
-; RV32-NEXT: ret
+; RV32I-LABEL: add_ultcmp_i16_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: srli a0, a0, 16
+; RV32I-NEXT: addi a0, a0, -128
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sltiu a0, a0, 255
+; RV32I-NEXT: ret
;
-; RV64-LABEL: add_ultcmp_i16_i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, a0, -128
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 56
-; RV64-NEXT: sltiu a0, a0, 255
-; RV64-NEXT: ret
+; RV64I-LABEL: add_ultcmp_i16_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: addi a0, a0, -128
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sltiu a0, a0, 255
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: add_ultcmp_i16_i8:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: zext.h a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -128
+; RV32ZBB-NEXT: srli a0, a0, 8
+; RV32ZBB-NEXT: sltiu a0, a0, 255
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: add_ultcmp_i16_i8:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: zext.h a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -128
+; RV64ZBB-NEXT: srli a0, a0, 8
+; RV64ZBB-NEXT: sltiu a0, a0, 255
+; RV64ZBB-NEXT: ret
%tmp0 = add i16 %x, -128 ; ~0U << (8-1)
%tmp1 = icmp ult i16 %tmp0, -256 ; ~0U << 8
ret i1 %tmp1
@@ -421,21 +439,39 @@ define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
; Slightly more canonical variant
define i1 @add_ulecmp_i16_i8(i16 %x) nounwind {
-; RV32-LABEL: add_ulecmp_i16_i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, a0, -128
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 24
-; RV32-NEXT: sltiu a0, a0, 255
-; RV32-NEXT: ret
+; RV32I-LABEL: add_ulecmp_i16_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: srli a0, a0, 16
+; RV32I-NEXT: addi a0, a0, -128
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sltiu a0, a0, 255
+; RV32I-NEXT: ret
;
-; RV64-LABEL: add_ulecmp_i16_i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, a0, -128
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 56
-; RV64-NEXT: sltiu a0, a0, 255
-; RV64-NEXT: ret
+; RV64I-LABEL: add_ulecmp_i16_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: addi a0, a0, -128
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sltiu a0, a0, 255
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: add_ulecmp_i16_i8:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: zext.h a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -128
+; RV32ZBB-NEXT: srli a0, a0, 8
+; RV32ZBB-NEXT: sltiu a0, a0, 255
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: add_ulecmp_i16_i8:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: zext.h a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -128
+; RV64ZBB-NEXT: srli a0, a0, 8
+; RV64ZBB-NEXT: sltiu a0, a0, 255
+; RV64ZBB-NEXT: ret
%tmp0 = add i16 %x, -128 ; ~0U << (8-1)
%tmp1 = icmp ule i16 %tmp0, -257 ; ~0U << 8 - 1
ret i1 %tmp1
diff --git a/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll b/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll
index ca6895dc51fa..5edf3cf49e25 100644
--- a/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll
+++ b/llvm/test/CodeGen/RISCV/push-pop-opt-crash.ll
@@ -1,8 +1,8 @@
; RUN: llc -mattr=+zcmp -verify-machineinstrs \
-; RUN: -mtriple=riscv32 -target-abi ilp32 < %s \
+; RUN: -mtriple=riscv32 -target-abi=ilp32 < %s \
; RUN: | FileCheck %s -check-prefixes=RV32IZCMP
; RUN: llc -mattr=+zcmp -verify-machineinstrs \
-; RUN: -mtriple=riscv64 -target-abi ilp64 < %s \
+; RUN: -mtriple=riscv64 -target-abi=lp64 < %s \
; RUN: | FileCheck %s -check-prefixes=RV64IZCMP
; This source code exposed a crash in the RISC-V Zcmp Push/Pop optimization
diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
index 09a91498a15d..e007dcc764e9 100644
--- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll
+++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
@@ -3218,3 +3218,74 @@ entry:
call void @bar(ptr %0, ptr %var)
ret i32 %x
}
+
+define void @spill_x10() {
+; RV32IZCMP-LABEL: spill_x10:
+; RV32IZCMP: # %bb.0: # %entry
+; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -64
+; RV32IZCMP-NEXT: .cfi_def_cfa_offset 64
+; RV32IZCMP-NEXT: .cfi_offset s10, -8
+; RV32IZCMP-NEXT: #APP
+; RV32IZCMP-NEXT: li s10, 0
+; RV32IZCMP-NEXT: #NO_APP
+; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 64
+;
+; RV64IZCMP-LABEL: spill_x10:
+; RV64IZCMP: # %bb.0: # %entry
+; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -112
+; RV64IZCMP-NEXT: .cfi_def_cfa_offset 112
+; RV64IZCMP-NEXT: .cfi_offset s10, -16
+; RV64IZCMP-NEXT: #APP
+; RV64IZCMP-NEXT: li s10, 0
+; RV64IZCMP-NEXT: #NO_APP
+; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 112
+;
+; RV32IZCMP-SR-LABEL: spill_x10:
+; RV32IZCMP-SR: # %bb.0: # %entry
+; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -64
+; RV32IZCMP-SR-NEXT: .cfi_def_cfa_offset 64
+; RV32IZCMP-SR-NEXT: .cfi_offset s10, -8
+; RV32IZCMP-SR-NEXT: #APP
+; RV32IZCMP-SR-NEXT: li s10, 0
+; RV32IZCMP-SR-NEXT: #NO_APP
+; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 64
+;
+; RV64IZCMP-SR-LABEL: spill_x10:
+; RV64IZCMP-SR: # %bb.0: # %entry
+; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -112
+; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 112
+; RV64IZCMP-SR-NEXT: .cfi_offset s10, -16
+; RV64IZCMP-SR-NEXT: #APP
+; RV64IZCMP-SR-NEXT: li s10, 0
+; RV64IZCMP-SR-NEXT: #NO_APP
+; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 112
+;
+; RV32I-LABEL: spill_x10:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw s10, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset s10, -4
+; RV32I-NEXT: #APP
+; RV32I-NEXT: li s10, 0
+; RV32I-NEXT: #NO_APP
+; RV32I-NEXT: lw s10, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: spill_x10:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd s10, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset s10, -8
+; RV64I-NEXT: #APP
+; RV64I-NEXT: li s10, 0
+; RV64I-NEXT: #NO_APP
+; RV64I-NEXT: ld s10, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+entry:
+ tail call void asm sideeffect "li s10, 0", "~{s10}"()
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll
index 2fb674f56080..7fe67a07255d 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/vararg.ll
@@ -76,28 +76,28 @@ define i32 @va1(ptr %fmt, ...) {
;
; LP64E-FPELIM-LABEL: va1:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
-; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
+; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 28
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 20
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va1:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
; LP64E-WITHFP-NEXT: .cfi_offset ra, -56
; LP64E-WITHFP-NEXT: .cfi_offset s0, -64
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
@@ -107,9 +107,9 @@ define i32 @va1(ptr %fmt, ...) {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 12
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -161,24 +161,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va1_va_arg:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 32
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 24
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va1_va_arg:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
@@ -187,9 +187,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 16
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -435,24 +435,24 @@ define i64 @va2(ptr %fmt, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va2:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 39
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 31
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va2:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
@@ -461,9 +461,9 @@ define i64 @va2(ptr %fmt, ...) nounwind {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 23
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -521,24 +521,24 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va2_va_arg:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 32
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 24
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va2_va_arg:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
@@ -547,9 +547,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 16
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -654,23 +654,23 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va3:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -48
-; LP64E-FPELIM-NEXT: sd a5, 40(sp)
-; LP64E-FPELIM-NEXT: sd a4, 32(sp)
-; LP64E-FPELIM-NEXT: sd a3, 24(sp)
-; LP64E-FPELIM-NEXT: sd a2, 16(sp)
-; LP64E-FPELIM-NEXT: addi a3, sp, 31
+; LP64E-FPELIM-NEXT: addi sp, sp, -40
+; LP64E-FPELIM-NEXT: sd a5, 32(sp)
+; LP64E-FPELIM-NEXT: sd a4, 24(sp)
+; LP64E-FPELIM-NEXT: sd a3, 16(sp)
+; LP64E-FPELIM-NEXT: sd a2, 8(sp)
+; LP64E-FPELIM-NEXT: addi a3, sp, 23
; LP64E-FPELIM-NEXT: add a0, a1, a2
-; LP64E-FPELIM-NEXT: sd a3, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 48
+; LP64E-FPELIM-NEXT: sd a3, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 40
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va3:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -64
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -56
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: sd a5, 24(s0)
; LP64E-WITHFP-NEXT: sd a4, 16(s0)
; LP64E-WITHFP-NEXT: sd a3, 8(s0)
@@ -678,9 +678,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
; LP64E-WITHFP-NEXT: addi a3, s0, 15
; LP64E-WITHFP-NEXT: add a0, a1, a2
; LP64E-WITHFP-NEXT: sd a3, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 64
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 56
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -737,23 +737,23 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va3_va_arg:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -48
-; LP64E-FPELIM-NEXT: sd a5, 40(sp)
-; LP64E-FPELIM-NEXT: sd a4, 32(sp)
-; LP64E-FPELIM-NEXT: sd a3, 24(sp)
-; LP64E-FPELIM-NEXT: sd a2, 16(sp)
-; LP64E-FPELIM-NEXT: addi a3, sp, 24
+; LP64E-FPELIM-NEXT: addi sp, sp, -40
+; LP64E-FPELIM-NEXT: sd a5, 32(sp)
+; LP64E-FPELIM-NEXT: sd a4, 24(sp)
+; LP64E-FPELIM-NEXT: sd a3, 16(sp)
+; LP64E-FPELIM-NEXT: sd a2, 8(sp)
+; LP64E-FPELIM-NEXT: addi a3, sp, 16
; LP64E-FPELIM-NEXT: add a0, a1, a2
-; LP64E-FPELIM-NEXT: sd a3, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 48
+; LP64E-FPELIM-NEXT: sd a3, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 40
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va3_va_arg:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -64
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -56
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: sd a5, 24(s0)
; LP64E-WITHFP-NEXT: sd a4, 16(s0)
; LP64E-WITHFP-NEXT: sd a3, 8(s0)
@@ -761,9 +761,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
; LP64E-WITHFP-NEXT: addi a3, s0, 8
; LP64E-WITHFP-NEXT: add a0, a1, a2
; LP64E-WITHFP-NEXT: sd a3, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 64
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 56
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -1208,24 +1208,24 @@ define i32 @va6_no_fixed_args(...) nounwind {
;
; LP64E-FPELIM-LABEL: va6_no_fixed_args:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: sd a0, 16(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 24
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: sd a0, 8(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 16
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va6_no_fixed_args:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
; LP64E-WITHFP-NEXT: sd a3, 24(s0)
@@ -1234,9 +1234,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
; LP64E-WITHFP-NEXT: sd a0, 0(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 8
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index c8531ed1f7cf..af7d7f7ae755 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -588,3 +588,39 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
%v3 = insertelement <8 x i32> %v2, i32 %e3, i32 7
ret <8 x i32> %v3
}
+
+; Here we can not pull the ashr through into the vector domain due to
+; the truncate semantics of the build_vector. Doing so would
+; truncate before the ashr instead of after it, so if %a or %b
+; is e.g. UINT32_MAX+1 we get different result.
+define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) {
+; RV32-LABEL: build_vec_of_trunc_op:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: slli a1, a1, 31
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: slli a3, a3, 31
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: vslide1down.vx v8, v8, a2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: build_vec_of_trunc_op:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: srli a1, a1, 1
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: ret
+entry:
+ %conv11.i = ashr i64 %a, 1
+ %conv11.2 = ashr i64 %b, 1
+ %0 = trunc i64 %conv11.i to i32
+ %1 = trunc i64 %conv11.2 to i32
+ %2 = insertelement <2 x i32> zeroinitializer, i32 %0, i64 0
+ %3 = insertelement <2 x i32> %2, i32 %1, i64 1
+ ret <2 x i32> %3
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index 8b51a3883042..c49b1a7ad186 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -76,10 +76,8 @@ define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) {
; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_4:
; CHECK-KNOWNVLEN128: # %bb.0:
; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 4
; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
+; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1)
; CHECK-KNOWNVLEN128-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 4)
@@ -101,8 +99,8 @@ define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) {
; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_5:
; CHECK-KNOWNVLEN128: # %bb.0:
; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 1
; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
; CHECK-KNOWNVLEN128-NEXT: ret
@@ -172,10 +170,8 @@ define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) {
; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_4:
; CHECK-KNOWNVLEN128: # %bb.0:
; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 4
; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
+; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1)
; CHECK-KNOWNVLEN128-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 4)
@@ -197,8 +193,8 @@ define void @extract_v2i32_v8i32_6(ptr %x, ptr %y) {
; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_6:
; CHECK-KNOWNVLEN128: # %bb.0:
; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 6
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2
; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
; CHECK-KNOWNVLEN128-NEXT: ret
@@ -234,39 +230,59 @@ define void @extract_v2i32_nxv16i32_2(<vscale x 16 x i32> %x, ptr %y) {
}
define void @extract_v2i32_nxv16i32_4(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-LABEL: extract_v2i32_nxv16i32_4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 4
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: ret
+; CHECK-V-LABEL: extract_v2i32_nxv16i32_4:
+; CHECK-V: # %bb.0:
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 4
+; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT: vse32.v v8, (a0)
+; CHECK-V-NEXT: ret
+;
+; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_4:
+; CHECK-KNOWNVLEN128: # %bb.0:
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a0)
+; CHECK-KNOWNVLEN128-NEXT: ret
%c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 4)
store <2 x i32> %c, ptr %y
ret void
}
define void @extract_v2i32_nxv16i32_6(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-LABEL: extract_v2i32_nxv16i32_6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 6
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: ret
+; CHECK-V-LABEL: extract_v2i32_nxv16i32_6:
+; CHECK-V: # %bb.0:
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 6
+; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT: vse32.v v8, (a0)
+; CHECK-V-NEXT: ret
+;
+; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_6:
+; CHECK-KNOWNVLEN128: # %bb.0:
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a0)
+; CHECK-KNOWNVLEN128-NEXT: ret
%c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 6)
store <2 x i32> %c, ptr %y
ret void
}
define void @extract_v2i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-LABEL: extract_v2i32_nxv16i32_8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, m4, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: ret
+; CHECK-V-LABEL: extract_v2i32_nxv16i32_8:
+; CHECK-V: # %bb.0:
+; CHECK-V-NEXT: vsetivli zero, 2, e32, m4, ta, ma
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 8
+; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT: vse32.v v8, (a0)
+; CHECK-V-NEXT: ret
+;
+; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_8:
+; CHECK-KNOWNVLEN128: # %bb.0:
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vse32.v v10, (a0)
+; CHECK-KNOWNVLEN128-NEXT: ret
%c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
store <2 x i32> %c, ptr %y
ret void
@@ -333,9 +349,7 @@ define void @extract_v8i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
;
; CHECK-KNOWNVLEN128-LABEL: extract_v8i32_nxv16i32_8:
; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-KNOWNVLEN128-NEXT: vs2r.v v8, (a0)
+; CHECK-KNOWNVLEN128-NEXT: vs2r.v v10, (a0)
; CHECK-KNOWNVLEN128-NEXT: ret
%c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
store <8 x i32> %c, ptr %y
@@ -611,9 +625,8 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0)
; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: li a0, 42
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10
; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
@@ -741,51 +754,91 @@ define void @extract_v2i1_nxv64i1_2(<vscale x 64 x i1> %x, ptr %y) {
}
define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
-; CHECK-LABEL: extract_v2i1_nxv64i1_42:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: li a1, 42
-; CHECK-NEXT: vsetivli zero, 2, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 0
-; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-NEXT: vmv.v.v v9, v8
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmsne.vi v8, v9, 0
-; CHECK-NEXT: vsm.v v8, (a0)
-; CHECK-NEXT: ret
+; CHECK-V-LABEL: extract_v2i1_nxv64i1_42:
+; CHECK-V: # %bb.0:
+; CHECK-V-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-V-NEXT: vmv.v.i v8, 0
+; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT: li a1, 42
+; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma
+; CHECK-V-NEXT: vslidedown.vx v8, v8, a1
+; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-V-NEXT: vmsne.vi v0, v8, 0
+; CHECK-V-NEXT: vmv.v.i v8, 0
+; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-V-NEXT: vmv.v.i v9, 0
+; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; CHECK-V-NEXT: vmv.v.v v9, v8
+; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-V-NEXT: vmsne.vi v8, v9, 0
+; CHECK-V-NEXT: vsm.v v8, (a0)
+; CHECK-V-NEXT: ret
+;
+; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv64i1_42:
+; CHECK-KNOWNVLEN128: # %bb.0:
+; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
+; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
+; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
+; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0)
+; CHECK-KNOWNVLEN128-NEXT: ret
%c = call <2 x i1> @llvm.vector.extract.v2i1.nxv64i1(<vscale x 64 x i1> %x, i64 42)
store <2 x i1> %c, ptr %y
ret void
}
define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
-; CHECK-LABEL: extract_v2i1_nxv32i1_26:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vsetivli zero, 2, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 26
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 0
-; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-NEXT: vmv.v.v v9, v8
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmsne.vi v8, v9, 0
-; CHECK-NEXT: vsm.v v8, (a0)
-; CHECK-NEXT: ret
+; CHECK-V-LABEL: extract_v2i1_nxv32i1_26:
+; CHECK-V: # %bb.0:
+; CHECK-V-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-V-NEXT: vmv.v.i v8, 0
+; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT: vsetivli zero, 2, e8, m2, ta, ma
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 26
+; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-V-NEXT: vmsne.vi v0, v8, 0
+; CHECK-V-NEXT: vmv.v.i v8, 0
+; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-V-NEXT: vmv.v.i v9, 0
+; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; CHECK-V-NEXT: vmv.v.v v9, v8
+; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-V-NEXT: vmsne.vi v8, v9, 0
+; CHECK-V-NEXT: vsm.v v8, (a0)
+; CHECK-V-NEXT: ret
+;
+; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv32i1_26:
+; CHECK-KNOWNVLEN128: # %bb.0:
+; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
+; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 10
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
+; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
+; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
+; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0)
+; CHECK-KNOWNVLEN128-NEXT: ret
%c = call <2 x i1> @llvm.vector.extract.v2i1.nxv32i1(<vscale x 32 x i1> %x, i64 26)
store <2 x i1> %c, ptr %y
ret void
@@ -804,6 +857,16 @@ define void @extract_v8i1_nxv32i1_16(<vscale x 32 x i1> %x, ptr %y) {
ret void
}
+define <1 x i64> @extract_v1i64_v2i64_1(<2 x i64> %x) {
+; CHECK-LABEL: extract_v1i64_v2i64_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: ret
+ %v = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %x, i64 1)
+ ret <1 x i64> %v
+}
+
declare <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %vec, i64 %idx)
declare <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %vec, i64 %idx)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 57b219343c3e..a2bd862e2ce1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1394,3 +1394,77 @@ define <2 x double> @vid_step2_v2f64() {
; CHECK-NEXT: ret
ret <2 x double> <double 0.0, double 2.0>
}
+
+
+define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float %e3, float %e4, float %e5, float %e6, float %e7) vscale_range(4, 128) {
+; CHECK-LABEL: buildvec_v8f32_zvl256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: ret
+ %v0 = insertelement <8 x float> poison, float %e0, i64 0
+ %v1 = insertelement <8 x float> %v0, float %e1, i64 1
+ %v2 = insertelement <8 x float> %v1, float %e2, i64 2
+ %v3 = insertelement <8 x float> %v2, float %e3, i64 3
+ %v4 = insertelement <8 x float> %v3, float %e4, i64 4
+ %v5 = insertelement <8 x float> %v4, float %e5, i64 5
+ %v6 = insertelement <8 x float> %v5, float %e6, i64 6
+ %v7 = insertelement <8 x float> %v6, float %e7, i64 7
+ ret <8 x float> %v7
+}
+
+
+define <8 x double> @buildvec_v8f64_zvl256(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(4, 128) {
+; CHECK-LABEL: buildvec_v8f64_zvl256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e64, m2, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: ret
+ %v0 = insertelement <8 x double> poison, double %e0, i64 0
+ %v1 = insertelement <8 x double> %v0, double %e1, i64 1
+ %v2 = insertelement <8 x double> %v1, double %e2, i64 2
+ %v3 = insertelement <8 x double> %v2, double %e3, i64 3
+ %v4 = insertelement <8 x double> %v3, double %e4, i64 4
+ %v5 = insertelement <8 x double> %v4, double %e5, i64 5
+ %v6 = insertelement <8 x double> %v5, double %e6, i64 6
+ %v7 = insertelement <8 x double> %v6, double %e7, i64 7
+ ret <8 x double> %v7
+}
+
+define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(8, 128) {
+; CHECK-LABEL: buildvec_v8f64_zvl512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: ret
+ %v0 = insertelement <8 x double> poison, double %e0, i64 0
+ %v1 = insertelement <8 x double> %v0, double %e1, i64 1
+ %v2 = insertelement <8 x double> %v1, double %e2, i64 2
+ %v3 = insertelement <8 x double> %v2, double %e3, i64 3
+ %v4 = insertelement <8 x double> %v3, double %e4, i64 4
+ %v5 = insertelement <8 x double> %v4, double %e5, i64 5
+ %v6 = insertelement <8 x double> %v5, double %e6, i64 6
+ %v7 = insertelement <8 x double> %v6, double %e7, i64 7
+ ret <8 x double> %v7
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index dfafbfb97284..e691e6358115 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -1178,3 +1178,512 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
%v4 = insertelement <8 x i64> %v3, i64 %d, i32 7
ret <8 x i64> %v4
}
+
+
+define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
+; RV32-LABEL: buildvec_v16i8_loads_contigous:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: lbu a1, 1(a0)
+; RV32-NEXT: lbu a2, 2(a0)
+; RV32-NEXT: lbu a3, 3(a0)
+; RV32-NEXT: lbu a4, 4(a0)
+; RV32-NEXT: lbu a5, 5(a0)
+; RV32-NEXT: lbu a6, 6(a0)
+; RV32-NEXT: lbu a7, 7(a0)
+; RV32-NEXT: lbu t0, 8(a0)
+; RV32-NEXT: lbu t1, 9(a0)
+; RV32-NEXT: lbu t2, 10(a0)
+; RV32-NEXT: lbu t3, 11(a0)
+; RV32-NEXT: lbu t4, 12(a0)
+; RV32-NEXT: lbu t5, 13(a0)
+; RV32-NEXT: lbu t6, 14(a0)
+; RV32-NEXT: lbu s0, 15(a0)
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vlse8.v v8, (a0), zero
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: vslide1down.vx v8, v8, a2
+; RV32-NEXT: vslide1down.vx v8, v8, a3
+; RV32-NEXT: vslide1down.vx v8, v8, a4
+; RV32-NEXT: vslide1down.vx v8, v8, a5
+; RV32-NEXT: vslide1down.vx v8, v8, a6
+; RV32-NEXT: vslide1down.vx v8, v8, a7
+; RV32-NEXT: vslide1down.vx v8, v8, t0
+; RV32-NEXT: vslide1down.vx v8, v8, t1
+; RV32-NEXT: vslide1down.vx v8, v8, t2
+; RV32-NEXT: vslide1down.vx v8, v8, t3
+; RV32-NEXT: vslide1down.vx v8, v8, t4
+; RV32-NEXT: vslide1down.vx v8, v8, t5
+; RV32-NEXT: vslide1down.vx v8, v8, t6
+; RV32-NEXT: vslide1down.vx v8, v8, s0
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: buildvec_v16i8_loads_contigous:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: lbu a1, 1(a0)
+; RV64-NEXT: lbu a2, 2(a0)
+; RV64-NEXT: lbu a3, 3(a0)
+; RV64-NEXT: lbu a4, 4(a0)
+; RV64-NEXT: lbu a5, 5(a0)
+; RV64-NEXT: lbu a6, 6(a0)
+; RV64-NEXT: lbu a7, 7(a0)
+; RV64-NEXT: lbu t0, 8(a0)
+; RV64-NEXT: lbu t1, 9(a0)
+; RV64-NEXT: lbu t2, 10(a0)
+; RV64-NEXT: lbu t3, 11(a0)
+; RV64-NEXT: lbu t4, 12(a0)
+; RV64-NEXT: lbu t5, 13(a0)
+; RV64-NEXT: lbu t6, 14(a0)
+; RV64-NEXT: lbu s0, 15(a0)
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vlse8.v v8, (a0), zero
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: vslide1down.vx v8, v8, a2
+; RV64-NEXT: vslide1down.vx v8, v8, a3
+; RV64-NEXT: vslide1down.vx v8, v8, a4
+; RV64-NEXT: vslide1down.vx v8, v8, a5
+; RV64-NEXT: vslide1down.vx v8, v8, a6
+; RV64-NEXT: vslide1down.vx v8, v8, a7
+; RV64-NEXT: vslide1down.vx v8, v8, t0
+; RV64-NEXT: vslide1down.vx v8, v8, t1
+; RV64-NEXT: vslide1down.vx v8, v8, t2
+; RV64-NEXT: vslide1down.vx v8, v8, t3
+; RV64-NEXT: vslide1down.vx v8, v8, t4
+; RV64-NEXT: vslide1down.vx v8, v8, t5
+; RV64-NEXT: vslide1down.vx v8, v8, t6
+; RV64-NEXT: vslide1down.vx v8, v8, s0
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %p2 = getelementptr i8, ptr %p, i32 1
+ %p3 = getelementptr i8, ptr %p, i32 2
+ %p4 = getelementptr i8, ptr %p, i32 3
+ %p5 = getelementptr i8, ptr %p, i32 4
+ %p6 = getelementptr i8, ptr %p, i32 5
+ %p7 = getelementptr i8, ptr %p, i32 6
+ %p8 = getelementptr i8, ptr %p, i32 7
+ %p9 = getelementptr i8, ptr %p, i32 8
+ %p10 = getelementptr i8, ptr %p, i32 9
+ %p11 = getelementptr i8, ptr %p, i32 10
+ %p12 = getelementptr i8, ptr %p, i32 11
+ %p13 = getelementptr i8, ptr %p, i32 12
+ %p14 = getelementptr i8, ptr %p, i32 13
+ %p15 = getelementptr i8, ptr %p, i32 14
+ %p16 = getelementptr i8, ptr %p, i32 15
+
+ %ld1 = load i8, ptr %p
+ %ld2 = load i8, ptr %p2
+ %ld3 = load i8, ptr %p3
+ %ld4 = load i8, ptr %p4
+ %ld5 = load i8, ptr %p5
+ %ld6 = load i8, ptr %p6
+ %ld7 = load i8, ptr %p7
+ %ld8 = load i8, ptr %p8
+ %ld9 = load i8, ptr %p9
+ %ld10 = load i8, ptr %p10
+ %ld11 = load i8, ptr %p11
+ %ld12 = load i8, ptr %p12
+ %ld13 = load i8, ptr %p13
+ %ld14 = load i8, ptr %p14
+ %ld15 = load i8, ptr %p15
+ %ld16 = load i8, ptr %p16
+
+ %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0
+ %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1
+ %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2
+ %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3
+ %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4
+ %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5
+ %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6
+ %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7
+ %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8
+ %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9
+ %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10
+ %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11
+ %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12
+ %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13
+ %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14
+ %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15
+ ret <16 x i8> %v16
+}
+
+
+define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
+; RV32-LABEL: buildvec_v16i8_loads_gather:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: lbu a1, 1(a0)
+; RV32-NEXT: lbu a2, 22(a0)
+; RV32-NEXT: lbu a3, 31(a0)
+; RV32-NEXT: lbu a4, 44(a0)
+; RV32-NEXT: lbu a5, 55(a0)
+; RV32-NEXT: lbu a6, 623(a0)
+; RV32-NEXT: lbu a7, 75(a0)
+; RV32-NEXT: lbu t0, 82(a0)
+; RV32-NEXT: lbu t1, 93(a0)
+; RV32-NEXT: lbu t2, 105(a0)
+; RV32-NEXT: lbu t3, 161(a0)
+; RV32-NEXT: lbu t4, 124(a0)
+; RV32-NEXT: lbu t5, 163(a0)
+; RV32-NEXT: lbu t6, 144(a0)
+; RV32-NEXT: lbu s0, 154(a0)
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vlse8.v v8, (a0), zero
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: vslide1down.vx v8, v8, a2
+; RV32-NEXT: vslide1down.vx v8, v8, a3
+; RV32-NEXT: vslide1down.vx v8, v8, a4
+; RV32-NEXT: vslide1down.vx v8, v8, a5
+; RV32-NEXT: vslide1down.vx v8, v8, a6
+; RV32-NEXT: vslide1down.vx v8, v8, a7
+; RV32-NEXT: vslide1down.vx v8, v8, t0
+; RV32-NEXT: vslide1down.vx v8, v8, t1
+; RV32-NEXT: vslide1down.vx v8, v8, t2
+; RV32-NEXT: vslide1down.vx v8, v8, t3
+; RV32-NEXT: vslide1down.vx v8, v8, t4
+; RV32-NEXT: vslide1down.vx v8, v8, t5
+; RV32-NEXT: vslide1down.vx v8, v8, t6
+; RV32-NEXT: vslide1down.vx v8, v8, s0
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: buildvec_v16i8_loads_gather:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset s0, -8
+; RV64-NEXT: lbu a1, 1(a0)
+; RV64-NEXT: lbu a2, 22(a0)
+; RV64-NEXT: lbu a3, 31(a0)
+; RV64-NEXT: lbu a4, 44(a0)
+; RV64-NEXT: lbu a5, 55(a0)
+; RV64-NEXT: lbu a6, 623(a0)
+; RV64-NEXT: lbu a7, 75(a0)
+; RV64-NEXT: lbu t0, 82(a0)
+; RV64-NEXT: lbu t1, 93(a0)
+; RV64-NEXT: lbu t2, 105(a0)
+; RV64-NEXT: lbu t3, 161(a0)
+; RV64-NEXT: lbu t4, 124(a0)
+; RV64-NEXT: lbu t5, 163(a0)
+; RV64-NEXT: lbu t6, 144(a0)
+; RV64-NEXT: lbu s0, 154(a0)
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vlse8.v v8, (a0), zero
+; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: vslide1down.vx v8, v8, a2
+; RV64-NEXT: vslide1down.vx v8, v8, a3
+; RV64-NEXT: vslide1down.vx v8, v8, a4
+; RV64-NEXT: vslide1down.vx v8, v8, a5
+; RV64-NEXT: vslide1down.vx v8, v8, a6
+; RV64-NEXT: vslide1down.vx v8, v8, a7
+; RV64-NEXT: vslide1down.vx v8, v8, t0
+; RV64-NEXT: vslide1down.vx v8, v8, t1
+; RV64-NEXT: vslide1down.vx v8, v8, t2
+; RV64-NEXT: vslide1down.vx v8, v8, t3
+; RV64-NEXT: vslide1down.vx v8, v8, t4
+; RV64-NEXT: vslide1down.vx v8, v8, t5
+; RV64-NEXT: vslide1down.vx v8, v8, t6
+; RV64-NEXT: vslide1down.vx v8, v8, s0
+; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %p2 = getelementptr i8, ptr %p, i32 1
+ %p3 = getelementptr i8, ptr %p, i32 22
+ %p4 = getelementptr i8, ptr %p, i32 31
+ %p5 = getelementptr i8, ptr %p, i32 44
+ %p6 = getelementptr i8, ptr %p, i32 55
+ %p7 = getelementptr i8, ptr %p, i32 623
+ %p8 = getelementptr i8, ptr %p, i32 75
+ %p9 = getelementptr i8, ptr %p, i32 82
+ %p10 = getelementptr i8, ptr %p, i32 93
+ %p11 = getelementptr i8, ptr %p, i32 105
+ %p12 = getelementptr i8, ptr %p, i32 161
+ %p13 = getelementptr i8, ptr %p, i32 124
+ %p14 = getelementptr i8, ptr %p, i32 163
+ %p15 = getelementptr i8, ptr %p, i32 144
+ %p16 = getelementptr i8, ptr %p, i32 154
+
+ %ld1 = load i8, ptr %p
+ %ld2 = load i8, ptr %p2
+ %ld3 = load i8, ptr %p3
+ %ld4 = load i8, ptr %p4
+ %ld5 = load i8, ptr %p5
+ %ld6 = load i8, ptr %p6
+ %ld7 = load i8, ptr %p7
+ %ld8 = load i8, ptr %p8
+ %ld9 = load i8, ptr %p9
+ %ld10 = load i8, ptr %p10
+ %ld11 = load i8, ptr %p11
+ %ld12 = load i8, ptr %p12
+ %ld13 = load i8, ptr %p13
+ %ld14 = load i8, ptr %p14
+ %ld15 = load i8, ptr %p15
+ %ld16 = load i8, ptr %p16
+
+ %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0
+ %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1
+ %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2
+ %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3
+ %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4
+ %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5
+ %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6
+ %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7
+ %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8
+ %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9
+ %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10
+ %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11
+ %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12
+ %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13
+ %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14
+ %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15
+ ret <16 x i8> %v16
+}
+
+define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
+; CHECK-LABEL: buildvec_v16i8_undef_low_half:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, 82
+; CHECK-NEXT: lbu a2, 93(a0)
+; CHECK-NEXT: lbu a3, 105(a0)
+; CHECK-NEXT: lbu a4, 161(a0)
+; CHECK-NEXT: lbu a5, 124(a0)
+; CHECK-NEXT: lbu a6, 163(a0)
+; CHECK-NEXT: lbu a7, 144(a0)
+; CHECK-NEXT: lbu a0, 154(a0)
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a1), zero
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vslide1down.vx v8, v8, a4
+; CHECK-NEXT: vslide1down.vx v8, v8, a5
+; CHECK-NEXT: vslide1down.vx v8, v8, a6
+; CHECK-NEXT: vslide1down.vx v8, v8, a7
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %p9 = getelementptr i8, ptr %p, i32 82
+ %p10 = getelementptr i8, ptr %p, i32 93
+ %p11 = getelementptr i8, ptr %p, i32 105
+ %p12 = getelementptr i8, ptr %p, i32 161
+ %p13 = getelementptr i8, ptr %p, i32 124
+ %p14 = getelementptr i8, ptr %p, i32 163
+ %p15 = getelementptr i8, ptr %p, i32 144
+ %p16 = getelementptr i8, ptr %p, i32 154
+
+ %ld9 = load i8, ptr %p9
+ %ld10 = load i8, ptr %p10
+ %ld11 = load i8, ptr %p11
+ %ld12 = load i8, ptr %p12
+ %ld13 = load i8, ptr %p13
+ %ld14 = load i8, ptr %p14
+ %ld15 = load i8, ptr %p15
+ %ld16 = load i8, ptr %p16
+
+ %v9 = insertelement <16 x i8> poison, i8 %ld9, i32 8
+ %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9
+ %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10
+ %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11
+ %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12
+ %v14 = insertelement <16 x i8> %v13, i8 %ld14, i32 13
+ %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14
+ %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15
+ ret <16 x i8> %v16
+}
+
+define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
+; CHECK-LABEL: buildvec_v16i8_undef_high_half:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lbu a1, 1(a0)
+; CHECK-NEXT: lbu a2, 22(a0)
+; CHECK-NEXT: lbu a3, 31(a0)
+; CHECK-NEXT: lbu a4, 44(a0)
+; CHECK-NEXT: lbu a5, 55(a0)
+; CHECK-NEXT: lbu a6, 623(a0)
+; CHECK-NEXT: lbu a7, 75(a0)
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), zero
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vslide1down.vx v8, v8, a4
+; CHECK-NEXT: vslide1down.vx v8, v8, a5
+; CHECK-NEXT: vslide1down.vx v8, v8, a6
+; CHECK-NEXT: vslide1down.vx v8, v8, a7
+; CHECK-NEXT: vslidedown.vi v8, v8, 8
+; CHECK-NEXT: ret
+ %p2 = getelementptr i8, ptr %p, i32 1
+ %p3 = getelementptr i8, ptr %p, i32 22
+ %p4 = getelementptr i8, ptr %p, i32 31
+ %p5 = getelementptr i8, ptr %p, i32 44
+ %p6 = getelementptr i8, ptr %p, i32 55
+ %p7 = getelementptr i8, ptr %p, i32 623
+ %p8 = getelementptr i8, ptr %p, i32 75
+
+ %ld1 = load i8, ptr %p
+ %ld2 = load i8, ptr %p2
+ %ld3 = load i8, ptr %p3
+ %ld4 = load i8, ptr %p4
+ %ld5 = load i8, ptr %p5
+ %ld6 = load i8, ptr %p6
+ %ld7 = load i8, ptr %p7
+ %ld8 = load i8, ptr %p8
+
+ %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0
+ %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1
+ %v3 = insertelement <16 x i8> %v2, i8 %ld3, i32 2
+ %v4 = insertelement <16 x i8> %v3, i8 %ld4, i32 3
+ %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4
+ %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5
+ %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6
+ %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7
+ ret <16 x i8> %v8
+}
+
+define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
+; CHECK-LABEL: buildvec_v16i8_undef_edges:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, 31
+; CHECK-NEXT: lbu a2, 44(a0)
+; CHECK-NEXT: lbu a3, 55(a0)
+; CHECK-NEXT: lbu a4, 623(a0)
+; CHECK-NEXT: lbu a5, 75(a0)
+; CHECK-NEXT: lbu a6, 82(a0)
+; CHECK-NEXT: lbu a7, 93(a0)
+; CHECK-NEXT: lbu t0, 105(a0)
+; CHECK-NEXT: lbu a0, 161(a0)
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a1), zero
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vslide1down.vx v8, v8, a4
+; CHECK-NEXT: vslide1down.vx v8, v8, a5
+; CHECK-NEXT: vslide1down.vx v8, v8, a6
+; CHECK-NEXT: vslide1down.vx v8, v8, a7
+; CHECK-NEXT: vslide1down.vx v8, v8, t0
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
+; CHECK-NEXT: ret
+ %p4 = getelementptr i8, ptr %p, i32 31
+ %p5 = getelementptr i8, ptr %p, i32 44
+ %p6 = getelementptr i8, ptr %p, i32 55
+ %p7 = getelementptr i8, ptr %p, i32 623
+ %p8 = getelementptr i8, ptr %p, i32 75
+ %p9 = getelementptr i8, ptr %p, i32 82
+ %p10 = getelementptr i8, ptr %p, i32 93
+ %p11 = getelementptr i8, ptr %p, i32 105
+ %p12 = getelementptr i8, ptr %p, i32 161
+
+ %ld4 = load i8, ptr %p4
+ %ld5 = load i8, ptr %p5
+ %ld6 = load i8, ptr %p6
+ %ld7 = load i8, ptr %p7
+ %ld8 = load i8, ptr %p8
+ %ld9 = load i8, ptr %p9
+ %ld10 = load i8, ptr %p10
+ %ld11 = load i8, ptr %p11
+ %ld12 = load i8, ptr %p12
+
+ %v4 = insertelement <16 x i8> poison, i8 %ld4, i32 3
+ %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4
+ %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5
+ %v7 = insertelement <16 x i8> %v6, i8 %ld7, i32 6
+ %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7
+ %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8
+ %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9
+ %v11 = insertelement <16 x i8> %v10, i8 %ld11, i32 10
+ %v12 = insertelement <16 x i8> %v11, i8 %ld12, i32 11
+ ret <16 x i8> %v12
+}
+
+define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
+; CHECK-LABEL: buildvec_v16i8_loads_undef_scattered:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lbu a1, 1(a0)
+; CHECK-NEXT: lbu a2, 44(a0)
+; CHECK-NEXT: lbu a3, 55(a0)
+; CHECK-NEXT: lbu a4, 75(a0)
+; CHECK-NEXT: lbu a5, 82(a0)
+; CHECK-NEXT: lbu a6, 93(a0)
+; CHECK-NEXT: lbu a7, 124(a0)
+; CHECK-NEXT: lbu t0, 144(a0)
+; CHECK-NEXT: lbu t1, 154(a0)
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), zero
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslidedown.vi v8, v8, 2
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vslide1down.vx v8, v8, a4
+; CHECK-NEXT: vslide1down.vx v8, v8, a5
+; CHECK-NEXT: vslide1down.vx v8, v8, a6
+; CHECK-NEXT: vslidedown.vi v8, v8, 2
+; CHECK-NEXT: vslide1down.vx v8, v8, a7
+; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vslide1down.vx v8, v8, t0
+; CHECK-NEXT: vslide1down.vx v8, v8, t1
+; CHECK-NEXT: ret
+ %p2 = getelementptr i8, ptr %p, i32 1
+ %p3 = getelementptr i8, ptr %p, i32 22
+ %p4 = getelementptr i8, ptr %p, i32 31
+ %p5 = getelementptr i8, ptr %p, i32 44
+ %p6 = getelementptr i8, ptr %p, i32 55
+ %p7 = getelementptr i8, ptr %p, i32 623
+ %p8 = getelementptr i8, ptr %p, i32 75
+ %p9 = getelementptr i8, ptr %p, i32 82
+ %p10 = getelementptr i8, ptr %p, i32 93
+ %p11 = getelementptr i8, ptr %p, i32 105
+ %p12 = getelementptr i8, ptr %p, i32 161
+ %p13 = getelementptr i8, ptr %p, i32 124
+ %p14 = getelementptr i8, ptr %p, i32 163
+ %p15 = getelementptr i8, ptr %p, i32 144
+ %p16 = getelementptr i8, ptr %p, i32 154
+
+ %ld1 = load i8, ptr %p
+ %ld2 = load i8, ptr %p2
+ %ld3 = load i8, ptr %p3
+ %ld4 = load i8, ptr %p4
+ %ld5 = load i8, ptr %p5
+ %ld6 = load i8, ptr %p6
+ %ld7 = load i8, ptr %p7
+ %ld8 = load i8, ptr %p8
+ %ld9 = load i8, ptr %p9
+ %ld10 = load i8, ptr %p10
+ %ld11 = load i8, ptr %p11
+ %ld12 = load i8, ptr %p12
+ %ld13 = load i8, ptr %p13
+ %ld14 = load i8, ptr %p14
+ %ld15 = load i8, ptr %p15
+ %ld16 = load i8, ptr %p16
+
+ %v1 = insertelement <16 x i8> poison, i8 %ld1, i32 0
+ %v2 = insertelement <16 x i8> %v1, i8 %ld2, i32 1
+ %v3 = insertelement <16 x i8> %v2, i8 undef, i32 2
+ %v4 = insertelement <16 x i8> %v3, i8 undef, i32 3
+ %v5 = insertelement <16 x i8> %v4, i8 %ld5, i32 4
+ %v6 = insertelement <16 x i8> %v5, i8 %ld6, i32 5
+ %v7 = insertelement <16 x i8> %v6, i8 undef, i32 6
+ %v8 = insertelement <16 x i8> %v7, i8 %ld8, i32 7
+ %v9 = insertelement <16 x i8> %v8, i8 %ld9, i32 8
+ %v10 = insertelement <16 x i8> %v9, i8 %ld10, i32 9
+ %v11 = insertelement <16 x i8> %v10, i8 undef, i32 10
+ %v12 = insertelement <16 x i8> %v11, i8 undef, i32 11
+ %v13 = insertelement <16 x i8> %v12, i8 %ld13, i32 12
+ %v14 = insertelement <16 x i8> %v13, i8 undef, i32 13
+ %v15 = insertelement <16 x i8> %v14, i8 %ld15, i32 14
+ %v16 = insertelement <16 x i8> %v15, i8 %ld16, i32 15
+ ret <16 x i8> %v16
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index acad71bb5959..0e8d9cf03066 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -722,97 +722,25 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
; FIXME: This could be expressed as a vrgather.vv
define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
-; RV32-LABEL: shuffle_v64i8_v8i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: li a0, 64
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 8
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vmv.x.s a1, v8
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 16
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 24
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: lbu a0, 32(sp)
-; RV32-NEXT: lbu a1, 40(sp)
-; RV32-NEXT: lbu a2, 48(sp)
-; RV32-NEXT: lbu a3, 56(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
-; RV32-NEXT: ret
-;
-; RV64-LABEL: shuffle_v64i8_v8i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: li a0, 64
-; RV64-NEXT: mv a1, sp
-; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 8
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vmv.v.x v10, a1
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 16
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 24
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: lbu a0, 32(sp)
-; RV64-NEXT: lbu a1, 40(sp)
-; RV64-NEXT: lbu a2, 48(sp)
-; RV64-NEXT: lbu a3, 56(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
-; RV64-NEXT: ret
+; CHECK-LABEL: shuffle_v64i8_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: vsll.vi v14, v12, 3
+; CHECK-NEXT: vrgather.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: li a1, 240
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: lui a1, 98561
+; CHECK-NEXT: addi a1, a1, -2048
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
+; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t
+; CHECK-NEXT: vmv1r.v v8, v12
+; CHECK-NEXT: ret
%s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
ret <8 x i8> %s
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index df41ac10f80d..890707c6337f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -14638,5 +14638,453 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
%v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %allones, <8 x i16> poison)
ret <8 x i16> %v
}
+
+; v32i64 is not a legal type, so make sure we don't try to combine the mgather
+; to a vlse intrinsic until it is legalized and split.
+define <32 x i64> @mgather_strided_split(ptr %base) {
+; RV32V-LABEL: mgather_strided_split:
+; RV32V: # %bb.0:
+; RV32V-NEXT: li a1, 16
+; RV32V-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32V-NEXT: vlse64.v v8, (a0), a1
+; RV32V-NEXT: addi a0, a0, 256
+; RV32V-NEXT: vlse64.v v16, (a0), a1
+; RV32V-NEXT: ret
+;
+; RV64V-LABEL: mgather_strided_split:
+; RV64V: # %bb.0:
+; RV64V-NEXT: li a1, 16
+; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64V-NEXT: vlse64.v v8, (a0), a1
+; RV64V-NEXT: addi a0, a0, 256
+; RV64V-NEXT: vlse64.v v16, (a0), a1
+; RV64V-NEXT: ret
+;
+; RV32ZVE32F-LABEL: mgather_strided_split:
+; RV32ZVE32F: # %bb.0:
+; RV32ZVE32F-NEXT: addi sp, sp, -512
+; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 512
+; RV32ZVE32F-NEXT: sw ra, 508(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s0, 504(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s2, 500(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s3, 496(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s4, 492(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s5, 488(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s6, 484(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s7, 480(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s8, 476(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s9, 472(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s10, 468(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: sw s11, 464(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: .cfi_offset ra, -4
+; RV32ZVE32F-NEXT: .cfi_offset s0, -8
+; RV32ZVE32F-NEXT: .cfi_offset s2, -12
+; RV32ZVE32F-NEXT: .cfi_offset s3, -16
+; RV32ZVE32F-NEXT: .cfi_offset s4, -20
+; RV32ZVE32F-NEXT: .cfi_offset s5, -24
+; RV32ZVE32F-NEXT: .cfi_offset s6, -28
+; RV32ZVE32F-NEXT: .cfi_offset s7, -32
+; RV32ZVE32F-NEXT: .cfi_offset s8, -36
+; RV32ZVE32F-NEXT: .cfi_offset s9, -40
+; RV32ZVE32F-NEXT: .cfi_offset s10, -44
+; RV32ZVE32F-NEXT: .cfi_offset s11, -48
+; RV32ZVE32F-NEXT: addi s0, sp, 512
+; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0
+; RV32ZVE32F-NEXT: andi sp, sp, -128
+; RV32ZVE32F-NEXT: li a2, 32
+; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32ZVE32F-NEXT: vid.v v8
+; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4
+; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 216(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 208(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 248(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 2
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 244(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 236(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 3
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 228(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 220(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 4
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 240(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 232(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 5
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 224(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 212(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 6
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 204(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 200(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: vslidedown.vi v16, v8, 7
+; RV32ZVE32F-NEXT: vmv.x.s a1, v16
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 196(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 192(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: addi a1, sp, 256
+; RV32ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32ZVE32F-NEXT: vse32.v v8, (a1)
+; RV32ZVE32F-NEXT: lw a1, 288(sp)
+; RV32ZVE32F-NEXT: lw a2, 292(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 188(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 184(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 296(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a2)
+; RV32ZVE32F-NEXT: sw a3, 180(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 4(a2)
+; RV32ZVE32F-NEXT: sw a2, 176(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 300(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 172(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 168(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 304(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a2)
+; RV32ZVE32F-NEXT: sw a3, 164(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 4(a2)
+; RV32ZVE32F-NEXT: sw a2, 160(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 308(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 156(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 152(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 312(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a2)
+; RV32ZVE32F-NEXT: sw a3, 148(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 4(a2)
+; RV32ZVE32F-NEXT: sw a2, 144(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 316(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 140(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 136(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 320(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a2)
+; RV32ZVE32F-NEXT: sw a3, 132(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 4(a2)
+; RV32ZVE32F-NEXT: sw a2, 128(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 324(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 124(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a1, 120(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 328(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a2)
+; RV32ZVE32F-NEXT: sw a3, 116(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 4(a2)
+; RV32ZVE32F-NEXT: sw a2, 112(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 332(sp)
+; RV32ZVE32F-NEXT: lw a3, 0(a1)
+; RV32ZVE32F-NEXT: sw a3, 104(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw ra, 4(a1)
+; RV32ZVE32F-NEXT: lw a1, 336(sp)
+; RV32ZVE32F-NEXT: lw s10, 0(a2)
+; RV32ZVE32F-NEXT: lw s8, 4(a2)
+; RV32ZVE32F-NEXT: lw a2, 340(sp)
+; RV32ZVE32F-NEXT: lw s6, 0(a1)
+; RV32ZVE32F-NEXT: lw s4, 4(a1)
+; RV32ZVE32F-NEXT: lw a4, 344(sp)
+; RV32ZVE32F-NEXT: lw s2, 0(a2)
+; RV32ZVE32F-NEXT: lw t5, 4(a2)
+; RV32ZVE32F-NEXT: lw a2, 348(sp)
+; RV32ZVE32F-NEXT: lw t3, 0(a4)
+; RV32ZVE32F-NEXT: lw t2, 4(a4)
+; RV32ZVE32F-NEXT: lw a4, 352(sp)
+; RV32ZVE32F-NEXT: lw t0, 0(a2)
+; RV32ZVE32F-NEXT: lw a7, 4(a2)
+; RV32ZVE32F-NEXT: lw a2, 356(sp)
+; RV32ZVE32F-NEXT: lw a6, 0(a4)
+; RV32ZVE32F-NEXT: lw a5, 4(a4)
+; RV32ZVE32F-NEXT: lw a4, 360(sp)
+; RV32ZVE32F-NEXT: lw a1, 0(a2)
+; RV32ZVE32F-NEXT: sw a1, 108(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a1, 4(a2)
+; RV32ZVE32F-NEXT: sw a1, 100(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw a2, 364(sp)
+; RV32ZVE32F-NEXT: lw s11, 0(a4)
+; RV32ZVE32F-NEXT: lw s9, 4(a4)
+; RV32ZVE32F-NEXT: lw a1, 368(sp)
+; RV32ZVE32F-NEXT: lw s7, 0(a2)
+; RV32ZVE32F-NEXT: lw s5, 4(a2)
+; RV32ZVE32F-NEXT: lw a3, 372(sp)
+; RV32ZVE32F-NEXT: lw s3, 0(a1)
+; RV32ZVE32F-NEXT: lw t6, 4(a1)
+; RV32ZVE32F-NEXT: lw a2, 376(sp)
+; RV32ZVE32F-NEXT: lw t4, 0(a3)
+; RV32ZVE32F-NEXT: lw a1, 380(sp)
+; RV32ZVE32F-NEXT: lw t1, 4(a3)
+; RV32ZVE32F-NEXT: lw a4, 0(a2)
+; RV32ZVE32F-NEXT: lw a3, 4(a2)
+; RV32ZVE32F-NEXT: lw a2, 0(a1)
+; RV32ZVE32F-NEXT: lw a1, 4(a1)
+; RV32ZVE32F-NEXT: sw a5, 196(a0)
+; RV32ZVE32F-NEXT: sw a6, 192(a0)
+; RV32ZVE32F-NEXT: sw a7, 188(a0)
+; RV32ZVE32F-NEXT: sw t0, 184(a0)
+; RV32ZVE32F-NEXT: sw t2, 180(a0)
+; RV32ZVE32F-NEXT: sw t3, 176(a0)
+; RV32ZVE32F-NEXT: sw t5, 172(a0)
+; RV32ZVE32F-NEXT: sw s2, 168(a0)
+; RV32ZVE32F-NEXT: sw s4, 164(a0)
+; RV32ZVE32F-NEXT: sw s6, 160(a0)
+; RV32ZVE32F-NEXT: sw s8, 156(a0)
+; RV32ZVE32F-NEXT: sw s10, 152(a0)
+; RV32ZVE32F-NEXT: sw ra, 148(a0)
+; RV32ZVE32F-NEXT: lw a5, 104(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 144(a0)
+; RV32ZVE32F-NEXT: lw a5, 112(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 140(a0)
+; RV32ZVE32F-NEXT: lw a5, 116(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 136(a0)
+; RV32ZVE32F-NEXT: lw a5, 120(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 132(a0)
+; RV32ZVE32F-NEXT: lw a5, 124(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 128(a0)
+; RV32ZVE32F-NEXT: lw a5, 128(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 124(a0)
+; RV32ZVE32F-NEXT: lw a5, 132(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 120(a0)
+; RV32ZVE32F-NEXT: lw a5, 136(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 116(a0)
+; RV32ZVE32F-NEXT: lw a5, 140(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 112(a0)
+; RV32ZVE32F-NEXT: lw a5, 144(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 108(a0)
+; RV32ZVE32F-NEXT: lw a5, 148(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 104(a0)
+; RV32ZVE32F-NEXT: lw a5, 152(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 100(a0)
+; RV32ZVE32F-NEXT: lw a5, 156(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 96(a0)
+; RV32ZVE32F-NEXT: lw a5, 160(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 92(a0)
+; RV32ZVE32F-NEXT: lw a5, 164(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 88(a0)
+; RV32ZVE32F-NEXT: lw a5, 168(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 84(a0)
+; RV32ZVE32F-NEXT: lw a5, 172(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 80(a0)
+; RV32ZVE32F-NEXT: lw a5, 176(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 76(a0)
+; RV32ZVE32F-NEXT: lw a5, 180(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 72(a0)
+; RV32ZVE32F-NEXT: lw a5, 184(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 68(a0)
+; RV32ZVE32F-NEXT: lw a5, 188(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 64(a0)
+; RV32ZVE32F-NEXT: lw a5, 208(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 4(a0)
+; RV32ZVE32F-NEXT: lw a5, 216(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a5, 0(a0)
+; RV32ZVE32F-NEXT: sw a1, 252(a0)
+; RV32ZVE32F-NEXT: sw a2, 248(a0)
+; RV32ZVE32F-NEXT: sw a3, 244(a0)
+; RV32ZVE32F-NEXT: sw a4, 240(a0)
+; RV32ZVE32F-NEXT: sw t1, 236(a0)
+; RV32ZVE32F-NEXT: sw t4, 232(a0)
+; RV32ZVE32F-NEXT: sw t6, 228(a0)
+; RV32ZVE32F-NEXT: sw s3, 224(a0)
+; RV32ZVE32F-NEXT: sw s5, 220(a0)
+; RV32ZVE32F-NEXT: sw s7, 216(a0)
+; RV32ZVE32F-NEXT: sw s9, 212(a0)
+; RV32ZVE32F-NEXT: sw s11, 208(a0)
+; RV32ZVE32F-NEXT: lw a1, 100(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 204(a0)
+; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 200(a0)
+; RV32ZVE32F-NEXT: lw a1, 220(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 28(a0)
+; RV32ZVE32F-NEXT: lw a1, 228(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 24(a0)
+; RV32ZVE32F-NEXT: lw a1, 236(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 20(a0)
+; RV32ZVE32F-NEXT: lw a1, 244(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 16(a0)
+; RV32ZVE32F-NEXT: lw a1, 248(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 12(a0)
+; RV32ZVE32F-NEXT: lw a1, 252(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 8(a0)
+; RV32ZVE32F-NEXT: lw a1, 192(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 60(a0)
+; RV32ZVE32F-NEXT: lw a1, 196(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 56(a0)
+; RV32ZVE32F-NEXT: lw a1, 200(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 52(a0)
+; RV32ZVE32F-NEXT: lw a1, 204(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 48(a0)
+; RV32ZVE32F-NEXT: lw a1, 212(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 44(a0)
+; RV32ZVE32F-NEXT: lw a1, 224(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 40(a0)
+; RV32ZVE32F-NEXT: lw a1, 232(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 36(a0)
+; RV32ZVE32F-NEXT: lw a1, 240(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: sw a1, 32(a0)
+; RV32ZVE32F-NEXT: addi sp, s0, -512
+; RV32ZVE32F-NEXT: lw ra, 508(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s0, 504(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s2, 500(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s3, 496(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s4, 492(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s5, 488(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s6, 484(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s7, 480(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s8, 476(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s9, 472(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s10, 468(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: lw s11, 464(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT: addi sp, sp, 512
+; RV32ZVE32F-NEXT: ret
+;
+; RV64ZVE32F-LABEL: mgather_strided_split:
+; RV64ZVE32F: # %bb.0:
+; RV64ZVE32F-NEXT: addi sp, sp, -144
+; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 144
+; RV64ZVE32F-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s0, 128(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s1, 120(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s2, 112(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s3, 104(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s4, 96(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s5, 88(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s6, 80(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s7, 72(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s8, 64(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s9, 56(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s10, 48(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: sd s11, 40(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: .cfi_offset ra, -8
+; RV64ZVE32F-NEXT: .cfi_offset s0, -16
+; RV64ZVE32F-NEXT: .cfi_offset s1, -24
+; RV64ZVE32F-NEXT: .cfi_offset s2, -32
+; RV64ZVE32F-NEXT: .cfi_offset s3, -40
+; RV64ZVE32F-NEXT: .cfi_offset s4, -48
+; RV64ZVE32F-NEXT: .cfi_offset s5, -56
+; RV64ZVE32F-NEXT: .cfi_offset s6, -64
+; RV64ZVE32F-NEXT: .cfi_offset s7, -72
+; RV64ZVE32F-NEXT: .cfi_offset s8, -80
+; RV64ZVE32F-NEXT: .cfi_offset s9, -88
+; RV64ZVE32F-NEXT: .cfi_offset s10, -96
+; RV64ZVE32F-NEXT: .cfi_offset s11, -104
+; RV64ZVE32F-NEXT: ld a2, 0(a1)
+; RV64ZVE32F-NEXT: sd a2, 32(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: ld a2, 16(a1)
+; RV64ZVE32F-NEXT: sd a2, 24(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: ld a2, 32(a1)
+; RV64ZVE32F-NEXT: sd a2, 16(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: ld a2, 48(a1)
+; RV64ZVE32F-NEXT: sd a2, 8(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: ld a2, 64(a1)
+; RV64ZVE32F-NEXT: sd a2, 0(sp) # 8-byte Folded Spill
+; RV64ZVE32F-NEXT: ld a7, 80(a1)
+; RV64ZVE32F-NEXT: ld t0, 96(a1)
+; RV64ZVE32F-NEXT: ld t1, 112(a1)
+; RV64ZVE32F-NEXT: ld t2, 128(a1)
+; RV64ZVE32F-NEXT: ld t3, 144(a1)
+; RV64ZVE32F-NEXT: ld t4, 160(a1)
+; RV64ZVE32F-NEXT: ld t5, 176(a1)
+; RV64ZVE32F-NEXT: ld t6, 192(a1)
+; RV64ZVE32F-NEXT: ld s0, 208(a1)
+; RV64ZVE32F-NEXT: ld s1, 224(a1)
+; RV64ZVE32F-NEXT: ld s2, 240(a1)
+; RV64ZVE32F-NEXT: ld s3, 256(a1)
+; RV64ZVE32F-NEXT: ld s4, 272(a1)
+; RV64ZVE32F-NEXT: ld s5, 288(a1)
+; RV64ZVE32F-NEXT: ld s6, 304(a1)
+; RV64ZVE32F-NEXT: ld s7, 320(a1)
+; RV64ZVE32F-NEXT: ld s8, 336(a1)
+; RV64ZVE32F-NEXT: ld s9, 352(a1)
+; RV64ZVE32F-NEXT: ld s10, 368(a1)
+; RV64ZVE32F-NEXT: ld s11, 384(a1)
+; RV64ZVE32F-NEXT: ld ra, 400(a1)
+; RV64ZVE32F-NEXT: ld a6, 416(a1)
+; RV64ZVE32F-NEXT: ld a5, 432(a1)
+; RV64ZVE32F-NEXT: ld a2, 496(a1)
+; RV64ZVE32F-NEXT: ld a3, 480(a1)
+; RV64ZVE32F-NEXT: ld a4, 464(a1)
+; RV64ZVE32F-NEXT: ld a1, 448(a1)
+; RV64ZVE32F-NEXT: sd a2, 248(a0)
+; RV64ZVE32F-NEXT: sd a3, 240(a0)
+; RV64ZVE32F-NEXT: sd a4, 232(a0)
+; RV64ZVE32F-NEXT: sd a1, 224(a0)
+; RV64ZVE32F-NEXT: sd a5, 216(a0)
+; RV64ZVE32F-NEXT: sd a6, 208(a0)
+; RV64ZVE32F-NEXT: sd ra, 200(a0)
+; RV64ZVE32F-NEXT: sd s11, 192(a0)
+; RV64ZVE32F-NEXT: sd s10, 184(a0)
+; RV64ZVE32F-NEXT: sd s9, 176(a0)
+; RV64ZVE32F-NEXT: sd s8, 168(a0)
+; RV64ZVE32F-NEXT: sd s7, 160(a0)
+; RV64ZVE32F-NEXT: sd s6, 152(a0)
+; RV64ZVE32F-NEXT: sd s5, 144(a0)
+; RV64ZVE32F-NEXT: sd s4, 136(a0)
+; RV64ZVE32F-NEXT: sd s3, 128(a0)
+; RV64ZVE32F-NEXT: sd s2, 120(a0)
+; RV64ZVE32F-NEXT: sd s1, 112(a0)
+; RV64ZVE32F-NEXT: sd s0, 104(a0)
+; RV64ZVE32F-NEXT: sd t6, 96(a0)
+; RV64ZVE32F-NEXT: sd t5, 88(a0)
+; RV64ZVE32F-NEXT: sd t4, 80(a0)
+; RV64ZVE32F-NEXT: sd t3, 72(a0)
+; RV64ZVE32F-NEXT: sd t2, 64(a0)
+; RV64ZVE32F-NEXT: sd t1, 56(a0)
+; RV64ZVE32F-NEXT: sd t0, 48(a0)
+; RV64ZVE32F-NEXT: sd a7, 40(a0)
+; RV64ZVE32F-NEXT: ld a1, 0(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: sd a1, 32(a0)
+; RV64ZVE32F-NEXT: ld a1, 8(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: sd a1, 24(a0)
+; RV64ZVE32F-NEXT: ld a1, 16(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: sd a1, 16(a0)
+; RV64ZVE32F-NEXT: ld a1, 24(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: sd a1, 8(a0)
+; RV64ZVE32F-NEXT: ld a1, 32(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: sd a1, 0(a0)
+; RV64ZVE32F-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s0, 128(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s1, 120(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s2, 112(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s3, 104(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s4, 96(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s5, 88(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s6, 80(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s7, 72(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s8, 64(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s9, 56(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s10, 48(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: ld s11, 40(sp) # 8-byte Folded Reload
+; RV64ZVE32F-NEXT: addi sp, sp, 144
+; RV64ZVE32F-NEXT: ret
+ %ptrs = getelementptr inbounds i64, ptr %base, <32 x i64> <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38, i64 40, i64 42, i64 44, i64 46, i64 48, i64 50, i64 52, i64 54, i64 56, i64 58, i64 60, i64 62>
+ %x = call <32 x i64> @llvm.masked.gather.v32i64.v32p0(<32 x ptr> %ptrs, i32 8, <32 x i1> shufflevector(<32 x i1> insertelement(<32 x i1> poison, i1 true, i32 0), <32 x i1> poison, <32 x i32> zeroinitializer), <32 x i64> poison)
+ ret <32 x i64> %x
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
index cd47720e3821..ead41b0717ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
@@ -3,30 +3,65 @@
; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
-; CHECK-LABEL: vselect_vv_v6i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: lbu a2, 0(a2)
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: srli a1, a2, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a2
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 2
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 3
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 4
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a2, a2, 5
-; CHECK-NEXT: vslide1down.vx v10, v10, a2
-; CHECK-NEXT: vslidedown.vi v10, v10, 2
-; CHECK-NEXT: vand.vi v10, v10, 1
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu
-; CHECK-NEXT: vle32.v v8, (a0), v0.t
-; CHECK-NEXT: vse32.v v8, (a3)
-; CHECK-NEXT: ret
+; RV32-LABEL: vselect_vv_v6i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: lbu a2, 0(a2)
+; RV32-NEXT: vle32.v v8, (a1)
+; RV32-NEXT: slli a1, a2, 30
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: andi a4, a2, 1
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a4
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 29
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 28
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 27
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: srli a2, a2, 5
+; RV32-NEXT: vslide1down.vx v10, v10, a2
+; RV32-NEXT: vslidedown.vi v10, v10, 2
+; RV32-NEXT: vand.vi v10, v10, 1
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu
+; RV32-NEXT: vle32.v v8, (a0), v0.t
+; RV32-NEXT: vse32.v v8, (a3)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vselect_vv_v6i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: lbu a2, 0(a2)
+; RV64-NEXT: vle32.v v8, (a1)
+; RV64-NEXT: slli a1, a2, 62
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: andi a4, a2, 1
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a4
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 61
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 60
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 59
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: srli a2, a2, 5
+; RV64-NEXT: vslide1down.vx v10, v10, a2
+; RV64-NEXT: vslidedown.vi v10, v10, 2
+; RV64-NEXT: vand.vi v10, v10, 1
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu
+; RV64-NEXT: vle32.v v8, (a0), v0.t
+; RV64-NEXT: vse32.v v8, (a3)
+; RV64-NEXT: ret
%va = load <6 x i32>, ptr %a
%vb = load <6 x i32>, ptr %b
%vcc = load <6 x i1>, ptr %cc
@@ -36,31 +71,67 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
}
define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
-; CHECK-LABEL: vselect_vx_v6i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: lbu a2, 0(a2)
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: srli a1, a2, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a2
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 2
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 3
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 4
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a2, a2, 5
-; CHECK-NEXT: vslide1down.vx v10, v10, a2
-; CHECK-NEXT: vslidedown.vi v10, v10, 2
-; CHECK-NEXT: vand.vi v10, v10, 1
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a3)
-; CHECK-NEXT: ret
+; RV32-LABEL: vselect_vx_v6i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: lbu a2, 0(a2)
+; RV32-NEXT: vle32.v v8, (a1)
+; RV32-NEXT: slli a1, a2, 30
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: andi a4, a2, 1
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a4
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 29
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 28
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 27
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: srli a2, a2, 5
+; RV32-NEXT: vslide1down.vx v10, v10, a2
+; RV32-NEXT: vslidedown.vi v10, v10, 2
+; RV32-NEXT: vand.vi v10, v10, 1
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: vse32.v v8, (a3)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vselect_vx_v6i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: lbu a2, 0(a2)
+; RV64-NEXT: vle32.v v8, (a1)
+; RV64-NEXT: slli a1, a2, 62
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: andi a4, a2, 1
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a4
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 61
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 60
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 59
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: srli a2, a2, 5
+; RV64-NEXT: vslide1down.vx v10, v10, a2
+; RV64-NEXT: vslidedown.vi v10, v10, 2
+; RV64-NEXT: vand.vi v10, v10, 1
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vmerge.vxm v8, v8, a0, v0
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: vse32.v v8, (a3)
+; RV64-NEXT: ret
%vb = load <6 x i32>, ptr %b
%ahead = insertelement <6 x i32> poison, i32 %a, i32 0
%va = shufflevector <6 x i32> %ahead, <6 x i32> poison, <6 x i32> zeroinitializer
@@ -71,31 +142,67 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
}
define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
-; CHECK-LABEL: vselect_vi_v6i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: lbu a1, 0(a1)
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: srli a0, a1, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a1
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 2
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 3
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 4
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a1, a1, 5
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: vslidedown.vi v10, v10, 2
-; CHECK-NEXT: vand.vi v10, v10, 1
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a2)
-; CHECK-NEXT: ret
+; RV32-LABEL: vselect_vi_v6i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: lbu a1, 0(a1)
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: andi a3, a1, 1
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a3
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 29
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 28
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 27
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: srli a1, a1, 5
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: vslidedown.vi v10, v10, 2
+; RV32-NEXT: vand.vi v10, v10, 1
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vmerge.vim v8, v8, -1, v0
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: vse32.v v8, (a2)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vselect_vi_v6i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: lbu a1, 0(a1)
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: slli a0, a1, 62
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: andi a3, a1, 1
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a3
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 61
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 60
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 59
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: srli a1, a1, 5
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: vslidedown.vi v10, v10, 2
+; RV64-NEXT: vand.vi v10, v10, 1
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vmerge.vim v8, v8, -1, v0
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: vse32.v v8, (a2)
+; RV64-NEXT: ret
%vb = load <6 x i32>, ptr %b
%a = insertelement <6 x i32> poison, i32 -1, i32 0
%va = shufflevector <6 x i32> %a, <6 x i32> poison, <6 x i32> zeroinitializer
@@ -107,30 +214,65 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
-; CHECK-LABEL: vselect_vv_v6f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: lbu a2, 0(a2)
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: srli a1, a2, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a2
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 2
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 3
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a1, a2, 4
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: srli a2, a2, 5
-; CHECK-NEXT: vslide1down.vx v10, v10, a2
-; CHECK-NEXT: vslidedown.vi v10, v10, 2
-; CHECK-NEXT: vand.vi v10, v10, 1
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu
-; CHECK-NEXT: vle32.v v8, (a0), v0.t
-; CHECK-NEXT: vse32.v v8, (a3)
-; CHECK-NEXT: ret
+; RV32-LABEL: vselect_vv_v6f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: lbu a2, 0(a2)
+; RV32-NEXT: vle32.v v8, (a1)
+; RV32-NEXT: slli a1, a2, 30
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: andi a4, a2, 1
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a4
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 29
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 28
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: slli a1, a2, 27
+; RV32-NEXT: srli a1, a1, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: srli a2, a2, 5
+; RV32-NEXT: vslide1down.vx v10, v10, a2
+; RV32-NEXT: vslidedown.vi v10, v10, 2
+; RV32-NEXT: vand.vi v10, v10, 1
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu
+; RV32-NEXT: vle32.v v8, (a0), v0.t
+; RV32-NEXT: vse32.v v8, (a3)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vselect_vv_v6f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: lbu a2, 0(a2)
+; RV64-NEXT: vle32.v v8, (a1)
+; RV64-NEXT: slli a1, a2, 62
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: andi a4, a2, 1
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a4
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 61
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 60
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: slli a1, a2, 59
+; RV64-NEXT: srli a1, a1, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: srli a2, a2, 5
+; RV64-NEXT: vslide1down.vx v10, v10, a2
+; RV64-NEXT: vslidedown.vi v10, v10, 2
+; RV64-NEXT: vand.vi v10, v10, 1
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu
+; RV64-NEXT: vle32.v v8, (a0), v0.t
+; RV64-NEXT: vse32.v v8, (a3)
+; RV64-NEXT: ret
%va = load <6 x float>, ptr %a
%vb = load <6 x float>, ptr %b
%vcc = load <6 x i1>, ptr %cc
@@ -140,31 +282,67 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
}
define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
-; CHECK-LABEL: vselect_vx_v6f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: lbu a1, 0(a1)
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: srli a0, a1, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a1
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 2
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 3
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 4
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a1, a1, 5
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: vslidedown.vi v10, v10, 2
-; CHECK-NEXT: vand.vi v10, v10, 1
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a2)
-; CHECK-NEXT: ret
+; RV32-LABEL: vselect_vx_v6f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: lbu a1, 0(a1)
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: andi a3, a1, 1
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a3
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 29
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 28
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 27
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: srli a1, a1, 5
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: vslidedown.vi v10, v10, 2
+; RV32-NEXT: vand.vi v10, v10, 1
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: vse32.v v8, (a2)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vselect_vx_v6f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: lbu a1, 0(a1)
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: slli a0, a1, 62
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: andi a3, a1, 1
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a3
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 61
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 60
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 59
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: srli a1, a1, 5
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: vslidedown.vi v10, v10, 2
+; RV64-NEXT: vand.vi v10, v10, 1
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: vse32.v v8, (a2)
+; RV64-NEXT: ret
%vb = load <6 x float>, ptr %b
%ahead = insertelement <6 x float> poison, float %a, i32 0
%va = shufflevector <6 x float> %ahead, <6 x float> poison, <6 x i32> zeroinitializer
@@ -175,31 +353,67 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
}
define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
-; CHECK-LABEL: vselect_vfpzero_v6f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: lbu a1, 0(a1)
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: srli a0, a1, 1
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a1
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 2
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 3
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a0, a1, 4
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: srli a1, a1, 5
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: vslidedown.vi v10, v10, 2
-; CHECK-NEXT: vand.vi v10, v10, 1
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v8, (a2)
-; CHECK-NEXT: ret
+; RV32-LABEL: vselect_vfpzero_v6f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: lbu a1, 0(a1)
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: andi a3, a1, 1
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v10, a3
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 29
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 28
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: slli a0, a1, 27
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: srli a1, a1, 5
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: vslidedown.vi v10, v10, 2
+; RV32-NEXT: vand.vi v10, v10, 1
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vmerge.vim v8, v8, 0, v0
+; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV32-NEXT: vse32.v v8, (a2)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vselect_vfpzero_v6f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: lbu a1, 0(a1)
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: slli a0, a1, 62
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: andi a3, a1, 1
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v10, a3
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 61
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 60
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: slli a0, a1, 59
+; RV64-NEXT: srli a0, a0, 63
+; RV64-NEXT: vslide1down.vx v10, v10, a0
+; RV64-NEXT: srli a1, a1, 5
+; RV64-NEXT: vslide1down.vx v10, v10, a1
+; RV64-NEXT: vslidedown.vi v10, v10, 2
+; RV64-NEXT: vand.vi v10, v10, 1
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT: vmerge.vim v8, v8, 0, v0
+; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; RV64-NEXT: vse32.v v8, (a2)
+; RV64-NEXT: ret
%vb = load <6 x float>, ptr %b
%a = insertelement <6 x float> poison, float 0.0, i32 0
%va = shufflevector <6 x float> %a, <6 x float> poison, <6 x i32> zeroinitializer
@@ -497,6 +711,3 @@ define <64 x i1> @vselect_v64i1(<64 x i1> %a, <64 x i1> %b, <64 x i1> %cc) {
%v = select <64 x i1> %cc, <64 x i1> %a, <64 x i1> %b
ret <64 x i1> %v
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV32: {{.*}}
-; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/shadowcallstack.ll b/llvm/test/CodeGen/RISCV/shadowcallstack.ll
index b41b87aaf4d0..a320b44d2c6a 100644
--- a/llvm/test/CodeGen/RISCV/shadowcallstack.ll
+++ b/llvm/test/CodeGen/RISCV/shadowcallstack.ll
@@ -3,6 +3,14 @@
; RUN: | FileCheck %s --check-prefix=RV32
; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefix=RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfiss < %s \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=RV32-ZICFISS
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfiss < %s \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=RV64-ZICFISS
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfiss,forced-sw-shadow-stack \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfiss,forced-sw-shadow-stack \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
define void @f1() shadowcallstack {
; RV32-LABEL: f1:
@@ -12,6 +20,14 @@ define void @f1() shadowcallstack {
; RV64-LABEL: f1:
; RV64: # %bb.0:
; RV64-NEXT: ret
+;
+; RV32-ZICFISS-LABEL: f1:
+; RV32-ZICFISS: # %bb.0:
+; RV32-ZICFISS-NEXT: ret
+;
+; RV64-ZICFISS-LABEL: f1:
+; RV64-ZICFISS: # %bb.0:
+; RV64-ZICFISS-NEXT: ret
ret void
}
@@ -25,6 +41,14 @@ define void @f2() shadowcallstack {
; RV64-LABEL: f2:
; RV64: # %bb.0:
; RV64-NEXT: tail foo
+;
+; RV32-ZICFISS-LABEL: f2:
+; RV32-ZICFISS: # %bb.0:
+; RV32-ZICFISS-NEXT: tail foo
+;
+; RV64-ZICFISS-LABEL: f2:
+; RV64-ZICFISS: # %bb.0:
+; RV64-ZICFISS-NEXT: tail foo
tail call void @foo()
ret void
}
@@ -65,6 +89,32 @@ define i32 @f3() shadowcallstack {
; RV64-NEXT: addi gp, gp, -8
; RV64-NEXT: .cfi_restore gp
; RV64-NEXT: ret
+;
+; RV32-ZICFISS-LABEL: f3:
+; RV32-ZICFISS: # %bb.0:
+; RV32-ZICFISS-NEXT: sspush ra
+; RV32-ZICFISS-NEXT: addi sp, sp, -16
+; RV32-ZICFISS-NEXT: .cfi_def_cfa_offset 16
+; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-ZICFISS-NEXT: .cfi_offset ra, -4
+; RV32-ZICFISS-NEXT: call bar
+; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-ZICFISS-NEXT: addi sp, sp, 16
+; RV32-ZICFISS-NEXT: sspopchk ra
+; RV32-ZICFISS-NEXT: ret
+;
+; RV64-ZICFISS-LABEL: f3:
+; RV64-ZICFISS: # %bb.0:
+; RV64-ZICFISS-NEXT: sspush ra
+; RV64-ZICFISS-NEXT: addi sp, sp, -16
+; RV64-ZICFISS-NEXT: .cfi_def_cfa_offset 16
+; RV64-ZICFISS-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-ZICFISS-NEXT: .cfi_offset ra, -8
+; RV64-ZICFISS-NEXT: call bar
+; RV64-ZICFISS-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-ZICFISS-NEXT: addi sp, sp, 16
+; RV64-ZICFISS-NEXT: sspopchk ra
+; RV64-ZICFISS-NEXT: ret
%res = call i32 @bar()
%res1 = add i32 %res, 1
ret i32 %res
@@ -140,6 +190,68 @@ define i32 @f4() shadowcallstack {
; RV64-NEXT: addi gp, gp, -8
; RV64-NEXT: .cfi_restore gp
; RV64-NEXT: ret
+;
+; RV32-ZICFISS-LABEL: f4:
+; RV32-ZICFISS: # %bb.0:
+; RV32-ZICFISS-NEXT: sspush ra
+; RV32-ZICFISS-NEXT: addi sp, sp, -16
+; RV32-ZICFISS-NEXT: .cfi_def_cfa_offset 16
+; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-ZICFISS-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32-ZICFISS-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32-ZICFISS-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
+; RV32-ZICFISS-NEXT: .cfi_offset ra, -4
+; RV32-ZICFISS-NEXT: .cfi_offset s0, -8
+; RV32-ZICFISS-NEXT: .cfi_offset s1, -12
+; RV32-ZICFISS-NEXT: .cfi_offset s2, -16
+; RV32-ZICFISS-NEXT: call bar
+; RV32-ZICFISS-NEXT: mv s0, a0
+; RV32-ZICFISS-NEXT: call bar
+; RV32-ZICFISS-NEXT: mv s1, a0
+; RV32-ZICFISS-NEXT: call bar
+; RV32-ZICFISS-NEXT: mv s2, a0
+; RV32-ZICFISS-NEXT: call bar
+; RV32-ZICFISS-NEXT: add s0, s0, s1
+; RV32-ZICFISS-NEXT: add a0, s2, a0
+; RV32-ZICFISS-NEXT: add a0, s0, a0
+; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-ZICFISS-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32-ZICFISS-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32-ZICFISS-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
+; RV32-ZICFISS-NEXT: addi sp, sp, 16
+; RV32-ZICFISS-NEXT: sspopchk ra
+; RV32-ZICFISS-NEXT: ret
+;
+; RV64-ZICFISS-LABEL: f4:
+; RV64-ZICFISS: # %bb.0:
+; RV64-ZICFISS-NEXT: sspush ra
+; RV64-ZICFISS-NEXT: addi sp, sp, -32
+; RV64-ZICFISS-NEXT: .cfi_def_cfa_offset 32
+; RV64-ZICFISS-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-ZICFISS-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-ZICFISS-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64-ZICFISS-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
+; RV64-ZICFISS-NEXT: .cfi_offset ra, -8
+; RV64-ZICFISS-NEXT: .cfi_offset s0, -16
+; RV64-ZICFISS-NEXT: .cfi_offset s1, -24
+; RV64-ZICFISS-NEXT: .cfi_offset s2, -32
+; RV64-ZICFISS-NEXT: call bar
+; RV64-ZICFISS-NEXT: mv s0, a0
+; RV64-ZICFISS-NEXT: call bar
+; RV64-ZICFISS-NEXT: mv s1, a0
+; RV64-ZICFISS-NEXT: call bar
+; RV64-ZICFISS-NEXT: mv s2, a0
+; RV64-ZICFISS-NEXT: call bar
+; RV64-ZICFISS-NEXT: add s0, s0, s1
+; RV64-ZICFISS-NEXT: add a0, s2, a0
+; RV64-ZICFISS-NEXT: addw a0, s0, a0
+; RV64-ZICFISS-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-ZICFISS-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-ZICFISS-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64-ZICFISS-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
+; RV64-ZICFISS-NEXT: addi sp, sp, 32
+; RV64-ZICFISS-NEXT: sspopchk ra
+; RV64-ZICFISS-NEXT: ret
%res1 = call i32 @bar()
%res2 = call i32 @bar()
%res3 = call i32 @bar()
@@ -176,6 +288,28 @@ define i32 @f5() shadowcallstack nounwind {
; RV64-NEXT: ld ra, -8(gp)
; RV64-NEXT: addi gp, gp, -8
; RV64-NEXT: ret
+;
+; RV32-ZICFISS-LABEL: f5:
+; RV32-ZICFISS: # %bb.0:
+; RV32-ZICFISS-NEXT: sspush ra
+; RV32-ZICFISS-NEXT: addi sp, sp, -16
+; RV32-ZICFISS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-ZICFISS-NEXT: call bar
+; RV32-ZICFISS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-ZICFISS-NEXT: addi sp, sp, 16
+; RV32-ZICFISS-NEXT: sspopchk ra
+; RV32-ZICFISS-NEXT: ret
+;
+; RV64-ZICFISS-LABEL: f5:
+; RV64-ZICFISS: # %bb.0:
+; RV64-ZICFISS-NEXT: sspush ra
+; RV64-ZICFISS-NEXT: addi sp, sp, -16
+; RV64-ZICFISS-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-ZICFISS-NEXT: call bar
+; RV64-ZICFISS-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-ZICFISS-NEXT: addi sp, sp, 16
+; RV64-ZICFISS-NEXT: sspopchk ra
+; RV64-ZICFISS-NEXT: ret
%res = call i32 @bar()
%res1 = add i32 %res, 1
ret i32 %res
diff --git a/llvm/test/CodeGen/RISCV/signbit-test.ll b/llvm/test/CodeGen/RISCV/signbit-test.ll
index 69a9026d9af9..4e10fae06d88 100644
--- a/llvm/test/CodeGen/RISCV/signbit-test.ll
+++ b/llvm/test/CodeGen/RISCV/signbit-test.ll
@@ -303,7 +303,10 @@ define i16 @test_clear_mask_i16_i8(i16 %x) nounwind {
; RV32-NEXT: bnez a1, .LBB10_2
; RV32-NEXT: # %bb.1: # %t
; RV32-NEXT: li a0, 42
-; RV32-NEXT: .LBB10_2: # %f
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB10_2:
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: test_clear_mask_i16_i8:
@@ -312,7 +315,10 @@ define i16 @test_clear_mask_i16_i8(i16 %x) nounwind {
; RV64-NEXT: bnez a1, .LBB10_2
; RV64-NEXT: # %bb.1: # %t
; RV64-NEXT: li a0, 42
-; RV64-NEXT: .LBB10_2: # %f
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB10_2:
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
entry:
%a = and i16 %x, 128
@@ -332,7 +338,10 @@ define i16 @test_set_mask_i16_i8(i16 %x) nounwind {
; RV32-NEXT: beqz a1, .LBB11_2
; RV32-NEXT: # %bb.1: # %t
; RV32-NEXT: li a0, 42
-; RV32-NEXT: .LBB11_2: # %f
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB11_2:
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: test_set_mask_i16_i8:
@@ -341,7 +350,10 @@ define i16 @test_set_mask_i16_i8(i16 %x) nounwind {
; RV64-NEXT: beqz a1, .LBB11_2
; RV64-NEXT: # %bb.1: # %t
; RV64-NEXT: li a0, 42
-; RV64-NEXT: .LBB11_2: # %f
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB11_2:
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
entry:
%a = and i16 %x, 128
@@ -361,7 +373,10 @@ define i16 @test_set_mask_i16_i7(i16 %x) nounwind {
; RV32-NEXT: beqz a1, .LBB12_2
; RV32-NEXT: # %bb.1: # %t
; RV32-NEXT: li a0, 42
-; RV32-NEXT: .LBB12_2: # %f
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB12_2:
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: test_set_mask_i16_i7:
@@ -370,7 +385,10 @@ define i16 @test_set_mask_i16_i7(i16 %x) nounwind {
; RV64-NEXT: beqz a1, .LBB12_2
; RV64-NEXT: # %bb.1: # %t
; RV64-NEXT: li a0, 42
-; RV64-NEXT: .LBB12_2: # %f
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB12_2:
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
entry:
%a = and i16 %x, 64
diff --git a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll
index 0860853ae9c0..de36bcdb9106 100644
--- a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll
@@ -254,23 +254,43 @@ define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind {
; ---------------------------------------------------------------------------- ;
define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
-; RV32-LABEL: add_ugecmp_i16_i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, a0, -128
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 24
-; RV32-NEXT: sltiu a0, a0, 255
-; RV32-NEXT: xori a0, a0, 1
-; RV32-NEXT: ret
+; RV32I-LABEL: add_ugecmp_i16_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: srli a0, a0, 16
+; RV32I-NEXT: addi a0, a0, -128
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sltiu a0, a0, 255
+; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: ret
;
-; RV64-LABEL: add_ugecmp_i16_i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, a0, -128
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 56
-; RV64-NEXT: sltiu a0, a0, 255
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
+; RV64I-LABEL: add_ugecmp_i16_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: addi a0, a0, -128
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sltiu a0, a0, 255
+; RV64I-NEXT: xori a0, a0, 1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: add_ugecmp_i16_i8:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: zext.h a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -128
+; RV32ZBB-NEXT: srli a0, a0, 8
+; RV32ZBB-NEXT: sltiu a0, a0, 255
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: add_ugecmp_i16_i8:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: zext.h a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -128
+; RV64ZBB-NEXT: srli a0, a0, 8
+; RV64ZBB-NEXT: sltiu a0, a0, 255
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: ret
%tmp0 = add i16 %x, -128 ; ~0U << (8-1)
%tmp1 = icmp uge i16 %tmp0, -256 ; ~0U << 8
ret i1 %tmp1
@@ -471,23 +491,43 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
; Slightly more canonical variant
define i1 @add_ugtcmp_i16_i8(i16 %x) nounwind {
-; RV32-LABEL: add_ugtcmp_i16_i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a0, a0, -128
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 24
-; RV32-NEXT: sltiu a0, a0, 255
-; RV32-NEXT: xori a0, a0, 1
-; RV32-NEXT: ret
+; RV32I-LABEL: add_ugtcmp_i16_i8:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: srli a0, a0, 16
+; RV32I-NEXT: addi a0, a0, -128
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sltiu a0, a0, 255
+; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: ret
;
-; RV64-LABEL: add_ugtcmp_i16_i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a0, a0, -128
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 56
-; RV64-NEXT: sltiu a0, a0, 255
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
+; RV64I-LABEL: add_ugtcmp_i16_i8:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: addi a0, a0, -128
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sltiu a0, a0, 255
+; RV64I-NEXT: xori a0, a0, 1
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: add_ugtcmp_i16_i8:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: zext.h a0, a0
+; RV32ZBB-NEXT: addi a0, a0, -128
+; RV32ZBB-NEXT: srli a0, a0, 8
+; RV32ZBB-NEXT: sltiu a0, a0, 255
+; RV32ZBB-NEXT: xori a0, a0, 1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: add_ugtcmp_i16_i8:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: zext.h a0, a0
+; RV64ZBB-NEXT: addi a0, a0, -128
+; RV64ZBB-NEXT: srli a0, a0, 8
+; RV64ZBB-NEXT: sltiu a0, a0, 255
+; RV64ZBB-NEXT: xori a0, a0, 1
+; RV64ZBB-NEXT: ret
%tmp0 = add i16 %x, -128 ; ~0U << (8-1)
%tmp1 = icmp ugt i16 %tmp0, -257 ; ~0U << 8 - 1
ret i1 %tmp1
diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
new file mode 100644
index 000000000000..3740dc675949
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
@@ -0,0 +1,387 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m %s -o - | FileCheck %s
+
+define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: overflow_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: ori a0, a0, 1
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a1, a0, 48
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: bltu a2, a1, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: ret
+ %add = add i16 %b, %a
+ %or = or i16 %add, 1
+ %cmp = icmp ugt i16 %or, 1024
+ %res = select i1 %cmp, i16 2, i16 5
+ ret i16 %res
+}
+
+define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: overflow_sub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subw a0, a0, a1
+; CHECK-NEXT: ori a0, a0, 1
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a1, a0, 48
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: bltu a2, a1, .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: ret
+ %add = sub i16 %a, %b
+ %or = or i16 %add, 1
+ %cmp = icmp ugt i16 %or, 1024
+ %res = select i1 %cmp, i16 2, i16 5
+ ret i16 %res
+}
+
+define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: overflow_mul:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mul a0, a1, a0
+; CHECK-NEXT: ori a0, a0, 1
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a1, a0, 48
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: bltu a2, a1, .LBB2_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: ret
+ %add = mul i16 %b, %a
+ %or = or i16 %add, 1
+ %cmp = icmp ugt i16 %or, 1024
+ %res = select i1 %cmp, i16 2, i16 5
+ ret i16 %res
+}
+
+define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: overflow_shl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sll a0, a0, a1
+; CHECK-NEXT: ori a0, a0, 1
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a1, a0, 48
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: bltu a2, a1, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: .LBB3_2:
+; CHECK-NEXT: ret
+ %add = shl i16 %a, %b
+ %or = or i16 %add, 1
+ %cmp = icmp ugt i16 %or, 1024
+ %res = select i1 %cmp, i16 2, i16 5
+ ret i16 %res
+}
+
+define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) {
+; CHECK-LABEL: overflow_add_no_consts:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: andi a1, a0, 255
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: bltu a2, a1, .LBB4_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: ret
+ %add = add i8 %b, %a
+ %cmp = icmp ugt i8 %add, %limit
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: overflow_add_const_limit:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: andi a1, a0, 255
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: bltu a2, a1, .LBB5_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: ret
+ %add = add i8 %b, %a
+ %cmp = icmp ugt i8 %add, -128
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
+; CHECK-LABEL: overflow_add_positive_const_limit:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a1, a0, 56
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: blt a1, a2, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: ret
+ %cmp = icmp slt i8 %a, -1
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @unsafe_add_underflow(i8 zeroext %a) {
+; CHECK-LABEL: unsafe_add_underflow:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: beq a1, a2, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: ret
+ %cmp = icmp eq i8 %a, 1
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @safe_add_underflow(i8 zeroext %a) {
+; CHECK-LABEL: safe_add_underflow:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: beqz a1, .LBB8_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB8_2:
+; CHECK-NEXT: ret
+ %cmp = icmp eq i8 %a, 0
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @safe_add_underflow_neg(i8 zeroext %a) {
+; CHECK-LABEL: safe_add_underflow_neg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -2
+; CHECK-NEXT: li a2, 251
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: bltu a1, a2, .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: ret
+ %add = add i8 %a, -2
+ %cmp = icmp ult i8 %add, -5
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
+; CHECK-LABEL: overflow_sub_negative_const_limit:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 56
+; CHECK-NEXT: srai a1, a0, 56
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: blt a1, a2, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: ret
+ %cmp = icmp slt i8 %a, -1
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+; This is valid so long as the icmp immediate is sext.
+define i32 @sext_sub_underflow(i8 zeroext %a) {
+; CHECK-LABEL: sext_sub_underflow:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -6
+; CHECK-NEXT: li a2, -6
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: bltu a2, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: ret
+ %sub = add i8 %a, -6
+ %cmp = icmp ugt i8 %sub, -6
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @safe_sub_underflow(i8 zeroext %a) {
+; CHECK-LABEL: safe_sub_underflow:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: beqz a1, .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: ret
+ %cmp.not = icmp eq i8 %a, 0
+ %res = select i1 %cmp.not, i32 16, i32 8
+ ret i32 %res
+}
+
+define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
+; CHECK-LABEL: safe_sub_underflow_neg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -4
+; CHECK-NEXT: li a2, 250
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: bltu a2, a1, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB13_2:
+; CHECK-NEXT: ret
+ %sub = add i8 %a, -4
+ %cmp = icmp ugt i8 %sub, -6
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+; This is valid so long as the icmp immediate is sext.
+define i32 @sext_sub_underflow_neg(i8 zeroext %a) {
+; CHECK-LABEL: sext_sub_underflow_neg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, -4
+; CHECK-NEXT: li a2, -3
+; CHECK-NEXT: li a0, 8
+; CHECK-NEXT: bltu a1, a2, .LBB14_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: ret
+ %sub = add i8 %a, -4
+ %cmp = icmp ult i8 %sub, -3
+ %res = select i1 %cmp, i32 8, i32 16
+ ret i32 %res
+}
+
+define i32 @safe_sub_imm_var(ptr nocapture readonly %b) local_unnamed_addr #1 {
+; CHECK-LABEL: safe_sub_imm_var:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: ret
+entry:
+ ret i32 0
+}
+
+define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
+; CHECK-LABEL: safe_sub_var_imm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lbu a0, 0(a0)
+; CHECK-NEXT: addi a0, a0, 8
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: sltiu a0, a0, 253
+; CHECK-NEXT: xori a0, a0, 1
+; CHECK-NEXT: ret
+entry:
+ %0 = load i8, ptr %b, align 1
+ %sub = add nsw i8 %0, 8
+ %cmp = icmp ugt i8 %sub, -4
+ %conv4 = zext i1 %cmp to i32
+ ret i32 %conv4
+}
+
+define i32 @safe_add_imm_var(ptr nocapture readnone %b) {
+; CHECK-LABEL: safe_add_imm_var:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: ret
+entry:
+ ret i32 1
+}
+
+define i32 @safe_add_var_imm(ptr nocapture readnone %b) {
+; CHECK-LABEL: safe_add_var_imm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: ret
+entry:
+ ret i32 1
+}
+
+define i8 @convert_add_order(i8 zeroext %arg) {
+; CHECK-LABEL: convert_add_order:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori a1, a0, 1
+; CHECK-NEXT: li a2, 50
+; CHECK-NEXT: bltu a1, a2, .LBB19_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 255
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB19_2:
+; CHECK-NEXT: addi a1, a1, -40
+; CHECK-NEXT: sltiu a1, a1, 20
+; CHECK-NEXT: li a2, 2
+; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: ret
+ %shl = or i8 %arg, 1
+ %cmp.0 = icmp ult i8 %shl, 50
+ %sub = add nsw i8 %shl, -40
+ %cmp.1 = icmp ult i8 %sub, 20
+ %mask.sel.v = select i1 %cmp.1, i8 1, i8 2
+ %mask.sel = select i1 %cmp.0, i8 %mask.sel.v, i8 -1
+ %res = and i8 %mask.sel, %arg
+ ret i8 %res
+}
+
+define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) {
+; CHECK-LABEL: underflow_if_sub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a0
+; CHECK-NEXT: sgtz a2, a2
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: addi a0, a0, 245
+; CHECK-NEXT: bltu a0, a1, .LBB20_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 100
+; CHECK-NEXT: .LBB20_2:
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %arg, 0
+ %conv = zext i1 %cmp to i32
+ %and = and i32 %conv, %arg
+ %trunc = trunc i32 %and to i8
+ %conv1 = add nuw nsw i8 %trunc, -11
+ %cmp.1 = icmp ult i8 %conv1, %arg1
+ %res = select i1 %cmp.1, i8 %conv1, i8 100
+ ret i8 %res
+}
+
+define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) {
+; CHECK-LABEL: underflow_if_sub_signext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sext.w a2, a0
+; CHECK-NEXT: andi a1, a1, 255
+; CHECK-NEXT: sgtz a2, a2
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: addi a0, a0, 245
+; CHECK-NEXT: bltu a0, a1, .LBB21_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 100
+; CHECK-NEXT: .LBB21_2:
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %arg, 0
+ %conv = zext i1 %cmp to i32
+ %and = and i32 %conv, %arg
+ %trunc = trunc i32 %and to i8
+ %conv1 = add nuw nsw i8 %trunc, -11
+ %cmp.1 = icmp ult i8 %conv1, %arg1
+ %res = select i1 %cmp.1, i8 %conv1, i8 100
+ ret i8 %res
+}
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index 4544cbaf8521..c016e8f31636 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -521,33 +521,35 @@ define void @test_urem_vec(ptr %X) nounwind {
;
; RV32MV-LABEL: test_urem_vec:
; RV32MV: # %bb.0:
-; RV32MV-NEXT: lbu a1, 4(a0)
-; RV32MV-NEXT: lw a2, 0(a0)
-; RV32MV-NEXT: slli a1, a1, 10
-; RV32MV-NEXT: srli a3, a2, 22
-; RV32MV-NEXT: or a1, a3, a1
-; RV32MV-NEXT: srli a3, a2, 11
+; RV32MV-NEXT: lw a1, 0(a0)
+; RV32MV-NEXT: andi a2, a1, 2047
; RV32MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV32MV-NEXT: vmv.v.x v8, a2
+; RV32MV-NEXT: lbu a2, 4(a0)
+; RV32MV-NEXT: slli a3, a1, 10
+; RV32MV-NEXT: srli a3, a3, 21
; RV32MV-NEXT: vslide1down.vx v8, v8, a3
+; RV32MV-NEXT: slli a2, a2, 10
+; RV32MV-NEXT: srli a1, a1, 22
+; RV32MV-NEXT: or a1, a1, a2
+; RV32MV-NEXT: andi a1, a1, 2047
; RV32MV-NEXT: vslide1down.vx v8, v8, a1
+; RV32MV-NEXT: lui a1, %hi(.LCPI4_0)
+; RV32MV-NEXT: addi a1, a1, %lo(.LCPI4_0)
+; RV32MV-NEXT: vle16.v v9, (a1)
; RV32MV-NEXT: vslidedown.vi v8, v8, 1
-; RV32MV-NEXT: li a1, 2047
-; RV32MV-NEXT: lui a2, %hi(.LCPI4_0)
-; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_0)
-; RV32MV-NEXT: vle16.v v9, (a2)
-; RV32MV-NEXT: vand.vx v8, v8, a1
; RV32MV-NEXT: vid.v v10
; RV32MV-NEXT: vsub.vv v8, v8, v10
; RV32MV-NEXT: vmul.vv v8, v8, v9
; RV32MV-NEXT: vadd.vv v9, v8, v8
-; RV32MV-NEXT: lui a2, 41121
-; RV32MV-NEXT: addi a2, a2, -1527
+; RV32MV-NEXT: lui a1, 41121
+; RV32MV-NEXT: addi a1, a1, -1527
; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV32MV-NEXT: vmv.s.x v10, a2
+; RV32MV-NEXT: vmv.s.x v10, a1
; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; RV32MV-NEXT: vsext.vf2 v11, v10
; RV32MV-NEXT: vsll.vv v9, v9, v11
+; RV32MV-NEXT: li a1, 2047
; RV32MV-NEXT: vand.vx v8, v8, a1
; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV32MV-NEXT: vmv.v.i v10, 1
@@ -585,31 +587,30 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV64MV-NEXT: lwu a2, 0(a0)
; RV64MV-NEXT: slli a1, a1, 32
; RV64MV-NEXT: or a1, a2, a1
+; RV64MV-NEXT: slli a2, a1, 42
+; RV64MV-NEXT: srli a2, a2, 53
+; RV64MV-NEXT: andi a3, a1, 2047
; RV64MV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV64MV-NEXT: vmv.v.x v8, a1
-; RV64MV-NEXT: slli a1, a1, 24
-; RV64MV-NEXT: srli a1, a1, 24
-; RV64MV-NEXT: srli a2, a1, 11
+; RV64MV-NEXT: vmv.v.x v8, a3
; RV64MV-NEXT: vslide1down.vx v8, v8, a2
; RV64MV-NEXT: srli a1, a1, 22
; RV64MV-NEXT: vslide1down.vx v8, v8, a1
+; RV64MV-NEXT: lui a1, %hi(.LCPI4_0)
+; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_0)
+; RV64MV-NEXT: vle16.v v9, (a1)
; RV64MV-NEXT: vslidedown.vi v8, v8, 1
-; RV64MV-NEXT: li a1, 2047
-; RV64MV-NEXT: lui a2, %hi(.LCPI4_0)
-; RV64MV-NEXT: addi a2, a2, %lo(.LCPI4_0)
-; RV64MV-NEXT: vle16.v v9, (a2)
-; RV64MV-NEXT: vand.vx v8, v8, a1
; RV64MV-NEXT: vid.v v10
; RV64MV-NEXT: vsub.vv v8, v8, v10
; RV64MV-NEXT: vmul.vv v8, v8, v9
; RV64MV-NEXT: vadd.vv v9, v8, v8
-; RV64MV-NEXT: lui a2, 41121
-; RV64MV-NEXT: addi a2, a2, -1527
+; RV64MV-NEXT: lui a1, 41121
+; RV64MV-NEXT: addi a1, a1, -1527
; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64MV-NEXT: vmv.s.x v10, a2
+; RV64MV-NEXT: vmv.s.x v10, a1
; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; RV64MV-NEXT: vsext.vf2 v11, v10
; RV64MV-NEXT: vsll.vv v9, v9, v11
+; RV64MV-NEXT: li a1, 2047
; RV64MV-NEXT: vand.vx v8, v8, a1
; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64MV-NEXT: vmv.v.i v10, 1
diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll
index 14afbae3afbd..621f54946e4c 100644
--- a/llvm/test/CodeGen/RISCV/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/vararg.ll
@@ -111,28 +111,28 @@ define i32 @va1(ptr %fmt, ...) {
;
; ILP32E-FPELIM-LABEL: va1:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
-; ILP32E-FPELIM-NEXT: .cfi_def_cfa_offset 32
+; ILP32E-FPELIM-NEXT: addi sp, sp, -28
+; ILP32E-FPELIM-NEXT: .cfi_def_cfa_offset 28
; ILP32E-FPELIM-NEXT: mv a0, a1
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: sw a2, 16(sp)
-; ILP32E-FPELIM-NEXT: sw a1, 12(sp)
-; ILP32E-FPELIM-NEXT: addi a1, sp, 16
-; ILP32E-FPELIM-NEXT: sw a1, 4(sp)
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: sw a5, 24(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a2, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a1, 8(sp)
+; ILP32E-FPELIM-NEXT: addi a1, sp, 12
+; ILP32E-FPELIM-NEXT: sw a1, 0(sp)
+; ILP32E-FPELIM-NEXT: addi sp, sp, 28
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va1:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -48
-; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 48
-; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi sp, sp, -36
+; ILP32E-WITHFP-NEXT: .cfi_def_cfa_offset 36
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
; ILP32E-WITHFP-NEXT: .cfi_offset ra, -28
; ILP32E-WITHFP-NEXT: .cfi_offset s0, -32
-; ILP32E-WITHFP-NEXT: addi s0, sp, 24
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 24
; ILP32E-WITHFP-NEXT: mv a0, a1
; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
@@ -142,9 +142,9 @@ define i32 @va1(ptr %fmt, ...) {
; ILP32E-WITHFP-NEXT: sw a1, 4(s0)
; ILP32E-WITHFP-NEXT: addi a1, s0, 8
; ILP32E-WITHFP-NEXT: sw a1, -12(s0)
-; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 48
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 36
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va1:
@@ -191,28 +191,28 @@ define i32 @va1(ptr %fmt, ...) {
;
; LP64E-FPELIM-LABEL: va1:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
-; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 64
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a0, sp, 28
-; LP64E-FPELIM-NEXT: sd a0, 8(sp)
-; LP64E-FPELIM-NEXT: lw a0, 24(sp)
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
+; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a0, sp, 20
+; LP64E-FPELIM-NEXT: sd a0, 0(sp)
+; LP64E-FPELIM-NEXT: lw a0, 16(sp)
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va1:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: .cfi_def_cfa_offset 72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
; LP64E-WITHFP-NEXT: .cfi_offset ra, -56
; LP64E-WITHFP-NEXT: .cfi_offset s0, -64
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a0, s0, 12
@@ -222,9 +222,9 @@ define i32 @va1(ptr %fmt, ...) {
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
; LP64E-WITHFP-NEXT: sd a3, 24(s0)
; LP64E-WITHFP-NEXT: sd a2, 16(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -292,24 +292,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind {
;
; ILP32E-FPELIM-LABEL: va1_va_arg:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
+; ILP32E-FPELIM-NEXT: addi sp, sp, -28
; ILP32E-FPELIM-NEXT: mv a0, a1
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: sw a2, 16(sp)
-; ILP32E-FPELIM-NEXT: sw a1, 12(sp)
-; ILP32E-FPELIM-NEXT: addi a1, sp, 16
-; ILP32E-FPELIM-NEXT: sw a1, 4(sp)
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: sw a5, 24(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a2, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a1, 8(sp)
+; ILP32E-FPELIM-NEXT: addi a1, sp, 12
+; ILP32E-FPELIM-NEXT: sw a1, 0(sp)
+; ILP32E-FPELIM-NEXT: addi sp, sp, 28
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va1_va_arg:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -48
-; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: addi s0, sp, 24
+; ILP32E-WITHFP-NEXT: addi sp, sp, -36
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: mv a0, a1
; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
; ILP32E-WITHFP-NEXT: sw a4, 16(s0)
@@ -318,9 +318,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind {
; ILP32E-WITHFP-NEXT: sw a1, 4(s0)
; ILP32E-WITHFP-NEXT: addi a1, s0, 8
; ILP32E-WITHFP-NEXT: sw a1, -12(s0)
-; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 48
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 36
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va1_va_arg:
@@ -362,24 +362,24 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va1_va_arg:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 32
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 24
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va1_va_arg:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
@@ -388,9 +388,9 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 16
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -879,29 +879,29 @@ define i64 @va2(ptr %fmt, ...) nounwind {
;
; ILP32E-FPELIM-LABEL: va2:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: sw a2, 16(sp)
-; ILP32E-FPELIM-NEXT: sw a1, 12(sp)
-; ILP32E-FPELIM-NEXT: addi a0, sp, 12
-; ILP32E-FPELIM-NEXT: sw a0, 4(sp)
-; ILP32E-FPELIM-NEXT: addi a0, sp, 19
+; ILP32E-FPELIM-NEXT: addi sp, sp, -28
+; ILP32E-FPELIM-NEXT: sw a5, 24(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a2, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a1, 8(sp)
+; ILP32E-FPELIM-NEXT: addi a0, sp, 8
+; ILP32E-FPELIM-NEXT: sw a0, 0(sp)
+; ILP32E-FPELIM-NEXT: addi a0, sp, 15
; ILP32E-FPELIM-NEXT: andi a0, a0, -8
-; ILP32E-FPELIM-NEXT: addi a1, sp, 27
-; ILP32E-FPELIM-NEXT: sw a1, 4(sp)
+; ILP32E-FPELIM-NEXT: addi a1, sp, 23
+; ILP32E-FPELIM-NEXT: sw a1, 0(sp)
; ILP32E-FPELIM-NEXT: lw a1, 4(a0)
; ILP32E-FPELIM-NEXT: lw a0, 0(a0)
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: addi sp, sp, 28
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va2:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -48
-; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: addi s0, sp, 24
+; ILP32E-WITHFP-NEXT: addi sp, sp, -36
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
; ILP32E-WITHFP-NEXT: sw a4, 16(s0)
; ILP32E-WITHFP-NEXT: sw a3, 12(s0)
@@ -915,9 +915,9 @@ define i64 @va2(ptr %fmt, ...) nounwind {
; ILP32E-WITHFP-NEXT: sw a1, -12(s0)
; ILP32E-WITHFP-NEXT: lw a1, 4(a0)
; ILP32E-WITHFP-NEXT: lw a0, 0(a0)
-; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 48
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 36
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va2:
@@ -959,24 +959,24 @@ define i64 @va2(ptr %fmt, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va2:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 39
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 31
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va2:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
@@ -985,9 +985,9 @@ define i64 @va2(ptr %fmt, ...) nounwind {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 23
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -1076,29 +1076,29 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
;
; ILP32E-FPELIM-LABEL: va2_va_arg:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: sw a2, 16(sp)
-; ILP32E-FPELIM-NEXT: sw a1, 12(sp)
-; ILP32E-FPELIM-NEXT: addi a0, sp, 19
+; ILP32E-FPELIM-NEXT: addi sp, sp, -28
+; ILP32E-FPELIM-NEXT: sw a5, 24(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a2, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a1, 8(sp)
+; ILP32E-FPELIM-NEXT: addi a0, sp, 15
; ILP32E-FPELIM-NEXT: andi a1, a0, -8
; ILP32E-FPELIM-NEXT: addi a0, a1, 4
-; ILP32E-FPELIM-NEXT: sw a0, 4(sp)
+; ILP32E-FPELIM-NEXT: sw a0, 0(sp)
; ILP32E-FPELIM-NEXT: lw a0, 0(a1)
; ILP32E-FPELIM-NEXT: addi a2, a1, 8
-; ILP32E-FPELIM-NEXT: sw a2, 4(sp)
+; ILP32E-FPELIM-NEXT: sw a2, 0(sp)
; ILP32E-FPELIM-NEXT: lw a1, 4(a1)
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: addi sp, sp, 28
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va2_va_arg:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -48
-; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: addi s0, sp, 24
+; ILP32E-WITHFP-NEXT: addi sp, sp, -36
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
; ILP32E-WITHFP-NEXT: sw a4, 16(s0)
; ILP32E-WITHFP-NEXT: sw a3, 12(s0)
@@ -1112,9 +1112,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
; ILP32E-WITHFP-NEXT: addi a2, a1, 8
; ILP32E-WITHFP-NEXT: sw a2, -12(s0)
; ILP32E-WITHFP-NEXT: lw a1, 4(a1)
-; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 48
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 36
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va2_va_arg:
@@ -1156,24 +1156,24 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va2_va_arg:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
; LP64E-FPELIM-NEXT: mv a0, a1
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 32
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 24
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va2_va_arg:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: mv a0, a1
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
@@ -1182,9 +1182,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
; LP64E-WITHFP-NEXT: sd a1, 8(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 16
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -1389,31 +1389,31 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
;
; ILP32E-FPELIM-LABEL: va3:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: addi a0, sp, 20
-; ILP32E-FPELIM-NEXT: sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT: addi a0, sp, 27
+; ILP32E-FPELIM-NEXT: addi sp, sp, -20
+; ILP32E-FPELIM-NEXT: sw a5, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 8(sp)
+; ILP32E-FPELIM-NEXT: addi a0, sp, 8
+; ILP32E-FPELIM-NEXT: sw a0, 0(sp)
+; ILP32E-FPELIM-NEXT: addi a0, sp, 15
; ILP32E-FPELIM-NEXT: andi a0, a0, -8
-; ILP32E-FPELIM-NEXT: addi a3, sp, 35
-; ILP32E-FPELIM-NEXT: sw a3, 12(sp)
+; ILP32E-FPELIM-NEXT: addi a3, sp, 23
+; ILP32E-FPELIM-NEXT: sw a3, 0(sp)
; ILP32E-FPELIM-NEXT: lw a3, 4(a0)
; ILP32E-FPELIM-NEXT: lw a0, 0(a0)
; ILP32E-FPELIM-NEXT: add a2, a2, a3
; ILP32E-FPELIM-NEXT: add a0, a1, a0
; ILP32E-FPELIM-NEXT: sltu a1, a0, a1
; ILP32E-FPELIM-NEXT: add a1, a2, a1
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: addi sp, sp, 20
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va3:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -32
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi sp, sp, -28
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: sw a5, 12(s0)
; ILP32E-WITHFP-NEXT: sw a4, 8(s0)
; ILP32E-WITHFP-NEXT: sw a3, 4(s0)
@@ -1429,9 +1429,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
; ILP32E-WITHFP-NEXT: add a0, a1, a0
; ILP32E-WITHFP-NEXT: sltu a1, a0, a1
; ILP32E-WITHFP-NEXT: add a1, a2, a1
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 32
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 28
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va3:
@@ -1471,23 +1471,23 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va3:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -48
-; LP64E-FPELIM-NEXT: sd a5, 40(sp)
-; LP64E-FPELIM-NEXT: sd a4, 32(sp)
-; LP64E-FPELIM-NEXT: sd a3, 24(sp)
-; LP64E-FPELIM-NEXT: sd a2, 16(sp)
-; LP64E-FPELIM-NEXT: addi a3, sp, 31
+; LP64E-FPELIM-NEXT: addi sp, sp, -40
+; LP64E-FPELIM-NEXT: sd a5, 32(sp)
+; LP64E-FPELIM-NEXT: sd a4, 24(sp)
+; LP64E-FPELIM-NEXT: sd a3, 16(sp)
+; LP64E-FPELIM-NEXT: sd a2, 8(sp)
+; LP64E-FPELIM-NEXT: addi a3, sp, 23
; LP64E-FPELIM-NEXT: add a0, a1, a2
-; LP64E-FPELIM-NEXT: sd a3, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 48
+; LP64E-FPELIM-NEXT: sd a3, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 40
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va3:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -64
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -56
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: sd a5, 24(s0)
; LP64E-WITHFP-NEXT: sd a4, 16(s0)
; LP64E-WITHFP-NEXT: sd a3, 8(s0)
@@ -1495,9 +1495,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
; LP64E-WITHFP-NEXT: addi a3, s0, 15
; LP64E-WITHFP-NEXT: add a0, a1, a2
; LP64E-WITHFP-NEXT: sd a3, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 64
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 56
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -1593,31 +1593,31 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
;
; ILP32E-FPELIM-LABEL: va3_va_arg:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: addi a0, sp, 27
+; ILP32E-FPELIM-NEXT: addi sp, sp, -20
+; ILP32E-FPELIM-NEXT: sw a5, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 8(sp)
+; ILP32E-FPELIM-NEXT: addi a0, sp, 15
; ILP32E-FPELIM-NEXT: andi a0, a0, -8
; ILP32E-FPELIM-NEXT: addi a3, a0, 4
-; ILP32E-FPELIM-NEXT: sw a3, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 0(sp)
; ILP32E-FPELIM-NEXT: lw a3, 0(a0)
; ILP32E-FPELIM-NEXT: addi a4, a0, 8
-; ILP32E-FPELIM-NEXT: sw a4, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 0(sp)
; ILP32E-FPELIM-NEXT: lw a4, 4(a0)
; ILP32E-FPELIM-NEXT: add a0, a1, a3
; ILP32E-FPELIM-NEXT: sltu a1, a0, a1
; ILP32E-FPELIM-NEXT: add a2, a2, a4
; ILP32E-FPELIM-NEXT: add a1, a2, a1
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: addi sp, sp, 20
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va3_va_arg:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -32
-; ILP32E-WITHFP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: addi s0, sp, 16
+; ILP32E-WITHFP-NEXT: addi sp, sp, -28
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: sw a5, 12(s0)
; ILP32E-WITHFP-NEXT: sw a4, 8(s0)
; ILP32E-WITHFP-NEXT: sw a3, 4(s0)
@@ -1633,9 +1633,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
; ILP32E-WITHFP-NEXT: sltu a1, a0, a1
; ILP32E-WITHFP-NEXT: add a2, a2, a4
; ILP32E-WITHFP-NEXT: add a1, a2, a1
-; ILP32E-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 32
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 28
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va3_va_arg:
@@ -1675,23 +1675,23 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
;
; LP64E-FPELIM-LABEL: va3_va_arg:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -48
-; LP64E-FPELIM-NEXT: sd a5, 40(sp)
-; LP64E-FPELIM-NEXT: sd a4, 32(sp)
-; LP64E-FPELIM-NEXT: sd a3, 24(sp)
-; LP64E-FPELIM-NEXT: sd a2, 16(sp)
-; LP64E-FPELIM-NEXT: addi a3, sp, 24
+; LP64E-FPELIM-NEXT: addi sp, sp, -40
+; LP64E-FPELIM-NEXT: sd a5, 32(sp)
+; LP64E-FPELIM-NEXT: sd a4, 24(sp)
+; LP64E-FPELIM-NEXT: sd a3, 16(sp)
+; LP64E-FPELIM-NEXT: sd a2, 8(sp)
+; LP64E-FPELIM-NEXT: addi a3, sp, 16
; LP64E-FPELIM-NEXT: add a0, a1, a2
-; LP64E-FPELIM-NEXT: sd a3, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 48
+; LP64E-FPELIM-NEXT: sd a3, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 40
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va3_va_arg:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -64
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -56
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: sd a5, 24(s0)
; LP64E-WITHFP-NEXT: sd a4, 16(s0)
; LP64E-WITHFP-NEXT: sd a3, 8(s0)
@@ -1699,9 +1699,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
; LP64E-WITHFP-NEXT: addi a3, s0, 8
; LP64E-WITHFP-NEXT: add a0, a1, a2
; LP64E-WITHFP-NEXT: sd a3, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 64
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 56
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -2675,24 +2675,24 @@ define i32 @va6_no_fixed_args(...) nounwind {
;
; ILP32E-FPELIM-LABEL: va6_no_fixed_args:
; ILP32E-FPELIM: # %bb.0:
-; ILP32E-FPELIM-NEXT: addi sp, sp, -32
-; ILP32E-FPELIM-NEXT: sw a5, 28(sp)
-; ILP32E-FPELIM-NEXT: sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT: sw a3, 20(sp)
-; ILP32E-FPELIM-NEXT: sw a2, 16(sp)
-; ILP32E-FPELIM-NEXT: sw a1, 12(sp)
-; ILP32E-FPELIM-NEXT: sw a0, 8(sp)
-; ILP32E-FPELIM-NEXT: addi a1, sp, 12
-; ILP32E-FPELIM-NEXT: sw a1, 4(sp)
-; ILP32E-FPELIM-NEXT: addi sp, sp, 32
+; ILP32E-FPELIM-NEXT: addi sp, sp, -28
+; ILP32E-FPELIM-NEXT: sw a5, 24(sp)
+; ILP32E-FPELIM-NEXT: sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT: sw a3, 16(sp)
+; ILP32E-FPELIM-NEXT: sw a2, 12(sp)
+; ILP32E-FPELIM-NEXT: sw a1, 8(sp)
+; ILP32E-FPELIM-NEXT: sw a0, 4(sp)
+; ILP32E-FPELIM-NEXT: addi a1, sp, 8
+; ILP32E-FPELIM-NEXT: sw a1, 0(sp)
+; ILP32E-FPELIM-NEXT: addi sp, sp, 28
; ILP32E-FPELIM-NEXT: ret
;
; ILP32E-WITHFP-LABEL: va6_no_fixed_args:
; ILP32E-WITHFP: # %bb.0:
-; ILP32E-WITHFP-NEXT: addi sp, sp, -48
-; ILP32E-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill
-; ILP32E-WITHFP-NEXT: addi s0, sp, 24
+; ILP32E-WITHFP-NEXT: addi sp, sp, -36
+; ILP32E-WITHFP-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: sw s0, 4(sp) # 4-byte Folded Spill
+; ILP32E-WITHFP-NEXT: addi s0, sp, 12
; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
; ILP32E-WITHFP-NEXT: sw a4, 16(s0)
; ILP32E-WITHFP-NEXT: sw a3, 12(s0)
@@ -2701,9 +2701,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
; ILP32E-WITHFP-NEXT: sw a0, 0(s0)
; ILP32E-WITHFP-NEXT: addi a1, s0, 4
; ILP32E-WITHFP-NEXT: sw a1, -12(s0)
-; ILP32E-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
-; ILP32E-WITHFP-NEXT: addi sp, sp, 48
+; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: lw s0, 4(sp) # 4-byte Folded Reload
+; ILP32E-WITHFP-NEXT: addi sp, sp, 36
; ILP32E-WITHFP-NEXT: ret
;
; LP64-LP64F-LP64D-FPELIM-LABEL: va6_no_fixed_args:
@@ -2745,24 +2745,24 @@ define i32 @va6_no_fixed_args(...) nounwind {
;
; LP64E-FPELIM-LABEL: va6_no_fixed_args:
; LP64E-FPELIM: # %bb.0:
-; LP64E-FPELIM-NEXT: addi sp, sp, -64
-; LP64E-FPELIM-NEXT: sd a5, 56(sp)
-; LP64E-FPELIM-NEXT: sd a4, 48(sp)
-; LP64E-FPELIM-NEXT: sd a3, 40(sp)
-; LP64E-FPELIM-NEXT: sd a2, 32(sp)
-; LP64E-FPELIM-NEXT: sd a1, 24(sp)
-; LP64E-FPELIM-NEXT: sd a0, 16(sp)
-; LP64E-FPELIM-NEXT: addi a1, sp, 24
-; LP64E-FPELIM-NEXT: sd a1, 8(sp)
-; LP64E-FPELIM-NEXT: addi sp, sp, 64
+; LP64E-FPELIM-NEXT: addi sp, sp, -56
+; LP64E-FPELIM-NEXT: sd a5, 48(sp)
+; LP64E-FPELIM-NEXT: sd a4, 40(sp)
+; LP64E-FPELIM-NEXT: sd a3, 32(sp)
+; LP64E-FPELIM-NEXT: sd a2, 24(sp)
+; LP64E-FPELIM-NEXT: sd a1, 16(sp)
+; LP64E-FPELIM-NEXT: sd a0, 8(sp)
+; LP64E-FPELIM-NEXT: addi a1, sp, 16
+; LP64E-FPELIM-NEXT: sd a1, 0(sp)
+; LP64E-FPELIM-NEXT: addi sp, sp, 56
; LP64E-FPELIM-NEXT: ret
;
; LP64E-WITHFP-LABEL: va6_no_fixed_args:
; LP64E-WITHFP: # %bb.0:
-; LP64E-WITHFP-NEXT: addi sp, sp, -80
-; LP64E-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; LP64E-WITHFP-NEXT: addi s0, sp, 32
+; LP64E-WITHFP-NEXT: addi sp, sp, -72
+; LP64E-WITHFP-NEXT: sd ra, 16(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; LP64E-WITHFP-NEXT: addi s0, sp, 24
; LP64E-WITHFP-NEXT: sd a5, 40(s0)
; LP64E-WITHFP-NEXT: sd a4, 32(s0)
; LP64E-WITHFP-NEXT: sd a3, 24(s0)
@@ -2771,9 +2771,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
; LP64E-WITHFP-NEXT: sd a0, 0(s0)
; LP64E-WITHFP-NEXT: addi a1, s0, 8
; LP64E-WITHFP-NEXT: sd a1, -24(s0)
-; LP64E-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; LP64E-WITHFP-NEXT: addi sp, sp, 80
+; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; LP64E-WITHFP-NEXT: addi sp, sp, 72
; LP64E-WITHFP-NEXT: ret
%va = alloca ptr
call void @llvm.va_start(ptr %va)
@@ -2934,7 +2934,7 @@ define i32 @va_large_stack(ptr %fmt, ...) {
; ILP32E-WITHFP-NEXT: addi s0, sp, 2020
; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 24
; ILP32E-WITHFP-NEXT: lui a0, 24414
-; ILP32E-WITHFP-NEXT: addi a0, a0, -1740
+; ILP32E-WITHFP-NEXT: addi a0, a0, -1748
; ILP32E-WITHFP-NEXT: sub sp, sp, a0
; ILP32E-WITHFP-NEXT: mv a0, a1
; ILP32E-WITHFP-NEXT: sw a5, 20(s0)
@@ -2947,7 +2947,7 @@ define i32 @va_large_stack(ptr %fmt, ...) {
; ILP32E-WITHFP-NEXT: sub a2, s0, a2
; ILP32E-WITHFP-NEXT: sw a1, -272(a2)
; ILP32E-WITHFP-NEXT: lui a1, 24414
-; ILP32E-WITHFP-NEXT: addi a1, a1, -1740
+; ILP32E-WITHFP-NEXT: addi a1, a1, -1748
; ILP32E-WITHFP-NEXT: add sp, sp, a1
; ILP32E-WITHFP-NEXT: lw ra, 2016(sp) # 4-byte Folded Reload
; ILP32E-WITHFP-NEXT: lw s0, 2012(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/zcmp-with-float.ll b/llvm/test/CodeGen/RISCV/zcmp-with-float.ll
index 93f95e9709b6..3eb91eb9a09c 100644
--- a/llvm/test/CodeGen/RISCV/zcmp-with-float.ll
+++ b/llvm/test/CodeGen/RISCV/zcmp-with-float.ll
@@ -5,7 +5,6 @@
declare void @callee()
; Test the file could be compiled successfully.
-; .cfi_offset of fs0 is wrong here. It should be fixed by #66613.
define float @foo(float %arg) {
; RV32-LABEL: foo:
; RV32: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll
new file mode 100644
index 000000000000..3587ecb7f3c9
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/reserved-arg-regs.ll
@@ -0,0 +1,25 @@
+;; Test reserving argument registers.
+; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0
+; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-o0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-O0
+; RUN: not llc < %s -mtriple=sparc-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0
+; RUN: not llc < %s -mtriple=sparc64-linux-gnu -mattr=+reserve-i0 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-I0
+
+; CHECK-RESERVED-O0: error:
+; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved.
+; CHECK-RESERVED-I0: error:
+; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved.
+define void @call_function() {
+ call void @foo()
+ ret void
+}
+declare void @foo()
+
+; CHECK-RESERVED-O0: error:
+; CHECK-RESERVED-O0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved.
+; CHECK-RESERVED-I0: error:
+; CHECK-RESERVED-I0-SAME: SPARC doesn't support function calls if any of the argument registers is reserved.
+define void @call_function_with_arg(i8 %in) {
+ call void @bar(i8 %in)
+ ret void
+}
+declare void @bar(i8)
diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-named.ll b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll
new file mode 100644
index 000000000000..91808be156c5
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/reserved-regs-named.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l0 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L0
+
+;; Ensure explicit register references are catched as well.
+
+; CHECK-RESERVED-L0: %l0
+define void @set_reg(i32 zeroext %x) {
+entry:
+ tail call void @llvm.write_register.i32(metadata !0, i32 %x)
+ ret void
+}
+
+declare void @llvm.write_register.i32(metadata, i32)
+!0 = !{!"l0"}
diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll
new file mode 100644
index 000000000000..53ca045f1004
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll
@@ -0,0 +1,14 @@
+; RUN: not --crash llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0
+
+;; Ensure explicit register references for non-reserved registers
+;; are caught properly.
+
+; CHECK-RESERVED-L0: LLVM ERROR: Invalid register name global variable
+define void @set_reg(i32 zeroext %x) {
+entry:
+ tail call void @llvm.write_register.i32(metadata !0, i32 %x)
+ ret void
+}
+
+declare void @llvm.write_register.i32(metadata, i32)
+!0 = !{!"l0"}
diff --git a/llvm/test/CodeGen/SPARC/reserved-regs.ll b/llvm/test/CodeGen/SPARC/reserved-regs.ll
index 27ebf4708135..bf46177a9566 100644
--- a/llvm/test/CodeGen/SPARC/reserved-regs.ll
+++ b/llvm/test/CodeGen/SPARC/reserved-regs.ll
@@ -1,5 +1,14 @@
; RUN: llc -march=sparc -verify-machineinstrs < %s | FileCheck %s
+;; Test reserve-* options.
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-o1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-O1
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-l1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-L1
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-I1
+
+;; Test multiple reserve-* options together.
+; RUN: llc -mtriple=sparc64-linux-gnu -mattr=+reserve-g1 -mattr=+reserve-o1 -mattr=+reserve-l1 -mattr=+reserve-i1 -o - %s | FileCheck %s --check-prefixes=CHECK-RESERVED-G1,CHECK-RESERVED-O1,CHECK-RESERVED-L1,CHECK-RESERVED-I1
+
@g = common global [32 x i32] zeroinitializer, align 16
@h = common global [16 x i64] zeroinitializer, align 16
@@ -16,6 +25,10 @@
; CHECK-NOT: %o6
; CHECK-NOT: %i6
; CHECK-NOT: %i7
+; CHECK-RESERVED-G1-NOT: %g1
+; CHECK-RESERVED-O1-NOT: %o1
+; CHECK-RESERVED-L1-NOT: %l1
+; CHECK-RESERVED-I1-NOT: %i1
; CHECK: ret
define void @use_all_i32_regs() {
entry:
@@ -100,6 +113,10 @@ entry:
; CHECK-NOT: %o7
; CHECK-NOT: %i6
; CHECK-NOT: %i7
+; CHECK-RESERVED-G1-NOT: %g1
+; CHECK-RESERVED-O1-NOT: %o1
+; CHECK-RESERVED-L1-NOT: %l1
+; CHECK-RESERVED-I1-NOT: %i1
; CHECK: ret
define void @use_all_i64_regs() {
entry:
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
new file mode 100644
index 000000000000..0bd1b5d776a9
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpCapability Int8
+; CHECK-DAG: OpCapability FunctionPointersINTEL
+; CHECK-DAG: OpCapability Int64
+; CHECK: OpExtension "SPV_INTEL_function_pointers"
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyFunFp:.*]] = OpTypeFunction %[[TyVoid]] %[[TyInt64]]
+; CHECK-DAG: %[[ConstInt64:.*]] = OpConstant %[[TyInt64]] 42
+; CHECK-DAG: %[[TyPtrFunFp:.*]] = OpTypePointer Function %[[TyFunFp]]
+; CHECK-DAG: %[[ConstFunFp:.*]] = OpConstantFunctionPointerINTEL %[[TyPtrFunFp]] %[[DefFunFp:.*]]
+; CHECK: %[[FunPtr1:.*]] = OpBitcast %[[#]] %[[ConstFunFp]]
+; CHECK: %[[FunPtr2:.*]] = OpLoad %[[#]] %[[FunPtr1]]
+; CHECK: OpFunctionPointerCallINTEL %[[TyInt64]] %[[FunPtr2]] %[[ConstInt64]]
+; CHECK: OpReturn
+; CHECK: OpFunctionEnd
+; CHECK: %[[DefFunFp]] = OpFunction %[[TyVoid]] None %[[TyFunFp]]
+
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test() {
+entry:
+ %0 = load ptr, ptr @foo
+ %1 = call i64 %0(i64 42)
+ ret void
+}
+
+define void @foo(i64 %a) {
+entry:
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll
new file mode 100644
index 000000000000..89de098dead9
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpCapability Int8
+; CHECK-DAG: OpCapability FunctionPointersINTEL
+; CHECK-DAG: OpCapability Int64
+; CHECK: OpExtension "SPV_INTEL_function_pointers"
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-DAG: %[[TyFloat32:.*]] = OpTypeFloat 32
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyPtrInt8:.*]] = OpTypePointer Function %[[TyInt8]]
+; CHECK-DAG: %[[TyFunFp:.*]] = OpTypeFunction %[[TyFloat32]] %[[TyPtrInt8]]
+; CHECK-DAG: %[[TyFunBar:.*]] = OpTypeFunction %[[TyInt64]] %[[TyPtrInt8]] %[[TyPtrInt8]]
+; CHECK-DAG: %[[TyPtrFunFp:.*]] = OpTypePointer Function %[[TyFunFp]]
+; CHECK-DAG: %[[TyPtrFunBar:.*]] = OpTypePointer Function %[[TyFunBar]]
+; CHECK-DAG: %[[TyFunTest:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtrInt8]] %[[TyPtrInt8]] %[[TyPtrInt8]]
+; CHECK: %[[FunTest:.*]] = OpFunction %[[TyVoid]] None %[[TyFunTest]]
+; CHECK: %[[ArgFp:.*]] = OpFunctionParameter %[[TyPtrInt8]]
+; CHECK: %[[ArgData:.*]] = OpFunctionParameter %[[TyPtrInt8]]
+; CHECK: %[[ArgBar:.*]] = OpFunctionParameter %[[TyPtrInt8]]
+; CHECK: OpFunctionPointerCallINTEL %[[TyFloat32]] %[[ArgFp]] %[[ArgBar]]
+; CHECK: OpFunctionPointerCallINTEL %[[TyInt64]] %[[ArgBar]] %[[ArgFp]] %[[ArgData]]
+; CHECK: OpReturn
+; CHECK: OpFunctionEnd
+
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(ptr %fp, ptr %data, ptr %bar) {
+entry:
+ %0 = call spir_func float %fp(ptr %bar)
+ %1 = call spir_func i64 %bar(ptr %fp, ptr %data)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll
new file mode 100644
index 000000000000..0e0b2a4dd6ec
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll
@@ -0,0 +1,189 @@
+; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_subgroups/cl_intel_sub_groups.ll
+
+;Source:
+;void __kernel test(float2 x, uint c,
+; read_only image2d_t image_in,
+; write_only image2d_t image_out,
+; int2 coord,
+; __global uint* p,
+; __global ushort* sp,
+; __global uchar* cp,
+; __global ulong* lp) {
+; intel_sub_group_shuffle(x, c);
+; intel_sub_group_shuffle_down(x, x, c);
+; intel_sub_group_shuffle_up(x, x, c);
+; intel_sub_group_shuffle_xor(x, c);
+;
+; uint2 ui2 = intel_sub_group_block_read2(image_in, coord);
+; intel_sub_group_block_write2(image_out, coord, ui2);
+; ui2 = intel_sub_group_block_read2(p);
+; intel_sub_group_block_write2(p, ui2);
+;
+; ushort2 us2 = intel_sub_group_block_read_us2(image_in, coord);
+; intel_sub_group_block_write_us2(image_out, coord, us2);
+; us2 = intel_sub_group_block_read_us2(sp);
+; intel_sub_group_block_write_us2(sp, us2);
+;
+; uchar2 uc2 = intel_sub_group_block_read_uc2(image_in, coord);
+; intel_sub_group_block_write_uc2(image_out, coord, uc2);
+; uc2 = intel_sub_group_block_read_uc2(cp);
+; intel_sub_group_block_write_uc2(cp, uc2);
+;
+; ulong2 ul2 = intel_sub_group_block_read_ul2(image_in, coord);
+; intel_sub_group_block_write_ul2(image_out, coord, ul2);
+; ul2 = intel_sub_group_block_read_ul2(lp);
+; intel_sub_group_block_write_ul2(lp, ul2);
+;}
+
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_subgroups %s -o - | FileCheck %s
+
+; CHECK-ERROR: LLVM ERROR: intel_sub_group_shuffle: the builtin requires the following SPIR-V extension: SPV_INTEL_subgroups
+
+; CHECK-DAG: Capability SubgroupShuffleINTEL
+; CHECK-DAG: Capability SubgroupBufferBlockIOINTEL
+; CHECK-DAG: Capability SubgroupImageBlockIOINTEL
+; CHECK: Extension "SPV_INTEL_subgroups"
+
+; CHECK-SPIRV-LABEL: Function
+; CHECK-SPIRV-LABEL: Label
+
+; CHECK: SubgroupShuffleINTEL
+; CHECK: SubgroupShuffleDownINTEL
+; CHECK: SubgroupShuffleUpINTEL
+; CHECK: SubgroupShuffleXorINTEL
+
+; CHECK: SubgroupImageBlockReadINTEL
+; CHECK: SubgroupImageBlockWriteINTEL
+; CHECK: SubgroupBlockReadINTEL
+; CHECK: SubgroupBlockWriteINTEL
+
+; CHECK: SubgroupImageBlockReadINTEL
+; CHECK: SubgroupImageBlockWriteINTEL
+; CHECK: SubgroupBlockReadINTEL
+; CHECK: SubgroupBlockWriteINTEL
+
+; CHECK: SubgroupImageBlockReadINTEL
+; CHECK: SubgroupImageBlockWriteINTEL
+; CHECK: SubgroupBlockReadINTEL
+; CHECK: SubgroupBlockWriteINTEL
+
+; CHECK: SubgroupImageBlockReadINTEL
+; CHECK: SubgroupImageBlockWriteINTEL
+; CHECK: SubgroupBlockReadINTEL
+; CHECK: SubgroupBlockWriteINTEL
+
+; CHECK-SPIRV-LABEL: Return
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+%opencl.image2d_ro_t = type opaque
+%opencl.image2d_wo_t = type opaque
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @test(<2 x float> %x, i32 %c, ptr addrspace(1) %image_in, ptr addrspace(1) %image_out, <2 x i32> %coord, ptr addrspace(1) %p, ptr addrspace(1) %sp, ptr addrspace(1) %cp, ptr addrspace(1) %lp) local_unnamed_addr #0 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 !kernel_arg_name !6 {
+entry:
+ %call = tail call spir_func <2 x float> @_Z23intel_sub_group_shuffleDv2_fj(<2 x float> %x, i32 %c) #2
+ %call1 = tail call spir_func <2 x float> @_Z28intel_sub_group_shuffle_downDv2_fS_j(<2 x float> %x, <2 x float> %x, i32 %c) #2
+ %call2 = tail call spir_func <2 x float> @_Z26intel_sub_group_shuffle_upDv2_fS_j(<2 x float> %x, <2 x float> %x, i32 %c) #2
+ %call3 = tail call spir_func <2 x float> @_Z27intel_sub_group_shuffle_xorDv2_fj(<2 x float> %x, i32 %c) #2
+
+ %call4 = tail call spir_func <2 x i32> @_Z27intel_sub_group_block_read214ocl_image2d_roDv2_i(ptr addrspace(1) %image_in, <2 x i32> %coord) #2
+ tail call spir_func void @_Z28intel_sub_group_block_write214ocl_image2d_woDv2_iDv2_j(ptr addrspace(1) %image_out, <2 x i32> %coord, <2 x i32> %call4) #2
+ %call5 = tail call spir_func <2 x i32> @_Z27intel_sub_group_block_read2PU3AS1Kj(ptr addrspace(1) %p) #2
+ tail call spir_func void @_Z28intel_sub_group_block_write2PU3AS1jDv2_j(ptr addrspace(1) %p, <2 x i32> %call5) #2
+
+ %call6 = tail call spir_func <2 x i16> @_Z30intel_sub_group_block_read_us214ocl_image2d_roDv2_i(ptr addrspace(1) %image_in, <2 x i32> %coord) #2
+ tail call spir_func void @_Z31intel_sub_group_block_write_us214ocl_image2d_woDv2_iDv2_t(ptr addrspace(1) %image_out, <2 x i32> %coord, <2 x i16> %call6) #2
+ %call7 = tail call spir_func <2 x i16> @_Z30intel_sub_group_block_read_us2PU3AS1Kt(ptr addrspace(1) %sp) #2
+ tail call spir_func void @_Z31intel_sub_group_block_write_us2PU3AS1tDv2_t(ptr addrspace(1) %sp, <2 x i16> %call7) #2
+
+ %call8 = tail call spir_func <2 x i8> @_Z30intel_sub_group_block_read_uc214ocl_image2d_roDv2_i(ptr addrspace(1) %image_in, <2 x i32> %coord) #2
+ tail call spir_func void @_Z31intel_sub_group_block_write_uc214ocl_image2d_woDv2_iDv2_h(ptr addrspace(1) %image_out, <2 x i32> %coord, <2 x i8> %call8) #2
+ %call9 = tail call spir_func <2 x i8> @_Z30intel_sub_group_block_read_uc2PU3AS1Kh(ptr addrspace(1) %cp) #2
+ tail call spir_func void @_Z31intel_sub_group_block_write_uc2PU3AS1hDv2_h(ptr addrspace(1) %cp, <2 x i8> %call9) #2
+
+ %call10 = tail call spir_func <2 x i64> @_Z30intel_sub_group_block_read_ul214ocl_image2d_roDv2_i(ptr addrspace(1) %image_in, <2 x i32> %coord) #2
+ tail call spir_func void @_Z31intel_sub_group_block_write_ul214ocl_image2d_woDv2_iDv2_m(ptr addrspace(1) %image_out, <2 x i32> %coord, <2 x i64> %call10) #2
+ %call11 = tail call spir_func <2 x i64> @_Z30intel_sub_group_block_read_ul2PU3AS1Km(ptr addrspace(1) %lp) #2
+ tail call spir_func void @_Z31intel_sub_group_block_write_ul2PU3AS1mDv2_m(ptr addrspace(1) %lp, <2 x i64> %call11) #2
+
+ ret void
+}
+
+; Function Attrs: convergent
+declare spir_func <2 x float> @_Z23intel_sub_group_shuffleDv2_fj(<2 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x float> @_Z28intel_sub_group_shuffle_downDv2_fS_j(<2 x float>, <2 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x float> @_Z26intel_sub_group_shuffle_upDv2_fS_j(<2 x float>, <2 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x float> @_Z27intel_sub_group_shuffle_xorDv2_fj(<2 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i32> @_Z27intel_sub_group_block_read214ocl_image2d_roDv2_i(ptr addrspace(1), <2 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z28intel_sub_group_block_write214ocl_image2d_woDv2_iDv2_j(ptr addrspace(1), <2 x i32>, <2 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i32> @_Z27intel_sub_group_block_read2PU3AS1Kj(ptr addrspace(1)) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z28intel_sub_group_block_write2PU3AS1jDv2_j(ptr addrspace(1), <2 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i16> @_Z30intel_sub_group_block_read_us214ocl_image2d_roDv2_i(ptr addrspace(1), <2 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z31intel_sub_group_block_write_us214ocl_image2d_woDv2_iDv2_t(ptr addrspace(1), <2 x i32>, <2 x i16>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i16> @_Z30intel_sub_group_block_read_us2PU3AS1Kt(ptr addrspace(1)) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z31intel_sub_group_block_write_us2PU3AS1tDv2_t(ptr addrspace(1), <2 x i16>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i8> @_Z30intel_sub_group_block_read_uc214ocl_image2d_roDv2_i(ptr addrspace(1), <2 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z31intel_sub_group_block_write_uc214ocl_image2d_woDv2_iDv2_h(ptr addrspace(1), <2 x i32>, <2 x i8>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i8> @_Z30intel_sub_group_block_read_uc2PU3AS1Kh(ptr addrspace(1)) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z31intel_sub_group_block_write_uc2PU3AS1hDv2_h(ptr addrspace(1), <2 x i8>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i64> @_Z30intel_sub_group_block_read_ul214ocl_image2d_roDv2_i(ptr addrspace(1), <2 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z31intel_sub_group_block_write_ul214ocl_image2d_woDv2_iDv2_m(ptr addrspace(1), <2 x i32>, <2 x i64>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func <2 x i64> @_Z30intel_sub_group_block_read_ul2PU3AS1Km(ptr addrspace(1)) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare spir_func void @_Z31intel_sub_group_block_write_ul2PU3AS1mDv2_m(ptr addrspace(1), <2 x i64>) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+
+!opencl.ocl.version = !{!0}
+!opencl.spir.version = !{!0}
+
+!0 = !{i32 1, i32 2}
+!1 = !{i32 0, i32 0, i32 1, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"none", !"none", !"read_only", !"write_only", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"float2", !"uint", !"image2d_t", !"image2d_t", !"int2", !"uint*", !"ushort*", !"uchar*", !"ulong*"}
+!4 = !{!"float __attribute__((ext_vector_type(2)))", !"uint", !"image2d_t", !"image2d_t", !"int __attribute__((ext_vector_type(2)))", !"uint*", !"ushort*", !"uchar*", !"ulong*"}
+!5 = !{!"", !"", !"", !"", !"", !"", !"", !"", !""}
+!6 = !{!"x", !"c", !"image_in", !"image_out", !"coord", !"p", !"sp", !"cp", !"lp"}
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 3ca01cfa3a8f..570834fb6701 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -1821,44 +1821,42 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f32_v4i32_duplicate(<4 x float>
define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
; CHECK-LABEL: test_signed_v4f32_v4i50:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: mov r9, r0
; CHECK-NEXT: vmov r0, s19
; CHECK-NEXT: vldr s20, .LCPI28_0
-; CHECK-NEXT: mov r7, r1
-; CHECK-NEXT: vmov r4, s16
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: vmov r6, s16
+; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r7, #0
-; CHECK-NEXT: movtlt r7, #65534
+; CHECK-NEXT: movlt r5, #0
+; CHECK-NEXT: movtlt r5, #65534
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: vldr s22, .LCPI28_1
; CHECK-NEXT: vcmp.f32 s19, s20
-; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: vcmp.f32 s17, s22
+; CHECK-NEXT: mov r7, r1
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: vcmp.f32 s18, s22
; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r5, #0
-; CHECK-NEXT: movtlt r5, #65534
+; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: movtlt r7, #65534
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r7, #65535
-; CHECK-NEXT: movtgt r7, #1
+; CHECK-NEXT: movwgt r5, #65535
+; CHECK-NEXT: movtgt r5, #1
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: mov r10, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s22
; CHECK-NEXT: it lt
@@ -1866,109 +1864,103 @@ define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s22
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r5, #65535
-; CHECK-NEXT: movtgt r5, #1
+; CHECK-NEXT: movwgt r7, #65535
+; CHECK-NEXT: movtgt r7, #1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: str.w r0, [r8]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s22
-; CHECK-NEXT: str.w r0, [r8]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r6, #0
+; CHECK-NEXT: movlt r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r6, #-1
+; CHECK-NEXT: movgt.w r4, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s17
+; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: itt vs
-; CHECK-NEXT: movvs r6, #0
-; CHECK-NEXT: movvs r5, #0
-; CHECK-NEXT: lsls r0, r5, #22
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
-; CHECK-NEXT: orr.w r0, r0, r6, lsr #10
-; CHECK-NEXT: str.w r0, [r8, #20]
-; CHECK-NEXT: it vs
+; CHECK-NEXT: movvs r4, #0
; CHECK-NEXT: movvs r7, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt.w r9, #0
-; CHECK-NEXT: vcmp.f32 s17, s22
+; CHECK-NEXT: vcmp.f32 s18, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r9, #-1
-; CHECK-NEXT: vcmp.f32 s17, s17
+; CHECK-NEXT: vcmp.f32 s18, s18
+; CHECK-NEXT: mov r1, r7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
+; CHECK-NEXT: itt vs
; CHECK-NEXT: movvs.w r9, #0
-; CHECK-NEXT: lsr.w r0, r9, #14
-; CHECK-NEXT: orr.w r1, r0, r7, lsl #18
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: str.w r1, [r8, #8]
+; CHECK-NEXT: movvs r5, #0
+; CHECK-NEXT: bfc r1, #18, #14
+; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: bfc r5, #18, #14
+; CHECK-NEXT: mov r6, r9
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: lsll r4, r1, #22
+; CHECK-NEXT: lsrl r6, r5, #28
+; CHECK-NEXT: itt lt
+; CHECK-NEXT: movwlt r10, #0
+; CHECK-NEXT: movtlt r10, #65534
+; CHECK-NEXT: vcmp.f32 s16, s22
+; CHECK-NEXT: orrs r1, r5
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: itt gt
+; CHECK-NEXT: movwgt r10, #65535
+; CHECK-NEXT: movtgt r10, #1
+; CHECK-NEXT: str.w r1, [r8, #20]
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: lsrs r2, r5, #10
+; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: orr.w r2, r6, r4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s22
; CHECK-NEXT: itt lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: movtlt r1, #65534
+; CHECK-NEXT: vcmp.f32 s17, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: itt gt
; CHECK-NEXT: movwgt r1, #65535
; CHECK-NEXT: movtgt r1, #1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s22
-; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r4, #0
-; CHECK-NEXT: movtlt r4, #65534
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r4, #65535
-; CHECK-NEXT: movtgt r4, #1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s22
+; CHECK-NEXT: str.w r2, [r8, #16]
+; CHECK-NEXT: lsrs r2, r7, #10
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: strb.w r2, [r8, #24]
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: vcmp.f32 s17, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: ubfx r2, r7, #14, #4
-; CHECK-NEXT: vcmp.f32 s16, s16
-; CHECK-NEXT: it vs
+; CHECK-NEXT: itt vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: orr.w r2, r2, r0, lsl #4
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r2, [r8, #12]
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r4, #0
-; CHECK-NEXT: vcmp.f32 s18, s18
-; CHECK-NEXT: bfc r4, #18, #14
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: orr.w r2, r4, r9, lsl #18
-; CHECK-NEXT: str.w r2, [r8, #4]
-; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
-; CHECK-NEXT: lsrs r0, r0, #28
; CHECK-NEXT: bfc r1, #18, #14
-; CHECK-NEXT: orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT: orr.w r0, r0, r6, lsl #22
-; CHECK-NEXT: str.w r0, [r8, #16]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: lsrl r2, r1, #14
+; CHECK-NEXT: vcmp.f32 s16, s16
+; CHECK-NEXT: orr.w r1, r1, r9, lsl #4
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: strd r2, r1, [r8, #8]
+; CHECK-NEXT: it vs
+; CHECK-NEXT: movvs.w r10, #0
+; CHECK-NEXT: bfc r10, #18, #14
+; CHECK-NEXT: orr.w r0, r10, r0, lsl #18
+; CHECK-NEXT: str.w r0, [r8, #4]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI28_0:
@@ -2120,21 +2112,22 @@ define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) {
define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-LABEL: test_signed_v4f32_v4i100:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r9, r0
; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vldr s20, .LCPI30_0
-; CHECK-NEXT: vmov r7, s19
-; CHECK-NEXT: vmov r5, s16
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vldr s22, .LCPI30_1
-; CHECK-NEXT: mov r6, r3
+; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: vmov r3, s16
+; CHECK-NEXT: vldr s22, .LCPI30_0
+; CHECK-NEXT: vmov r7, s17
+; CHECK-NEXT: vldr s20, .LCPI30_1
+; CHECK-NEXT: vmov r4, s19
; CHECK-NEXT: vcmp.f32 s18, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s18, s20
@@ -2150,7 +2143,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: movvs r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: str.w r2, [r4, #33]
+; CHECK-NEXT: str.w r2, [r9, #33]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -2162,7 +2155,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #29]
+; CHECK-NEXT: str.w r1, [r9, #29]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s18, s20
@@ -2173,11 +2166,11 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str.w r0, [r4, #25]
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: str.w r0, [r9, #25]
+; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: vcmp.f32 s16, s22
-; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r11, r3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
@@ -2192,7 +2185,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: movvs r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: str r2, [r4, #8]
+; CHECK-NEXT: str.w r2, [r9, #8]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -2204,7 +2197,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str r1, [r4, #4]
+; CHECK-NEXT: str.w r1, [r9, #4]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s16, s20
@@ -2215,165 +2208,165 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str r0, [r4]
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: str.w r0, [r9]
+; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: vcmp.f32 s19, s22
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s22
+; CHECK-NEXT: vcmp.f32 s18, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: movvs r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: mvnlt r10, #7
+; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: movgt.w r10, #7
+; CHECK-NEXT: vcmp.f32 s18, s18
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r7, r1, #28
-; CHECK-NEXT: vcmp.f32 s19, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
+; CHECK-NEXT: movvs.w r10, #0
+; CHECK-NEXT: and r0, r10, #15
+; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: orr.w r0, r0, r6, lsl #4
+; CHECK-NEXT: str.w r0, [r9, #37]
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: bl __fixsfti
+; CHECK-NEXT: vcmp.f32 s17, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r7, [r4, #45]
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s19, s19
-; CHECK-NEXT: lsrs r2, r2, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s16, s22
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: lsrs r7, r0, #28
-; CHECK-NEXT: vcmp.f32 s19, s22
-; CHECK-NEXT: orr.w r7, r7, r1, lsl #4
-; CHECK-NEXT: vmov r1, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
-; CHECK-NEXT: str.w r7, [r4, #41]
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r3, #7
+; CHECK-NEXT: mvnlt r11, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
+; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #7
+; CHECK-NEXT: movgt.w r11, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s22
+; CHECK-NEXT: vcmp.f32 s19, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #4
+; CHECK-NEXT: movvs.w r11, #0
+; CHECK-NEXT: and r7, r11, #15
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb.w r2, [r4, #49]
+; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: orr.w r7, r7, r0, lsl #4
+; CHECK-NEXT: str.w r7, [r9, #12]
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r6, #7
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: movlt r5, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r6, #7
-; CHECK-NEXT: vcmp.f32 s18, s18
+; CHECK-NEXT: movgt.w r5, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s19, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r6, #0
-; CHECK-NEXT: and r2, r6, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #37]
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s17, s22
+; CHECK-NEXT: movvs r5, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s17
+; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r4, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s22
+; CHECK-NEXT: lsrl r6, r5, #28
+; CHECK-NEXT: vcmp.f32 s19, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: movvs r4, #0
+; CHECK-NEXT: orr.w r7, r5, r4, lsl #4
+; CHECK-NEXT: str.w r7, [r9, #45]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: str.w r6, [r9, #41]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: mvnlt r8, #7
+; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: movgt.w r8, #7
+; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
-; CHECK-NEXT: lsrs r7, r1, #28
+; CHECK-NEXT: movvs.w r8, #0
+; CHECK-NEXT: and r5, r8, #15
+; CHECK-NEXT: vcmp.f32 s17, s22
+; CHECK-NEXT: lsrl r4, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s17, s20
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
-; CHECK-NEXT: str r7, [r4, #20]
+; CHECK-NEXT: strb.w r4, [r9, #49]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: mvnlt r3, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: movgt r3, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s17, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: movvs r3, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
-; CHECK-NEXT: lsr.w r7, r0, #28
-; CHECK-NEXT: orr.w r1, r7, r1, lsl #4
-; CHECK-NEXT: str r1, [r4, #16]
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r3, #7
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #7
+; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r1, r2, #28
-; CHECK-NEXT: vcmp.f32 s16, s22
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: orr.w r1, r1, r3, lsl #4
+; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT: vcmp.f32 s17, s22
+; CHECK-NEXT: vmov r1, s1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb r1, [r4, #24]
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r5, #7
-; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: lsrl r0, r1, #28
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r5, #7
-; CHECK-NEXT: vcmp.f32 s16, s16
+; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: vcmp.f32 s17, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r5, #0
-; CHECK-NEXT: and r1, r5, #15
-; CHECK-NEXT: orr.w r0, r1, r0, lsl #4
-; CHECK-NEXT: str r0, [r4, #12]
+; CHECK-NEXT: movvs r2, #0
+; CHECK-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT: strd r0, r1, [r9, #16]
+; CHECK-NEXT: and r1, r3, #15
+; CHECK-NEXT: lsrl r2, r1, #28
+; CHECK-NEXT: strb.w r2, [r9, #24]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI30_0:
-; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29
-; CHECK-NEXT: .LCPI30_1:
; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29
+; CHECK-NEXT: .LCPI30_1:
+; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29
%x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f)
ret <4 x i100> %x
}
@@ -3694,151 +3687,155 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vldr d0, .LCPI40_0
-; CHECK-NEXT: vmov r6, r5, d8
-; CHECK-NEXT: mov r11, r0
-; CHECK-NEXT: vmov r9, r8, d0
-; CHECK-NEXT: str.w r8, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: vmov r5, r7, d8
+; CHECK-NEXT: mov r10, r0
+; CHECK-NEXT: vmov r9, r3, d0
+; CHECK-NEXT: str r0, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
; CHECK-NEXT: mov r2, r9
-; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: bl __aeabi_dcmpgt
; CHECK-NEXT: vldr d0, .LCPI40_1
-; CHECK-NEXT: mov r10, r0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: vmov r7, r3, d0
-; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: mov r2, r7
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: vmov r8, r3, d0
+; CHECK-NEXT: mov r2, r8
+; CHECK-NEXT: mov r11, r3
; CHECK-NEXT: bl __aeabi_dcmpge
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
; CHECK-NEXT: bl __fixdfti
; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: csel r4, r2, r4, ne
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r6
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: cmp.w r10, #0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r4, #-1
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r9
-; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: it ne
; CHECK-NEXT: movne r4, #0
-; CHECK-NEXT: str.w r11, [sp, #44] @ 4-byte Spill
-; CHECK-NEXT: str.w r4, [r11, #8]
-; CHECK-NEXT: str.w r9, [sp, #40] @ 4-byte Spill
-; CHECK-NEXT: bl __aeabi_dcmpgt
+; CHECK-NEXT: str.w r4, [r10, #8]
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r7
-; CHECK-NEXT: mov r10, r7
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: str.w r9, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: bl __aeabi_dcmpgt
+; CHECK-NEXT: mov r10, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: mov r2, r8
+; CHECK-NEXT: mov r3, r11
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mov r2, r6
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: csel r7, r1, r0, ne
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: cmp.w r8, #0
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: csel r6, r1, r0, ne
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: cmp.w r10, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r7, #-1
+; CHECK-NEXT: movne.w r6, #-1
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne r7, #0
-; CHECK-NEXT: str.w r7, [r11, #4]
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: movne r6, #0
+; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT: mov r1, r7
; CHECK-NEXT: mov r2, r9
-; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: mov r9, r0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r10
; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: str r6, [r0, #4]
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_dcmpgt
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: mov r2, r8
+; CHECK-NEXT: mov r3, r11
+; CHECK-NEXT: mov r10, r8
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mov r2, r6
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: csel r7, r1, r0, ne
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: cmp.w r9, #0
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: csel r6, r1, r0, ne
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r7, #-1
-; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: movne.w r6, #-1
+; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r7, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: vmov r9, r8, d9
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne r7, #0
-; CHECK-NEXT: ldr r0, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: str r7, [r0]
-; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT: mov r2, r7
+; CHECK-NEXT: movne r6, #0
+; CHECK-NEXT: ldr r0, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT: str r6, [r0]
+; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: mov r3, r6
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r4, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r7, r0
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: mov r2, r10
-; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: mov r3, r11
+; CHECK-NEXT: mov r4, r10
+; CHECK-NEXT: str.w r10, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: mov r5, r11
+; CHECK-NEXT: str.w r11, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bl __aeabi_dcmpge
; CHECK-NEXT: mov r11, r0
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: bl __fixdfti
+; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: cmp.w r11, #0
-; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT: csel r10, r1, r11, ne
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: it eq
+; CHECK-NEXT: mvneq r10, #7
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: mov r2, r9
; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r10, #-1
+; CHECK-NEXT: movne.w r10, #7
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r10, #0
+; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: mov r3, r6
; CHECK-NEXT: mov r2, r7
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r10, #0
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r6, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: mov r11, r0
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
-; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mov r2, r9
; CHECK-NEXT: mov r3, r8
@@ -3852,21 +3849,21 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne r4, #0
-; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT: lsr.w r0, r10, #28
-; CHECK-NEXT: orr.w r0, r0, r4, lsl #4
+; CHECK-NEXT: vmov q0[3], q0[1], r4, r10
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: mov r2, r7
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: mov r7, r5
-; CHECK-NEXT: str r0, [r1, #20]
+; CHECK-NEXT: mov r3, r6
+; CHECK-NEXT: mov r5, r6
+; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: mov r0, r9
-; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: mov r2, r6
-; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: mov r2, r7
; CHECK-NEXT: mov r3, r6
; CHECK-NEXT: bl __aeabi_dcmpge
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
@@ -3876,73 +3873,75 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
; CHECK-NEXT: csel r11, r1, r0, ne
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
-; CHECK-NEXT: cmp r5, #0
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r11, #-1
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r11, #0
-; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT: lsr.w r0, r11, #28
-; CHECK-NEXT: orr.w r0, r0, r10, lsl #4
-; CHECK-NEXT: mov r1, r8
-; CHECK-NEXT: mov r3, r7
-; CHECK-NEXT: str r0, [r5, #16]
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: mov r4, r11
+; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: mov r0, r9
-; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT: lsrl r4, r1, #28
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: mov r2, r7
; CHECK-NEXT: mov r0, r9
; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: mov r3, r6
-; CHECK-NEXT: mov r10, r6
+; CHECK-NEXT: mov r7, r6
; CHECK-NEXT: bl __aeabi_dcmpge
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: it eq
-; CHECK-NEXT: mvneq r0, #7
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne r0, #7
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: mov r0, r9
-; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: mov r2, r9
; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: csel r6, r1, r0, ne
+; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: cmp r5, #0
+; CHECK-NEXT: it ne
+; CHECK-NEXT: movne.w r6, #-1
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: lsr.w r0, r4, #28
; CHECK-NEXT: it ne
; CHECK-NEXT: movne r6, #0
-; CHECK-NEXT: orr.w r0, r0, r6, lsl #4
-; CHECK-NEXT: strb r0, [r5, #24]
-; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: and r1, r10, #15
; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: orr.w r0, r0, r6, lsl #4
+; CHECK-NEXT: lsrl r6, r1, #28
+; CHECK-NEXT: strd r4, r0, [r2, #16]
+; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: strb r6, [r2, #24]
+; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r6
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: mov r3, r10
-; CHECK-NEXT: mov r6, r4
+; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r6
+; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: bl __aeabi_dcmpge
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: it eq
; CHECK-NEXT: mvneq r0, #7
-; CHECK-NEXT: cmp.w r8, #0
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne r0, #7
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: mov r2, r7
+; CHECK-NEXT: mov r2, r5
; CHECK-NEXT: mov r3, r6
; CHECK-NEXT: bl __aeabi_dcmpun
; CHECK-NEXT: cmp r0, #0
@@ -3950,7 +3949,7 @@ define arm_aapcs_vfpcc <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
; CHECK-NEXT: movne r4, #0
; CHECK-NEXT: and r0, r4, #15
; CHECK-NEXT: orr.w r0, r0, r11, lsl #4
-; CHECK-NEXT: str r0, [r5, #12]
+; CHECK-NEXT: str.w r0, [r8, #12]
; CHECK-NEXT: add sp, #48
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
@@ -4694,107 +4693,127 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) {
define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) {
; CHECK-LABEL: test_signed_v8f16_v8i19:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .vsave {d8}
-; CHECK-NEXT: vpush {d8}
-; CHECK-NEXT: vldr s12, .LCPI46_0
-; CHECK-NEXT: vcvtt.f32.f16 s15, s3
-; CHECK-NEXT: vldr s14, .LCPI46_1
-; CHECK-NEXT: vcvtb.f32.f16 s7, s0
-; CHECK-NEXT: vmaxnm.f32 s16, s15, s12
-; CHECK-NEXT: vcvtb.f32.f16 s4, s1
-; CHECK-NEXT: vcvtt.f32.f16 s8, s1
-; CHECK-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-NEXT: vcvtt.f32.f16 s0, s0
-; CHECK-NEXT: vcvtt.f32.f16 s2, s2
-; CHECK-NEXT: vcvtb.f32.f16 s3, s3
-; CHECK-NEXT: vmaxnm.f32 s6, s4, s12
-; CHECK-NEXT: vmaxnm.f32 s10, s8, s12
-; CHECK-NEXT: vmaxnm.f32 s5, s1, s12
-; CHECK-NEXT: vmaxnm.f32 s9, s7, s12
-; CHECK-NEXT: vmaxnm.f32 s11, s0, s12
-; CHECK-NEXT: vmaxnm.f32 s13, s2, s12
-; CHECK-NEXT: vminnm.f32 s16, s16, s14
-; CHECK-NEXT: vmaxnm.f32 s12, s3, s12
-; CHECK-NEXT: vcvt.s32.f32 s16, s16
-; CHECK-NEXT: vminnm.f32 s12, s12, s14
-; CHECK-NEXT: vminnm.f32 s13, s13, s14
-; CHECK-NEXT: vcvt.s32.f32 s12, s12
-; CHECK-NEXT: vminnm.f32 s9, s9, s14
-; CHECK-NEXT: vcvt.s32.f32 s13, s13
-; CHECK-NEXT: vminnm.f32 s11, s11, s14
-; CHECK-NEXT: vcvt.s32.f32 s11, s11
-; CHECK-NEXT: vminnm.f32 s5, s5, s14
-; CHECK-NEXT: vcvt.s32.f32 s9, s9
-; CHECK-NEXT: vminnm.f32 s10, s10, s14
-; CHECK-NEXT: vcmp.f32 s15, s15
-; CHECK-NEXT: vminnm.f32 s6, s6, s14
-; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
-; CHECK-NEXT: lsrs r2, r1, #11
-; CHECK-NEXT: vcmp.f32 s3, s3
-; CHECK-NEXT: strb r2, [r0, #18]
-; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: .save {r4, r5, r7, r9, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r7, r9, r11, lr}
+; CHECK-NEXT: vldr s4, .LCPI46_0
+; CHECK-NEXT: vcvtb.f32.f16 s8, s1
+; CHECK-NEXT: vcvtt.f32.f16 s12, s1
+; CHECK-NEXT: vcvtt.f32.f16 s1, s0
+; CHECK-NEXT: vldr s6, .LCPI46_1
+; CHECK-NEXT: vmaxnm.f32 s5, s1, s4
+; CHECK-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-NEXT: vmaxnm.f32 s14, s12, s4
+; CHECK-NEXT: vminnm.f32 s5, s5, s6
+; CHECK-NEXT: vmaxnm.f32 s7, s0, s4
+; CHECK-NEXT: vminnm.f32 s7, s7, s6
+; CHECK-NEXT: vcvt.s32.f32 s5, s5
+; CHECK-NEXT: vcvt.s32.f32 s7, s7
+; CHECK-NEXT: vminnm.f32 s14, s14, s6
+; CHECK-NEXT: vcvt.s32.f32 s14, s14
+; CHECK-NEXT: vmaxnm.f32 s10, s8, s4
+; CHECK-NEXT: vminnm.f32 s10, s10, s6
+; CHECK-NEXT: vcmp.f32 s1, s1
+; CHECK-NEXT: vcvt.s32.f32 s10, s10
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s0, s0
+; CHECK-NEXT: mov.w r7, #0
+; CHECK-NEXT: vcvtb.f32.f16 s0, s2
+; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: vmov r2, s5
+; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: ubfx r2, r3, #14, #5
-; CHECK-NEXT: vcvt.s32.f32 s5, s5
-; CHECK-NEXT: orr.w r1, r2, r1, lsl #5
-; CHECK-NEXT: vcmp.f32 s2, s2
-; CHECK-NEXT: strh r1, [r0, #16]
-; CHECK-NEXT: vmov lr, s13
+; CHECK-NEXT: movvs r2, #0
+; CHECK-NEXT: vmov r1, s7
+; CHECK-NEXT: bfc r2, #19, #13
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs.w lr, #0
-; CHECK-NEXT: ubfx r1, lr, #1, #18
-; CHECK-NEXT: vcmp.f32 s0, s0
-; CHECK-NEXT: orr.w r1, r1, r3, lsl #18
-; CHECK-NEXT: vcvt.s32.f32 s10, s10
+; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: vcmp.f32 s12, s12
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vmov r12, s11
-; CHECK-NEXT: str r1, [r0, #12]
-; CHECK-NEXT: vmov r3, s9
+; CHECK-NEXT: vcmp.f32 s8, s8
+; CHECK-NEXT: lsll r2, r7, #19
+; CHECK-NEXT: bfc r1, #19, #13
+; CHECK-NEXT: vmov r12, s14
+; CHECK-NEXT: vmaxnm.f32 s8, s0, s4
+; CHECK-NEXT: orr.w r1, r1, r2
+; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs.w r12, #0
-; CHECK-NEXT: vcmp.f32 s7, s7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s0, s0
+; CHECK-NEXT: vcvtt.f32.f16 s0, s2
+; CHECK-NEXT: vmaxnm.f32 s2, s0, s4
+; CHECK-NEXT: vminnm.f32 s8, s8, s6
+; CHECK-NEXT: vminnm.f32 s2, s2, s6
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: vcvt.s32.f32 s2, s2
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r3, #0
+; CHECK-NEXT: vcvt.s32.f32 s8, s8
; CHECK-NEXT: bfc r3, #19, #13
-; CHECK-NEXT: vcvt.s32.f32 s6, s6
-; CHECK-NEXT: orr.w r3, r3, r12, lsl #19
-; CHECK-NEXT: str r3, [r0]
-; CHECK-NEXT: vcmp.f32 s1, s1
-; CHECK-NEXT: vmov r3, s5
+; CHECK-NEXT: mov r2, r12
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: bfc r2, #19, #13
+; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: lsrl r2, r1, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s0, s0
+; CHECK-NEXT: lsrl r4, r9, #26
+; CHECK-NEXT: vcvtt.f32.f16 s0, s3
+; CHECK-NEXT: mov lr, r1
+; CHECK-NEXT: orr.w r1, r4, r2
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: vmaxnm.f32 s2, s0, s4
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vminnm.f32 s2, s2, s6
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: vcmp.f32 s8, s8
-; CHECK-NEXT: bfc r3, #19, #13
-; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: movvs r2, #0
+; CHECK-NEXT: vcvt.s32.f32 s2, s2
+; CHECK-NEXT: bfc r2, #19, #13
+; CHECK-NEXT: lsll r2, r5, #12
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
-; CHECK-NEXT: ubfx r2, r1, #7, #12
-; CHECK-NEXT: vcmp.f32 s4, s4
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #12
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: orr.w r2, r2, lr, lsl #31
+; CHECK-NEXT: movvs r4, #0
+; CHECK-NEXT: orrs r2, r1
+; CHECK-NEXT: bfc r4, #19, #13
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: lsll r4, r1, #31
+; CHECK-NEXT: vcmp.f32 s0, s0
+; CHECK-NEXT: orrs r2, r4
; CHECK-NEXT: str r2, [r0, #8]
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: ubfx r3, r12, #13, #6
+; CHECK-NEXT: orr.w r2, r7, r3, lsl #6
+; CHECK-NEXT: vcvtb.f32.f16 s0, s3
+; CHECK-NEXT: orr.w r3, r2, r12, lsl #25
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmaxnm.f32 s2, s0, s4
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vminnm.f32 s2, s2, s6
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r2, #0
+; CHECK-NEXT: vcvt.s32.f32 s2, s2
; CHECK-NEXT: bfc r2, #19, #13
-; CHECK-NEXT: orr.w r2, r3, r2, lsl #6
-; CHECK-NEXT: orr.w r1, r2, r1, lsl #25
-; CHECK-NEXT: str r1, [r0, #4]
-; CHECK-NEXT: vpop {d8}
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: vcmp.f32 s0, s0
+; CHECK-NEXT: lsll r2, r7, #5
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: vmov r7, s2
+; CHECK-NEXT: it vs
+; CHECK-NEXT: movvs r7, #0
+; CHECK-NEXT: mov r4, r7
+; CHECK-NEXT: bfc r4, #19, #13
+; CHECK-NEXT: lsrl r4, r11, #14
+; CHECK-NEXT: orrs r2, r4
+; CHECK-NEXT: strh r2, [r0, #16]
+; CHECK-NEXT: str r3, [r0, #4]
+; CHECK-NEXT: lsrs r2, r2, #16
+; CHECK-NEXT: strb r2, [r0, #18]
+; CHECK-NEXT: orr.w r2, r9, lr
+; CHECK-NEXT: orrs r2, r5
+; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: orr.w r1, r1, r7, lsl #18
+; CHECK-NEXT: str r1, [r0, #12]
+; CHECK-NEXT: pop.w {r4, r5, r7, r9, r11, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI46_0:
@@ -4844,42 +4863,40 @@ define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) {
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: mov r11, r0
-; CHECK-NEXT: vcvtt.f32.f16 s28, s19
-; CHECK-NEXT: vmov r0, s28
+; CHECK-NEXT: mov r9, r0
+; CHECK-NEXT: vcvtt.f32.f16 s30, s19
+; CHECK-NEXT: vmov r0, s30
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: vcvtb.f32.f16 s26, s18
-; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: vmov r0, s26
-; CHECK-NEXT: vldr s22, .LCPI48_1
-; CHECK-NEXT: vcvtb.f32.f16 s24, s16
-; CHECK-NEXT: vcvtt.f32.f16 s18, s18
-; CHECK-NEXT: vcmp.f32 s28, s22
-; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: vldr s24, .LCPI48_1
+; CHECK-NEXT: vcvtb.f32.f16 s20, s16
+; CHECK-NEXT: vcvtb.f32.f16 s28, s19
+; CHECK-NEXT: vcmp.f32 s30, s24
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vmov r6, s24
-; CHECK-NEXT: vldr s20, .LCPI48_0
-; CHECK-NEXT: vmov r5, s18
+; CHECK-NEXT: vmov r7, s20
+; CHECK-NEXT: vldr s22, .LCPI48_0
+; CHECK-NEXT: vmov r6, s28
; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r4, #0
-; CHECK-NEXT: movtlt r4, #65534
+; CHECK-NEXT: movlt r5, #0
+; CHECK-NEXT: movtlt r5, #65534
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s26, s22
-; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: vcmp.f32 s26, s24
+; CHECK-NEXT: mov r10, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: vcmp.f32 s30, s22
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: vcmp.f32 s26, s22
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r4, #65535
-; CHECK-NEXT: movtgt r4, #1
+; CHECK-NEXT: movwgt r5, #65535
+; CHECK-NEXT: movtgt r5, #1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
@@ -4887,263 +4904,244 @@ define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) {
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str.w r0, [r11, #25]
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: str.w r0, [r9, #25]
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s24, s22
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: vcmp.f32 s20, s24
+; CHECK-NEXT: mov r8, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s24, s20
+; CHECK-NEXT: vcmp.f32 s20, s22
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s24, s24
+; CHECK-NEXT: vcmp.f32 s20, s20
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s22
+; CHECK-NEXT: vcmp.f32 s30, s24
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: str.w r0, [r9]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r0, [r11]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r7, #0
-; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: movlt r4, #0
+; CHECK-NEXT: vcmp.f32 s30, s22
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r7, #-1
-; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: movgt.w r4, #-1
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r7, #0
-; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: it vs
+; CHECK-NEXT: itt vs
; CHECK-NEXT: movvs r4, #0
-; CHECK-NEXT: lsls r0, r4, #22
-; CHECK-NEXT: orr.w r7, r0, r7, lsr #10
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: movvs r5, #0
+; CHECK-NEXT: mov r7, r5
+; CHECK-NEXT: bfc r7, #18, #14
+; CHECK-NEXT: lsll r4, r7, #22
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s18, s22
-; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: vcmp.f32 s28, s24
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: itt lt
+; CHECK-NEXT: vcmp.f32 s28, s22
+; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r6, #0
-; CHECK-NEXT: movtlt r6, #65534
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
-; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r6, #65535
-; CHECK-NEXT: movtgt r6, #1
+; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: vcmp.f32 s18, s22
-; CHECK-NEXT: str.w r7, [r11, #45]
+; CHECK-NEXT: vcmp.f32 s28, s24
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r5, #0
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: itt lt
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movtlt r1, #65534
+; CHECK-NEXT: vcmp.f32 s28, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r5, #-1
+; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: itt gt
+; CHECK-NEXT: movwgt r1, #65535
+; CHECK-NEXT: movtgt r1, #1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r5, #0
-; CHECK-NEXT: lsrs r0, r5, #14
-; CHECK-NEXT: orr.w r0, r0, r6, lsl #18
-; CHECK-NEXT: vcvtt.f32.f16 s18, s17
-; CHECK-NEXT: str.w r0, [r11, #33]
-; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: bfc r1, #18, #14
+; CHECK-NEXT: vcvtt.f32.f16 s28, s18
+; CHECK-NEXT: lsrl r2, r1, #28
+; CHECK-NEXT: orr.w r0, r1, r7
+; CHECK-NEXT: str.w r0, [r9, #45]
+; CHECK-NEXT: vmov r0, s28
+; CHECK-NEXT: orrs r4, r2
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s18, s22
-; CHECK-NEXT: mov r9, r1
+; CHECK-NEXT: vcmp.f32 s28, s24
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: itt lt
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movtlt r1, #65534
+; CHECK-NEXT: vcmp.f32 s28, s22
+; CHECK-NEXT: vcvtb.f32.f16 s18, s17
+; CHECK-NEXT: lsrs r0, r5, #10
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: itt gt
+; CHECK-NEXT: movwgt r1, #65535
+; CHECK-NEXT: movtgt r1, #1
+; CHECK-NEXT: str.w r4, [r9, #41]
+; CHECK-NEXT: strb.w r0, [r9, #49]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vcmp.f32 s28, s24
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: vcmp.f32 s28, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: movgt.w r7, #-1
+; CHECK-NEXT: vcmp.f32 s28, s28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s22
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: itt vs
+; CHECK-NEXT: movvs r7, #0
+; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: bfc r1, #18, #14
+; CHECK-NEXT: mov r4, r7
+; CHECK-NEXT: lsrl r4, r1, #14
+; CHECK-NEXT: orr.w r6, r1, r6, lsl #4
+; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: vcvtt.f32.f16 s28, s17
+; CHECK-NEXT: mov r11, r0
+; CHECK-NEXT: vmov r0, s28
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: vcmp.f32 s18, s24
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: itt lt
-; CHECK-NEXT: movwlt r9, #0
-; CHECK-NEXT: movtlt r9, #65534
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: mov r1, r0
+; CHECK-NEXT: movlt r5, #0
+; CHECK-NEXT: movtlt r5, #65534
+; CHECK-NEXT: vcmp.f32 s18, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r9, #65535
-; CHECK-NEXT: movtgt r9, #1
-; CHECK-NEXT: vcmp.f32 s18, s18
-; CHECK-NEXT: vcvtt.f32.f16 s16, s16
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs.w r9, #0
-; CHECK-NEXT: lsl.w r0, r9, #22
-; CHECK-NEXT: orr.w r0, r0, r1, lsr #10
-; CHECK-NEXT: str.w r0, [r11, #20]
-; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: movwgt r5, #65535
+; CHECK-NEXT: movtgt r5, #1
+; CHECK-NEXT: str.w r6, [r9, #37]
+; CHECK-NEXT: str.w r4, [r9, #33]
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s16, s22
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r8, #0
+; CHECK-NEXT: vcmp.f32 s28, s24
+; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s16
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r8, #-1
+; CHECK-NEXT: vcmp.f32 s28, s22
+; CHECK-NEXT: itt lt
+; CHECK-NEXT: movlt r4, #0
+; CHECK-NEXT: movtlt r4, #65534
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r10, r1
-; CHECK-NEXT: vcmp.f32 s16, s22
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs.w r8, #0
+; CHECK-NEXT: vcmp.f32 s26, s24
+; CHECK-NEXT: itt gt
+; CHECK-NEXT: movwgt r4, #65535
+; CHECK-NEXT: movtgt r4, #1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt lt
; CHECK-NEXT: movwlt r10, #0
; CHECK-NEXT: movtlt r10, #65534
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: lsr.w r0, r8, #14
+; CHECK-NEXT: vcmp.f32 s26, s22
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: itt gt
; CHECK-NEXT: movwgt r10, #65535
; CHECK-NEXT: movtgt r10, #1
+; CHECK-NEXT: vcmp.f32 s26, s26
+; CHECK-NEXT: vcvtt.f32.f16 s16, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs.w r10, #0
-; CHECK-NEXT: orr.w r0, r0, r10, lsl #18
-; CHECK-NEXT: str.w r0, [r11, #8]
-; CHECK-NEXT: lsrs r0, r4, #10
-; CHECK-NEXT: vcvtb.f32.f16 s16, s19
-; CHECK-NEXT: strb.w r0, [r11, #49]
+; CHECK-NEXT: bfc r10, #18, #14
+; CHECK-NEXT: vcmp.f32 s28, s24
+; CHECK-NEXT: orr.w r0, r10, r7, lsl #18
+; CHECK-NEXT: str.w r0, [r9, #29]
; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: vcmp.f32 s16, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s28, s22
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r7, #0
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: ubfx r0, r6, #14, #4
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s28, s28
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r7, #-1
-; CHECK-NEXT: vcmp.f32 s16, s16
-; CHECK-NEXT: vcvtb.f32.f16 s18, s17
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r7, #0
-; CHECK-NEXT: orr.w r0, r0, r7, lsl #4
-; CHECK-NEXT: str.w r0, [r11, #37]
-; CHECK-NEXT: vcmp.f32 s26, s22
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: movtlt r0, #65534
-; CHECK-NEXT: vcmp.f32 s26, s20
-; CHECK-NEXT: mov r4, r1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r0, #65535
-; CHECK-NEXT: movtgt r0, #1
-; CHECK-NEXT: vcmp.f32 s26, s26
+; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: bfc r0, #18, #14
-; CHECK-NEXT: orr.w r0, r0, r5, lsl #18
-; CHECK-NEXT: str.w r0, [r11, #29]
-; CHECK-NEXT: lsr.w r0, r9, #10
-; CHECK-NEXT: strb.w r0, [r11, #24]
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcmp.f32 s18, s22
-; CHECK-NEXT: ubfx r2, r10, #14, #4
+; CHECK-NEXT: vcmp.f32 s18, s24
+; CHECK-NEXT: itt vs
+; CHECK-NEXT: movvs r6, #0
+; CHECK-NEXT: movvs r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: movlt.w r11, #0
+; CHECK-NEXT: vcmp.f32 s18, s22
+; CHECK-NEXT: mov r1, r4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: movgt.w r11, #-1
; CHECK-NEXT: vcmp.f32 s18, s18
+; CHECK-NEXT: bfc r1, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: orr.w r2, r2, r0, lsl #4
-; CHECK-NEXT: str.w r2, [r11, #12]
-; CHECK-NEXT: vcmp.f32 s24, s22
-; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: itt vs
+; CHECK-NEXT: movvs.w r11, #0
+; CHECK-NEXT: movvs r5, #0
+; CHECK-NEXT: vcmp.f32 s20, s24
+; CHECK-NEXT: bfc r5, #18, #14
+; CHECK-NEXT: mov r10, r11
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s24, s20
+; CHECK-NEXT: lsll r6, r1, #22
+; CHECK-NEXT: lsrl r10, r5, #28
; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r2, #0
-; CHECK-NEXT: movtlt r2, #65534
+; CHECK-NEXT: movwlt r8, #0
+; CHECK-NEXT: movtlt r8, #65534
+; CHECK-NEXT: vcmp.f32 s20, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s24, s24
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r2, #65535
-; CHECK-NEXT: movtgt r2, #1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s22
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
+; CHECK-NEXT: movwgt r8, #65535
+; CHECK-NEXT: movtgt r8, #1
+; CHECK-NEXT: orrs r1, r5
+; CHECK-NEXT: str.w r1, [r9, #20]
+; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: vcmp.f32 s16, s24
+; CHECK-NEXT: orr.w r2, r10, r6
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: itt lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: movtlt r1, #65534
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s22
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt gt
; CHECK-NEXT: movwgt r1, #65535
; CHECK-NEXT: movtgt r1, #1
+; CHECK-NEXT: str.w r2, [r9, #16]
+; CHECK-NEXT: lsrs r2, r4, #10
+; CHECK-NEXT: vcmp.f32 s16, s24
+; CHECK-NEXT: strb.w r2, [r9, #24]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: itt lt
-; CHECK-NEXT: movlt r4, #0
-; CHECK-NEXT: movtlt r4, #65534
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: bfc r2, #18, #14
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: vcmp.f32 s16, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r4, #65535
-; CHECK-NEXT: movtgt r4, #1
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: vcmp.f32 s16, s16
-; CHECK-NEXT: orr.w r2, r2, r8, lsl #18
-; CHECK-NEXT: str.w r2, [r11, #4]
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r4, #0
-; CHECK-NEXT: bfc r4, #18, #14
-; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: lsrs r2, r7, #28
-; CHECK-NEXT: vcmp.f32 s18, s18
-; CHECK-NEXT: orr.w r2, r2, r4, lsl #4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #22
-; CHECK-NEXT: str.w r2, [r11, #41]
-; CHECK-NEXT: it vs
+; CHECK-NEXT: itt vs
+; CHECK-NEXT: movvs r0, #0
; CHECK-NEXT: movvs r1, #0
-; CHECK-NEXT: lsrs r0, r0, #28
; CHECK-NEXT: bfc r1, #18, #14
-; CHECK-NEXT: orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: orr.w r0, r0, r1, lsl #22
-; CHECK-NEXT: str.w r0, [r11, #16]
-; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: lsrl r2, r1, #14
+; CHECK-NEXT: vcmp.f32 s20, s20
+; CHECK-NEXT: orr.w r1, r1, r11, lsl #4
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: strd r2, r1, [r9, #8]
+; CHECK-NEXT: it vs
+; CHECK-NEXT: movvs.w r8, #0
+; CHECK-NEXT: bfc r8, #18, #14
+; CHECK-NEXT: orr.w r0, r8, r0, lsl #18
+; CHECK-NEXT: str.w r0, [r9, #4]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 2
@@ -5426,477 +5424,482 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
; CHECK-LABEL: test_signed_v8f16_v8i100:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vcvtb.f32.f16 s30, s19
-; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: mov r9, r0
+; CHECK-NEXT: vcvtb.f32.f16 s21, s19
+; CHECK-NEXT: vcvtt.f32.f16 s24, s19
+; CHECK-NEXT: vmov r0, s21
+; CHECK-NEXT: vcvtb.f32.f16 s26, s16
+; CHECK-NEXT: vcvtb.f32.f16 s28, s17
+; CHECK-NEXT: vcvtb.f32.f16 s30, s18
+; CHECK-NEXT: vldr s20, .LCPI50_2
+; CHECK-NEXT: vmov r8, s24
+; CHECK-NEXT: vmov r4, s26
+; CHECK-NEXT: vcvtt.f32.f16 s22, s18
+; CHECK-NEXT: vmov r6, s28
+; CHECK-NEXT: vmov r5, s30
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcvtb.f32.f16 s28, s18
-; CHECK-NEXT: mov r5, r3
-; CHECK-NEXT: vmov r3, s28
-; CHECK-NEXT: vldr s24, .LCPI50_2
-; CHECK-NEXT: vldr s20, .LCPI50_3
-; CHECK-NEXT: vcvtt.f32.f16 s19, s19
-; CHECK-NEXT: vcmp.f32 s30, s24
-; CHECK-NEXT: vcvtb.f32.f16 s22, s16
+; CHECK-NEXT: vldr s18, .LCPI50_3
+; CHECK-NEXT: mov r7, r3
+; CHECK-NEXT: vcmp.f32 s21, s18
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vcmp.f32 s21, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s30
+; CHECK-NEXT: vcmp.f32 s21, s21
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s24
+; CHECK-NEXT: vcmp.f32 s21, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
-; CHECK-NEXT: str.w r2, [r4, #83]
+; CHECK-NEXT: vcmp.f32 s21, s20
+; CHECK-NEXT: str.w r2, [r9, #83]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s30
+; CHECK-NEXT: vcmp.f32 s21, s21
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s24
+; CHECK-NEXT: vcmp.f32 s21, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #79]
+; CHECK-NEXT: str.w r1, [r9, #79]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s30, s20
-; CHECK-NEXT: vcvtb.f32.f16 s26, s17
+; CHECK-NEXT: vcmp.f32 s21, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s30, s30
+; CHECK-NEXT: vcmp.f32 s21, s21
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str.w r0, [r4, #75]
-; CHECK-NEXT: vmov r9, s19
-; CHECK-NEXT: vmov r8, s22
-; CHECK-NEXT: mov r0, r3
-; CHECK-NEXT: vmov r6, s26
+; CHECK-NEXT: str.w r0, [r9, #75]
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s28, s24
-; CHECK-NEXT: mov r7, r3
+; CHECK-NEXT: vcmp.f32 s30, s18
+; CHECK-NEXT: mov r5, r3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s24
+; CHECK-NEXT: vcmp.f32 s30, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s20
-; CHECK-NEXT: str.w r2, [r4, #58]
+; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: str.w r2, [r9, #58]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s24
+; CHECK-NEXT: vcmp.f32 s30, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #54]
+; CHECK-NEXT: str.w r1, [r9, #54]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str.w r0, [r4, #50]
+; CHECK-NEXT: str.w r0, [r9, #50]
; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s26, s24
-; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: vcmp.f32 s28, s18
+; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s26
+; CHECK-NEXT: vcmp.f32 s28, s28
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s24
+; CHECK-NEXT: vcmp.f32 s28, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s20
-; CHECK-NEXT: str.w r2, [r4, #33]
+; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: str.w r2, [r9, #33]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s26
+; CHECK-NEXT: vcmp.f32 s28, s28
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s24
+; CHECK-NEXT: vcmp.f32 s28, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #29]
+; CHECK-NEXT: str.w r1, [r9, #29]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s26, s26
+; CHECK-NEXT: vcmp.f32 s28, s28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str.w r0, [r4, #25]
-; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: str.w r0, [r9, #25]
+; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s22, s24
-; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: vcmp.f32 s26, s18
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s22
+; CHECK-NEXT: vcmp.f32 s26, s26
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s24
+; CHECK-NEXT: vcmp.f32 s26, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s20
-; CHECK-NEXT: str r2, [r4, #8]
+; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: str.w r2, [r9, #8]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s22
+; CHECK-NEXT: vcmp.f32 s26, s26
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s24
+; CHECK-NEXT: vcmp.f32 s26, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str r1, [r4, #4]
+; CHECK-NEXT: str.w r1, [r9, #4]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s22, s22
+; CHECK-NEXT: vcmp.f32 s26, s26
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: str r0, [r4]
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: str.w r0, [r9]
+; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s19, s24
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s24
-; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: vcmp.f32 s24, s18
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: vcmp.f32 s24, s20
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
+; CHECK-NEXT: vcmp.f32 s24, s24
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s24
+; CHECK-NEXT: vcmp.f32 s21, s18
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
-; CHECK-NEXT: lsrs r6, r1, #28
+; CHECK-NEXT: movvs r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
-; CHECK-NEXT: orr.w r6, r6, r2, lsl #4
-; CHECK-NEXT: str.w r6, [r4, #95]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: mvnlt r7, #7
+; CHECK-NEXT: vcmp.f32 s21, s20
+; CHECK-NEXT: mov r11, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: movgt r7, #7
+; CHECK-NEXT: vcmp.f32 s21, s21
+; CHECK-NEXT: mov r10, r2
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: lsrs r6, r0, #28
-; CHECK-NEXT: orr.w r1, r6, r1, lsl #4
-; CHECK-NEXT: str.w r1, [r4, #91]
+; CHECK-NEXT: movvs r7, #0
+; CHECK-NEXT: and r0, r7, #15
+; CHECK-NEXT: orr.w r1, r0, r6, lsl #4
+; CHECK-NEXT: vmov r0, s22
+; CHECK-NEXT: str.w r1, [r9, #87]
+; CHECK-NEXT: bl __fixsfti
+; CHECK-NEXT: vcmp.f32 s22, s18
+; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r3, #7
-; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: movlt.w r8, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
+; CHECK-NEXT: vcmp.f32 s22, s22
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #7
-; CHECK-NEXT: lsrs r1, r2, #28
-; CHECK-NEXT: vcvtt.f32.f16 s19, s18
+; CHECK-NEXT: movgt.w r8, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s30, s18
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: orr.w r2, r1, r3, lsl #4
-; CHECK-NEXT: vmov r1, s19
-; CHECK-NEXT: strb.w r2, [r4, #99]
-; CHECK-NEXT: vcmp.f32 s30, s24
+; CHECK-NEXT: movvs.w r8, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: mvnlt r5, #7
; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r5, #7
-; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r5, #0
-; CHECK-NEXT: and r2, r5, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #87]
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: and r0, r5, #15
+; CHECK-NEXT: orr.w r0, r0, r8, lsl #4
+; CHECK-NEXT: vcvtt.f32.f16 s30, s17
+; CHECK-NEXT: str.w r0, [r9, #62]
+; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: mov r7, r3
; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s19, s24
-; CHECK-NEXT: vcvtt.f32.f16 s18, s17
+; CHECK-NEXT: vcmp.f32 s30, s18
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vcvtt.f32.f16 s16, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: vcmp.f32 s28, s18
+; CHECK-NEXT: mov r1, r0
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: mvnlt r0, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: movgt r0, #7
+; CHECK-NEXT: vcmp.f32 s28, s28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r6, r1, #28
-; CHECK-NEXT: vcmp.f32 s19, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
-; CHECK-NEXT: orr.w r6, r6, r2, lsl #4
+; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: and r0, r0, #15
+; CHECK-NEXT: orr.w r0, r0, r1, lsl #4
+; CHECK-NEXT: str.w r0, [r9, #37]
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: bl __fixsfti
+; CHECK-NEXT: vcmp.f32 s16, s18
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r6, [r4, #70]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s19, s19
-; CHECK-NEXT: lsrs r2, r2, #28
+; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: lsrs r6, r0, #28
-; CHECK-NEXT: orr.w r1, r6, r1, lsl #4
-; CHECK-NEXT: str.w r1, [r4, #66]
-; CHECK-NEXT: vmov r1, s18
-; CHECK-NEXT: vcmp.f32 s19, s24
-; CHECK-NEXT: vcvtt.f32.f16 s16, s16
+; CHECK-NEXT: vcmp.f32 s26, s18
+; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r3, #7
+; CHECK-NEXT: mvnlt r5, #7
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s19
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #7
+; CHECK-NEXT: movgt r5, #7
+; CHECK-NEXT: vcmp.f32 s26, s26
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s28, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #4
+; CHECK-NEXT: movvs r5, #0
+; CHECK-NEXT: vcmp.f32 s24, s18
+; CHECK-NEXT: and r5, r5, #15
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s24, s20
+; CHECK-NEXT: orr.w r5, r5, r0, lsl #4
+; CHECK-NEXT: str.w r5, [r9, #12]
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt.w r11, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: b.w .LBB50_3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI50_2:
-; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29
+; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: .LCPI50_3:
-; CHECK-NEXT: .long 0x70ffffff @ float 6.33825262E+29
+; CHECK-NEXT: .long 0xf1000000 @ float -6.338253E+29
; CHECK-NEXT: .p2align 1
; CHECK-NEXT: .LBB50_3:
-; CHECK-NEXT: strb.w r2, [r4, #74]
-; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r7, #7
-; CHECK-NEXT: vcmp.f32 s28, s20
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s24, s24
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r7, #7
-; CHECK-NEXT: vcmp.f32 s28, s28
+; CHECK-NEXT: movgt.w r11, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s24, s18
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r7, #0
-; CHECK-NEXT: and r2, r7, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #62]
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s18, s24
+; CHECK-NEXT: movvs.w r11, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt.w r10, #0
+; CHECK-NEXT: vcmp.f32 s24, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
+; CHECK-NEXT: lsrl r6, r11, #28
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r10, #-1
+; CHECK-NEXT: vcmp.f32 s24, s24
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: movvs.w r10, #0
+; CHECK-NEXT: orr.w r5, r11, r10, lsl #4
+; CHECK-NEXT: str.w r5, [r9, #95]
+; CHECK-NEXT: str.w r6, [r9, #91]
+; CHECK-NEXT: vcmp.f32 s24, s18
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: mvnlt r6, #7
+; CHECK-NEXT: vcmp.f32 s24, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: movgt r6, #7
+; CHECK-NEXT: vcmp.f32 s24, s24
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r7, r1, #28
-; CHECK-NEXT: vcmp.f32 s18, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
+; CHECK-NEXT: movvs r6, #0
+; CHECK-NEXT: and r5, r6, #15
+; CHECK-NEXT: vcmp.f32 s22, s18
+; CHECK-NEXT: lsrl r10, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r7, [r4, #45]
+; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: strb.w r10, [r9, #99]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: mvnlt r7, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s22, s22
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcmp.f32 s18, s18
-; CHECK-NEXT: lsrs r2, r2, #28
+; CHECK-NEXT: movgt r7, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s22, s18
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
-; CHECK-NEXT: lsrs r7, r0, #28
-; CHECK-NEXT: vcmp.f32 s18, s24
-; CHECK-NEXT: orr.w r7, r7, r1, lsl #4
-; CHECK-NEXT: vmov r1, s16
+; CHECK-NEXT: movvs r7, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: str.w r7, [r4, #41]
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r3, #7
+; CHECK-NEXT: movlt r4, #0
+; CHECK-NEXT: vcmp.f32 s22, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s18
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #7
+; CHECK-NEXT: movgt.w r4, #-1
+; CHECK-NEXT: vcmp.f32 s22, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #4
+; CHECK-NEXT: movvs r4, #0
+; CHECK-NEXT: vmov q0[3], q0[1], r4, r7
+; CHECK-NEXT: vcmp.f32 s22, s18
+; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: vmov r5, s1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb.w r2, [r4, #49]
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r10, #7
-; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: movlt r4, #0
+; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: lsrl r8, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r10, #7
-; CHECK-NEXT: vcmp.f32 s26, s26
+; CHECK-NEXT: movgt.w r4, #-1
+; CHECK-NEXT: vcmp.f32 s22, s22
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs.w r10, #0
-; CHECK-NEXT: and r2, r10, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #37]
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vcmp.f32 s16, s24
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: movvs r4, #0
+; CHECK-NEXT: orr.w r6, r5, r4, lsl #4
+; CHECK-NEXT: and r5, r7, #15
+; CHECK-NEXT: lsrl r4, r5, #28
+; CHECK-NEXT: str.w r6, [r9, #70]
+; CHECK-NEXT: str.w r8, [r9, #66]
+; CHECK-NEXT: vcmp.f32 s30, s18
+; CHECK-NEXT: strb.w r4, [r9, #74]
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: mvnlt r4, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt r4, #7
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: movvs r4, #0
+; CHECK-NEXT: vcmp.f32 s30, s18
+; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: movgt.w r7, #-1
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r2, #0
-; CHECK-NEXT: lsrs r7, r1, #28
+; CHECK-NEXT: movvs r7, #0
+; CHECK-NEXT: vmov q0[3], q0[1], r7, r4
+; CHECK-NEXT: vcmp.f32 s30, s18
+; CHECK-NEXT: ldr.w r12, [sp] @ 4-byte Reload
+; CHECK-NEXT: vmov r5, s1
+; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
-; CHECK-NEXT: str r7, [r4, #20]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: movlt r6, #0
+; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s16
+; CHECK-NEXT: lsrl r12, r5, #28
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: movgt.w r6, #-1
+; CHECK-NEXT: vcmp.f32 s30, s30
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s24
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs r0, #0
+; CHECK-NEXT: movvs r6, #0
+; CHECK-NEXT: orr.w r7, r5, r6, lsl #4
+; CHECK-NEXT: and r5, r4, #15
+; CHECK-NEXT: vcmp.f32 s16, s18
+; CHECK-NEXT: lsrl r6, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: lsr.w r7, r0, #28
-; CHECK-NEXT: orr.w r1, r7, r1, lsl #4
-; CHECK-NEXT: str r1, [r4, #16]
+; CHECK-NEXT: str.w r7, [r9, #45]
+; CHECK-NEXT: str.w r12, [r9, #41]
+; CHECK-NEXT: strb.w r6, [r9, #49]
; CHECK-NEXT: it lt
; CHECK-NEXT: mvnlt r3, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -5904,28 +5907,44 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r3, #7
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r1, r2, #28
-; CHECK-NEXT: vcmp.f32 s22, s24
+; CHECK-NEXT: vcmp.f32 s16, s18
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r3, #0
-; CHECK-NEXT: orr.w r1, r1, r3, lsl #4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb r1, [r4, #24]
; CHECK-NEXT: it lt
-; CHECK-NEXT: mvnlt r8, #7
-; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r8, #7
-; CHECK-NEXT: vcmp.f32 s22, s22
+; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: vcmp.f32 s16, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it vs
-; CHECK-NEXT: movvs.w r8, #0
-; CHECK-NEXT: and r1, r8, #15
-; CHECK-NEXT: orr.w r0, r1, r0, lsl #4
-; CHECK-NEXT: str r0, [r4, #12]
+; CHECK-NEXT: movvs r1, #0
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT: vcmp.f32 s16, s18
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: lsrl r0, r1, #28
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: vcmp.f32 s16, s16
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: it vs
+; CHECK-NEXT: movvs r2, #0
+; CHECK-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT: strd r0, r1, [r9, #16]
+; CHECK-NEXT: and r1, r3, #15
+; CHECK-NEXT: lsrl r2, r1, #28
+; CHECK-NEXT: strb.w r2, [r9, #24]
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: @ %bb.4:
%x = call <8 x i100> @llvm.fptosi.sat.v8f16.v8i100(<8 x half> %f)
ret <8 x i100> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 8ea12bd1fc0d..2b6d0da53170 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -1506,110 +1506,110 @@ define arm_aapcs_vfpcc <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vldr s20, .LCPI28_0
-; CHECK-NEXT: vmov r4, s17
-; CHECK-NEXT: vmov r6, s19
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vcmp.f32 s16, #0
-; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r9, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: vmov r4, s19
+; CHECK-NEXT: vldr s20, .LCPI28_0
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcmp.f32 s17, #0
-; CHECK-NEXT: mov r10, r1
+; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r9, r0
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r10, r0
+; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r10, #0
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: movlt r5, #0
+; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r10, #65535
-; CHECK-NEXT: movtgt r10, #3
+; CHECK-NEXT: movwgt r5, #65535
+; CHECK-NEXT: movtgt r5, #3
; CHECK-NEXT: bl __aeabi_f2ulz
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: vmov r0, s17
; CHECK-NEXT: vcmp.f32 s19, #0
-; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r6, #0
+; CHECK-NEXT: movlt r7, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r6, #65535
-; CHECK-NEXT: movtgt r6, #3
+; CHECK-NEXT: movwgt r7, #65535
+; CHECK-NEXT: movtgt r7, #3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r7, #-1
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s20
-; CHECK-NEXT: str.w r7, [r8]
+; CHECK-NEXT: str.w r6, [r8]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsl.w r0, r6, #22
-; CHECK-NEXT: vcmp.f32 s17, #0
+; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: mov r1, r7
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r4, #-1
-; CHECK-NEXT: orr.w r0, r0, r4, lsr #10
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r0, [r8, #20]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r9, #0
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: movlt.w r10, #0
+; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: bfc r1, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r9, #-1
-; CHECK-NEXT: lsr.w r0, r9, #14
-; CHECK-NEXT: orr.w r1, r0, r10, lsl #18
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: str.w r1, [r8, #8]
-; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: lsrs r2, r6, #10
+; CHECK-NEXT: movgt.w r10, #-1
+; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: bfc r5, #18, #14
+; CHECK-NEXT: mov r6, r10
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: lsll r4, r1, #22
+; CHECK-NEXT: lsrl r6, r5, #28
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt.w r9, #0
+; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: orrs r1, r5
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r1, #65535
-; CHECK-NEXT: movtgt r1, #3
+; CHECK-NEXT: movwgt r9, #65535
+; CHECK-NEXT: movtgt r9, #3
+; CHECK-NEXT: str.w r1, [r8, #20]
+; CHECK-NEXT: bl __aeabi_f2ulz
+; CHECK-NEXT: vcmp.f32 s17, #0
+; CHECK-NEXT: orr.w r2, r6, r4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r5, #0
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: bfc r9, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, #0
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r5, #65535
-; CHECK-NEXT: movtgt r5, #3
+; CHECK-NEXT: movwgt r1, #65535
+; CHECK-NEXT: movtgt r1, #3
+; CHECK-NEXT: str.w r2, [r8, #16]
+; CHECK-NEXT: lsrs r2, r7, #10
+; CHECK-NEXT: vcmp.f32 s17, #0
; CHECK-NEXT: strb.w r2, [r8, #24]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: bfc r1, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: ubfx r2, r10, #14, #4
-; CHECK-NEXT: bfc r1, #18, #14
-; CHECK-NEXT: orr.w r2, r2, r0, lsl #4
-; CHECK-NEXT: lsrs r0, r0, #28
-; CHECK-NEXT: orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT: bfc r5, #18, #14
-; CHECK-NEXT: str.w r2, [r8, #12]
-; CHECK-NEXT: orr.w r2, r5, r9, lsl #18
-; CHECK-NEXT: str.w r2, [r8, #4]
-; CHECK-NEXT: orr.w r0, r0, r4, lsl #22
-; CHECK-NEXT: str.w r0, [r8, #16]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: orr.w r0, r9, r0, lsl #18
+; CHECK-NEXT: lsrl r2, r1, #14
+; CHECK-NEXT: orr.w r1, r1, r10, lsl #4
+; CHECK-NEXT: strd r2, r1, [r8, #8]
+; CHECK-NEXT: str.w r0, [r8, #4]
; CHECK-NEXT: vpop {d8, d9, d10}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: .p2align 2
@@ -1729,32 +1729,31 @@ define arm_aapcs_vfpcc <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) {
define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
; CHECK-LABEL: test_unsigned_v4f32_v4i100:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10}
; CHECK-NEXT: vpush {d8, d9, d10}
; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vldr s20, .LCPI30_0
-; CHECK-NEXT: vmov r5, s16
-; CHECK-NEXT: vmov r7, s19
; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: mov r9, r3
+; CHECK-NEXT: vmov r3, s16
+; CHECK-NEXT: vldr s20, .LCPI30_0
; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: mov r6, r3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s18, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: str.w r2, [r4, #33]
+; CHECK-NEXT: str.w r2, [r8, #33]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -1762,18 +1761,20 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #29]
+; CHECK-NEXT: str.w r1, [r8, #29]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: str.w r0, [r4, #25]
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: str.w r0, [r8, #25]
+; CHECK-NEXT: vmov r7, s17
+; CHECK-NEXT: vmov r4, s19
+; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s16, #0
-; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r10, r3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
@@ -1784,7 +1785,7 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: str r2, [r4, #8]
+; CHECK-NEXT: str.w r2, [r8, #8]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -1792,126 +1793,126 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str r1, [r4, #4]
+; CHECK-NEXT: str.w r1, [r8, #4]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: str r0, [r4]
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: str.w r0, [r8]
+; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s19, #0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, #0
+; CHECK-NEXT: vcmp.f32 s18, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r4, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt.w r9, #0
+; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r7, r1, #28
-; CHECK-NEXT: vcmp.f32 s19, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
+; CHECK-NEXT: movgt.w r9, #15
+; CHECK-NEXT: and r0, r9, #15
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: orr.w r0, r0, r4, lsl #4
+; CHECK-NEXT: str.w r0, [r8, #37]
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r11, r3
+; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: vcmp.f32 s17, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r7, [r4, #45]
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: lsrs r7, r0, #28
-; CHECK-NEXT: vcmp.f32 s19, #0
-; CHECK-NEXT: orr.w r7, r7, r1, lsl #4
-; CHECK-NEXT: vmov r1, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
-; CHECK-NEXT: str.w r7, [r4, #41]
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #0
+; CHECK-NEXT: movlt.w r10, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r2, r2, #28
-; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: vcmp.f32 s19, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #15
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #4
+; CHECK-NEXT: movgt.w r10, #15
+; CHECK-NEXT: and r7, r10, #15
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb.w r2, [r4, #49]
+; CHECK-NEXT: vcmp.f32 s19, s20
+; CHECK-NEXT: orr.w r7, r7, r0, lsl #4
+; CHECK-NEXT: str.w r7, [r8, #12]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r6, #0
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: movlt r5, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s19, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r6, #15
-; CHECK-NEXT: and r2, r6, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #37]
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: vcmp.f32 s17, #0
+; CHECK-NEXT: movgt.w r5, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, #0
+; CHECK-NEXT: lsrl r4, r5, #28
+; CHECK-NEXT: vcmp.f32 s19, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r6, #-1
+; CHECK-NEXT: orr.w r7, r5, r6, lsl #4
+; CHECK-NEXT: str.w r7, [r8, #45]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
+; CHECK-NEXT: str.w r4, [r8, #41]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt.w r11, #0
+; CHECK-NEXT: vcmp.f32 s19, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
-; CHECK-NEXT: lsrs r7, r1, #28
+; CHECK-NEXT: movgt.w r11, #15
+; CHECK-NEXT: and r5, r11, #15
+; CHECK-NEXT: vcmp.f32 s17, #0
+; CHECK-NEXT: lsrl r6, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s17, s20
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
-; CHECK-NEXT: str r7, [r4, #20]
+; CHECK-NEXT: strb.w r6, [r8, #49]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: movlt r3, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s17, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: lsrs r7, r0, #28
-; CHECK-NEXT: orr.w r1, r7, r1, lsl #4
+; CHECK-NEXT: movgt r3, #15
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s17, s20
-; CHECK-NEXT: str r1, [r4, #16]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #0
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r1, r2, #28
-; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #15
-; CHECK-NEXT: orr.w r1, r1, r3, lsl #4
+; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT: vcmp.f32 s17, #0
+; CHECK-NEXT: vmov r1, s1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb r1, [r4, #24]
+; CHECK-NEXT: lsrl r0, r1, #28
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r5, #0
-; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: vcmp.f32 s17, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r5, #15
-; CHECK-NEXT: and r1, r5, #15
-; CHECK-NEXT: orr.w r0, r1, r0, lsl #4
-; CHECK-NEXT: str r0, [r4, #12]
+; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT: strd r0, r1, [r8, #16]
+; CHECK-NEXT: and r1, r3, #15
+; CHECK-NEXT: lsrl r2, r1, #28
+; CHECK-NEXT: strb.w r2, [r8, #24]
; CHECK-NEXT: vpop {d8, d9, d10}
; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI30_0:
@@ -2923,195 +2924,196 @@ define arm_aapcs_vfpcc <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) {
; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vldr d0, .LCPI40_0
-; CHECK-NEXT: vmov r9, r5, d8
-; CHECK-NEXT: str r0, [sp, #44] @ 4-byte Spill
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: vmov r6, r5, d8
+; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: vmov r2, r9, d0
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r7, r2
-; CHECK-NEXT: mov r6, r3
+; CHECK-NEXT: mov r3, r9
+; CHECK-NEXT: mov r10, r2
; CHECK-NEXT: bl __aeabi_dcmpgt
; CHECK-NEXT: vldr d0, .LCPI40_1
-; CHECK-NEXT: mov r11, r0
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: str r2, [sp, #40] @ 4-byte Spill
-; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: vmov r11, r3, d0
+; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT: mov r2, r11
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: str r5, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: cmp.w r8, #0
-; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill
-; CHECK-NEXT: csel r0, r2, r8, ne
-; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: cmp.w r11, #0
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: strd r1, r0, [sp, #20] @ 8-byte Folded Spill
+; CHECK-NEXT: csel r0, r2, r4, ne
+; CHECK-NEXT: str r3, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r0, #-1
-; CHECK-NEXT: ldr r4, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: str.w r0, [r8, #8]
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r7
-; CHECK-NEXT: mov r3, r6
-; CHECK-NEXT: mov r11, r7
-; CHECK-NEXT: str r0, [r4, #8]
-; CHECK-NEXT: mov r0, r9
-; CHECK-NEXT: str r5, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: str r7, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: mov r2, r10
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r3, r10
-; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: mov r2, r7
+; CHECK-NEXT: mov r2, r11
+; CHECK-NEXT: mov r7, r6
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mov r2, r11
-; CHECK-NEXT: mov r3, r6
+; CHECK-NEXT: mov r2, r10
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: csel r0, r1, r0, ne
-; CHECK-NEXT: cmp.w r8, #0
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r0, #-1
-; CHECK-NEXT: str r0, [r4, #4]
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: str.w r0, [r8, #4]
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: mov r6, r8
+; CHECK-NEXT: strd r8, r7, [sp, #28] @ 8-byte Folded Spill
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: ldr r7, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r7
-; CHECK-NEXT: mov r3, r10
-; CHECK-NEXT: mov r9, r7
-; CHECK-NEXT: str.w r10, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: mov r2, r11
+; CHECK-NEXT: mov r5, r11
+; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: bl __aeabi_dcmpge
; CHECK-NEXT: vmov r8, r11, d9
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r2, r10
; CHECK-NEXT: csel r0, r1, r0, ne
-; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r0, #-1
-; CHECK-NEXT: str r0, [r4]
-; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r3, r6
+; CHECK-NEXT: str r0, [r6]
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r1, r11
; CHECK-NEXT: bl __aeabi_dcmpgt
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: mov r2, r7
-; CHECK-NEXT: mov r3, r10
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: mov r6, r5
+; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
+; CHECK-NEXT: mov r5, r7
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: mov r10, r0
+; CHECK-NEXT: mov r7, r0
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r1, r11
; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: cmp.w r10, #0
-; CHECK-NEXT: strd r2, r0, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT: csel r7, r1, r10, ne
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r3, r6
+; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: strd r0, r2, [sp, #20] @ 8-byte Folded Spill
+; CHECK-NEXT: csel r0, r3, r7, ne
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r7, #-1
-; CHECK-NEXT: mov r4, r6
+; CHECK-NEXT: movne r0, #15
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: mov r1, r11
+; CHECK-NEXT: mov r2, r10
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: mov r2, r9
-; CHECK-NEXT: mov r3, r10
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: bl __aeabi_dcmpge
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: csel r9, r1, r0, ne
-; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: mov r2, r10
+; CHECK-NEXT: mov r3, r9
+; CHECK-NEXT: csel r0, r1, r0, ne
+; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne.w r9, #-1
-; CHECK-NEXT: ldr r6, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT: lsrs r0, r7, #28
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r7
+; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: orr.w r0, r0, r9, lsl #4
-; CHECK-NEXT: str r0, [r6, #20]
+; CHECK-NEXT: mov r6, r10
+; CHECK-NEXT: str.w r10, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str.w r9, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
+; CHECK-NEXT: mov r10, r0
; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r3, r10
-; CHECK-NEXT: mov r5, r10
+; CHECK-NEXT: mov r1, r11
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: mov r2, r7
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: csel r4, r1, r0, ne
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: cmp.w r10, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r4, #-1
-; CHECK-NEXT: lsrs r0, r4, #28
-; CHECK-NEXT: orr.w r0, r0, r7, lsl #4
-; CHECK-NEXT: str r0, [r6, #16]
-; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: mov r10, r4
; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: ldr.w r10, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: mov r2, r6
-; CHECK-NEXT: mov r3, r10
+; CHECK-NEXT: mov r1, r11
+; CHECK-NEXT: lsrl r10, r5, #28
; CHECK-NEXT: bl __aeabi_dcmpgt
; CHECK-NEXT: mov r1, r11
-; CHECK-NEXT: ldr.w r11, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: ldr.w r11, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: mov r2, r11
+; CHECK-NEXT: mov r2, r7
+; CHECK-NEXT: mov r9, r7
+; CHECK-NEXT: mov r3, r11
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mov r2, r6
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: csel r0, r1, r0, ne
-; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: it ne
-; CHECK-NEXT: movne r0, #15
-; CHECK-NEXT: lsr.w r1, r9, #28
-; CHECK-NEXT: ldr.w r9, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT: orr.w r0, r1, r0, lsl #4
-; CHECK-NEXT: strb.w r0, [r9, #24]
-; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: movne.w r0, #-1
+; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: orr.w r1, r5, r0, lsl #4
+; CHECK-NEXT: strd r10, r1, [r2, #16]
+; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: and r1, r1, #15
+; CHECK-NEXT: lsrl r0, r1, #28
+; CHECK-NEXT: strb r0, [r2, #24]
+; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: ldrd r3, r2, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r1, r7
; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: mov r2, r11
+; CHECK-NEXT: mov r10, r0
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r11
; CHECK-NEXT: bl __aeabi_dcmpge
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csel r0, r1, r0, ne
-; CHECK-NEXT: cmp.w r8, #0
+; CHECK-NEXT: cmp.w r10, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne r0, #15
; CHECK-NEXT: and r0, r0, #15
; CHECK-NEXT: orr.w r0, r0, r4, lsl #4
-; CHECK-NEXT: str.w r0, [r9, #12]
+; CHECK-NEXT: str.w r0, [r8, #12]
; CHECK-NEXT: add sp, #48
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
@@ -3639,73 +3641,90 @@ define arm_aapcs_vfpcc <8 x i16> @test_unsigned_v8f16_v8i16(<8 x half> %f) {
define arm_aapcs_vfpcc <8 x i19> @test_unsigned_v8f16_v8i19(<8 x half> %f) {
; CHECK-LABEL: test_unsigned_v8f16_v8i19:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldr s6, .LCPI46_1
-; CHECK-NEXT: vcvtb.f32.f16 s8, s0
-; CHECK-NEXT: vcvtb.f32.f16 s12, s2
-; CHECK-NEXT: vcvtb.f32.f16 s10, s1
-; CHECK-NEXT: vcvtt.f32.f16 s14, s1
-; CHECK-NEXT: vcvtb.f32.f16 s1, s3
-; CHECK-NEXT: vcvtt.f32.f16 s0, s0
-; CHECK-NEXT: vcvtt.f32.f16 s2, s2
+; CHECK-NEXT: .save {r4, r5, r6, r7, r9, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, r11, lr}
; CHECK-NEXT: vldr s4, .LCPI46_0
-; CHECK-NEXT: vcvtt.f32.f16 s3, s3
-; CHECK-NEXT: vmaxnm.f32 s8, s8, s6
-; CHECK-NEXT: vmaxnm.f32 s10, s10, s6
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s6
-; CHECK-NEXT: vmaxnm.f32 s12, s12, s6
-; CHECK-NEXT: vmaxnm.f32 s14, s14, s6
-; CHECK-NEXT: vmaxnm.f32 s2, s2, s6
-; CHECK-NEXT: vmaxnm.f32 s1, s1, s6
-; CHECK-NEXT: vmaxnm.f32 s6, s3, s6
-; CHECK-NEXT: vminnm.f32 s8, s8, s4
-; CHECK-NEXT: vminnm.f32 s10, s10, s4
-; CHECK-NEXT: vminnm.f32 s0, s0, s4
-; CHECK-NEXT: vminnm.f32 s12, s12, s4
-; CHECK-NEXT: vminnm.f32 s14, s14, s4
-; CHECK-NEXT: vminnm.f32 s2, s2, s4
-; CHECK-NEXT: vminnm.f32 s1, s1, s4
-; CHECK-NEXT: vminnm.f32 s4, s6, s4
-; CHECK-NEXT: vcvt.u32.f32 s1, s1
-; CHECK-NEXT: vcvt.u32.f32 s4, s4
-; CHECK-NEXT: vcvt.u32.f32 s2, s2
+; CHECK-NEXT: vcvtb.f32.f16 s14, s1
+; CHECK-NEXT: vldr s6, .LCPI46_1
+; CHECK-NEXT: vcvtt.f32.f16 s12, s1
+; CHECK-NEXT: vmaxnm.f32 s14, s14, s4
+; CHECK-NEXT: vmaxnm.f32 s12, s12, s4
+; CHECK-NEXT: vminnm.f32 s14, s14, s6
+; CHECK-NEXT: vminnm.f32 s12, s12, s6
; CHECK-NEXT: vcvt.u32.f32 s14, s14
+; CHECK-NEXT: vcvtb.f32.f16 s10, s0
; CHECK-NEXT: vcvt.u32.f32 s12, s12
+; CHECK-NEXT: vcvtt.f32.f16 s0, s0
+; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-NEXT: vmaxnm.f32 s10, s10, s4
+; CHECK-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-NEXT: vminnm.f32 s10, s10, s6
; CHECK-NEXT: vcvt.u32.f32 s0, s0
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vcvt.u32.f32 s10, s10
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vcvt.u32.f32 s8, s8
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmov r5, s10
-; CHECK-NEXT: lsrs r2, r1, #14
-; CHECK-NEXT: orr.w r12, r2, r3, lsl #5
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: strh.w r12, [r0, #16]
-; CHECK-NEXT: lsrs r2, r3, #1
-; CHECK-NEXT: orr.w lr, r2, r1, lsl #18
+; CHECK-NEXT: vcvtt.f32.f16 s8, s2
+; CHECK-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-NEXT: vmaxnm.f32 s8, s8, s4
; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: lsrs r1, r2, #7
-; CHECK-NEXT: orr.w r1, r1, r4, lsl #12
-; CHECK-NEXT: orr.w r1, r1, r3, lsl #31
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: lsrs r4, r3, #13
-; CHECK-NEXT: orr.w r4, r4, r5, lsl #6
-; CHECK-NEXT: orr.w r2, r4, r2, lsl #25
+; CHECK-NEXT: vmaxnm.f32 s2, s2, s4
+; CHECK-NEXT: vmov r4, s12
+; CHECK-NEXT: vminnm.f32 s2, s2, s6
+; CHECK-NEXT: vcvt.u32.f32 s2, s2
+; CHECK-NEXT: vminnm.f32 s8, s8, s6
+; CHECK-NEXT: vcvt.u32.f32 s8, s8
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vcvtt.f32.f16 s0, s3
+; CHECK-NEXT: lsll r12, r1, #19
+; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: vcvt.u32.f32 s0, s0
+; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: orr.w r1, r1, r2, lsl #6
+; CHECK-NEXT: lsrl r2, r5, #26
+; CHECK-NEXT: orr.w r1, r1, r4, lsl #25
+; CHECK-NEXT: str r1, [r0, #4]
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: lsrl r4, r11, #7
+; CHECK-NEXT: orr.w r1, r1, r12
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: orr.w r1, r2, r4
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: lsll r2, r7, #12
; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: orr.w r3, r4, r3, lsl #19
-; CHECK-NEXT: strd r3, r2, [r0]
-; CHECK-NEXT: strd r1, lr, [r0, #8]
-; CHECK-NEXT: lsr.w r1, r12, #16
-; CHECK-NEXT: strb r1, [r0, #18]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: orrs r2, r1
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: lsll r4, r1, #31
+; CHECK-NEXT: orr.w r12, r2, r4
+; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: vcvtb.f32.f16 s0, s3
+; CHECK-NEXT: lsll r4, r3, #5
+; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-NEXT: vcvt.u32.f32 s0, s0
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: lsrl r6, r9, #14
+; CHECK-NEXT: orr.w r3, r6, r4
+; CHECK-NEXT: strh r3, [r0, #16]
+; CHECK-NEXT: str.w r12, [r0, #8]
+; CHECK-NEXT: lsrs r3, r3, #16
+; CHECK-NEXT: strb r3, [r0, #18]
+; CHECK-NEXT: orr.w r3, r5, r11
+; CHECK-NEXT: orrs r3, r7
+; CHECK-NEXT: orrs r1, r3
+; CHECK-NEXT: orr.w r1, r1, r2, lsl #18
+; CHECK-NEXT: str r1, [r0, #12]
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, r11, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI46_0:
-; CHECK-NEXT: .long 0x48ffffe0 @ float 524287
-; CHECK-NEXT: .LCPI46_1:
; CHECK-NEXT: .long 0x00000000 @ float 0
+; CHECK-NEXT: .LCPI46_1:
+; CHECK-NEXT: .long 0x48ffffe0 @ float 524287
%x = call <8 x i19> @llvm.fptoui.sat.v8f16.v8i19(<8 x half> %f)
ret <8 x i19> %x
}
@@ -3749,46 +3768,46 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) {
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r10, r0
; CHECK-NEXT: vcvtb.f32.f16 s24, s18
; CHECK-NEXT: vmov r0, s24
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcvtt.f32.f16 s26, s19
+; CHECK-NEXT: vcvtt.f32.f16 s28, s19
; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: vmov r0, s26
+; CHECK-NEXT: vmov r0, s28
; CHECK-NEXT: vcvtb.f32.f16 s22, s16
-; CHECK-NEXT: vcvtt.f32.f16 s18, s18
+; CHECK-NEXT: vcvtb.f32.f16 s26, s19
; CHECK-NEXT: vcmp.f32 s24, #0
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: mov r9, r1
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: vldr s20, .LCPI48_0
-; CHECK-NEXT: vmov r8, s18
+; CHECK-NEXT: vmov r11, s26
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r7, #0
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcmp.f32 s26, #0
-; CHECK-NEXT: mov r10, r1
+; CHECK-NEXT: vcmp.f32 s28, #0
+; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r10, #0
+; CHECK-NEXT: movlt r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: vcmp.f32 s24, s20
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r10, #65535
-; CHECK-NEXT: movtgt r10, #3
+; CHECK-NEXT: movwgt r4, #65535
+; CHECK-NEXT: movtgt r4, #3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r7, #-1
-; CHECK-NEXT: str.w r7, [r4, #25]
+; CHECK-NEXT: str.w r7, [r10, #25]
; CHECK-NEXT: bl __aeabi_f2ulz
; CHECK-NEXT: vcmp.f32 s22, #0
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
@@ -3797,188 +3816,184 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) {
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: vcmp.f32 s28, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: str r0, [r4]
+; CHECK-NEXT: mov r7, r4
+; CHECK-NEXT: str.w r0, [r10]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r6, #0
-; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: mov r0, r11
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r6, #-1
-; CHECK-NEXT: lsl.w r0, r10, #22
-; CHECK-NEXT: str r6, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: orr.w r6, r0, r6, lsr #10
-; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: bfc r7, #18, #14
+; CHECK-NEXT: lsll r6, r7, #22
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r5, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r5, #-1
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: itt gt
+; CHECK-NEXT: movwgt r1, #65535
+; CHECK-NEXT: movtgt r1, #3
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: bfc r1, #18, #14
+; CHECK-NEXT: vcvtt.f32.f16 s26, s18
+; CHECK-NEXT: lsrl r2, r1, #28
+; CHECK-NEXT: orr.w r0, r1, r7
+; CHECK-NEXT: str.w r0, [r10, #45]
+; CHECK-NEXT: vmov r0, s26
+; CHECK-NEXT: orrs r6, r2
+; CHECK-NEXT: bl __aeabi_f2ulz
+; CHECK-NEXT: vcmp.f32 s26, #0
; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s26, s20
+; CHECK-NEXT: vcvtb.f32.f16 s18, s17
+; CHECK-NEXT: lsrs r0, r4, #10
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r5, #65535
-; CHECK-NEXT: movtgt r5, #3
+; CHECK-NEXT: movwgt r1, #65535
+; CHECK-NEXT: movtgt r1, #3
+; CHECK-NEXT: str.w r6, [r10, #41]
+; CHECK-NEXT: strb.w r0, [r10, #49]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: bfc r1, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r6, [r4, #45]
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r7, #-1
-; CHECK-NEXT: lsrs r0, r7, #14
-; CHECK-NEXT: orr.w r0, r0, r5, lsl #18
-; CHECK-NEXT: vcvtt.f32.f16 s18, s17
-; CHECK-NEXT: str.w r0, [r4, #33]
-; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: mov r4, r7
+; CHECK-NEXT: lsrl r4, r1, #14
+; CHECK-NEXT: orr.w r6, r1, r5, lsl #4
; CHECK-NEXT: bl __aeabi_f2ulz
+; CHECK-NEXT: vcvtt.f32.f16 s26, s17
+; CHECK-NEXT: mov r11, r0
+; CHECK-NEXT: vmov r0, s26
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: mov r9, r1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r9, #0
+; CHECK-NEXT: movlt r5, #0
; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r9, #65535
-; CHECK-NEXT: movtgt r9, #3
-; CHECK-NEXT: lsl.w r0, r9, #22
-; CHECK-NEXT: orr.w r0, r0, r1, lsr #10
-; CHECK-NEXT: vcvtt.f32.f16 s16, s16
-; CHECK-NEXT: str r0, [r4, #20]
-; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: movwgt r5, #65535
+; CHECK-NEXT: movtgt r5, #3
+; CHECK-NEXT: str.w r6, [r10, #37]
+; CHECK-NEXT: str.w r4, [r10, #33]
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcmp.f32 s16, #0
-; CHECK-NEXT: mov r11, r1
+; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r11, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: vcmp.f32 s24, #0
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r11, #65535
-; CHECK-NEXT: movtgt r11, #3
+; CHECK-NEXT: movwgt r6, #65535
+; CHECK-NEXT: movtgt r6, #3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r8, #0
+; CHECK-NEXT: movlt.w r9, #0
+; CHECK-NEXT: vcmp.f32 s24, s20
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r8, #-1
-; CHECK-NEXT: lsr.w r0, r8, #14
-; CHECK-NEXT: vcvtb.f32.f16 s16, s19
-; CHECK-NEXT: orr.w r0, r0, r11, lsl #18
-; CHECK-NEXT: str r0, [r4, #8]
-; CHECK-NEXT: lsr.w r0, r10, #10
-; CHECK-NEXT: strb.w r0, [r4, #49]
+; CHECK-NEXT: itt gt
+; CHECK-NEXT: movwgt r9, #65535
+; CHECK-NEXT: movtgt r9, #3
+; CHECK-NEXT: bfc r9, #18, #14
+; CHECK-NEXT: vcvtt.f32.f16 s16, s16
+; CHECK-NEXT: orr.w r0, r9, r7, lsl #18
+; CHECK-NEXT: str.w r0, [r10, #29]
; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: vcmp.f32 s16, #0
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r6, #0
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: ubfx r0, r5, #14, #4
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r6, #-1
-; CHECK-NEXT: orr.w r0, r0, r6, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #37]
-; CHECK-NEXT: vcmp.f32 s24, #0
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: mov r1, r6
+; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: bfc r1, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s24, s20
-; CHECK-NEXT: vcvtb.f32.f16 s18, s17
+; CHECK-NEXT: movlt r4, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r0, #65535
-; CHECK-NEXT: movtgt r0, #3
-; CHECK-NEXT: bfc r0, #18, #14
-; CHECK-NEXT: mov r10, r1
-; CHECK-NEXT: orr.w r0, r0, r7, lsl #18
-; CHECK-NEXT: str.w r0, [r4, #29]
-; CHECK-NEXT: lsr.w r0, r9, #10
-; CHECK-NEXT: strb r0, [r4, #24]
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: bl __aeabi_f2ulz
; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: ubfx r2, r11, #14, #4
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r4, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: movlt.w r11, #0
; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: bfc r5, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: orr.w r2, r2, r0, lsl #4
-; CHECK-NEXT: str r2, [r4, #12]
+; CHECK-NEXT: movgt.w r11, #-1
+; CHECK-NEXT: mov r8, r11
; CHECK-NEXT: vcmp.f32 s22, #0
-; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: lsll r4, r1, #22
+; CHECK-NEXT: lsrl r8, r5, #28
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: vcmp.f32 s22, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, #0
; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r2, #65535
-; CHECK-NEXT: movtgt r2, #3
+; CHECK-NEXT: movwgt r7, #65535
+; CHECK-NEXT: movtgt r7, #3
+; CHECK-NEXT: orrs r1, r5
+; CHECK-NEXT: str.w r1, [r10, #20]
+; CHECK-NEXT: bl __aeabi_f2ulz
+; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: orr.w r2, r8, r4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s16, s20
+; CHECK-NEXT: bfc r7, #18, #14
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: itt gt
; CHECK-NEXT: movwgt r1, #65535
; CHECK-NEXT: movtgt r1, #3
-; CHECK-NEXT: bfc r2, #18, #14
+; CHECK-NEXT: str.w r2, [r10, #16]
+; CHECK-NEXT: lsrs r2, r6, #10
+; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: strb.w r2, [r10, #24]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r10, #0
+; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: itt gt
-; CHECK-NEXT: movwgt r10, #65535
-; CHECK-NEXT: movtgt r10, #3
-; CHECK-NEXT: orr.w r2, r2, r8, lsl #18
-; CHECK-NEXT: str r2, [r4, #4]
-; CHECK-NEXT: bfc r10, #18, #14
-; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: lsrs r2, r6, #28
; CHECK-NEXT: bfc r1, #18, #14
-; CHECK-NEXT: orr.w r2, r2, r10, lsl #4
-; CHECK-NEXT: lsrs r0, r0, #28
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #22
-; CHECK-NEXT: str.w r2, [r4, #41]
-; CHECK-NEXT: orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: orr.w r0, r0, r1, lsl #22
-; CHECK-NEXT: str r0, [r4, #16]
-; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: orr.w r0, r7, r0, lsl #18
+; CHECK-NEXT: lsrl r2, r1, #14
+; CHECK-NEXT: orr.w r1, r1, r11, lsl #4
+; CHECK-NEXT: strd r2, r1, [r10, #8]
+; CHECK-NEXT: str.w r0, [r10, #4]
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 2
@@ -4192,21 +4207,61 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-LABEL: test_unsigned_v8f16_v8i100:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vcvtb.f32.f16 s28, s19
-; CHECK-NEXT: vmov r0, s28
-; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: vcvtb.f32.f16 s26, s18
-; CHECK-NEXT: mov r5, r3
-; CHECK-NEXT: vmov r3, s26
+; CHECK-NEXT: mov r9, r0
+; CHECK-NEXT: vcvtb.f32.f16 s30, s19
+; CHECK-NEXT: vcvtb.f32.f16 s28, s18
+; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: vcvtt.f32.f16 s22, s19
+; CHECK-NEXT: vcvtb.f32.f16 s24, s16
+; CHECK-NEXT: vcvtb.f32.f16 s26, s17
; CHECK-NEXT: vldr s20, .LCPI50_1
+; CHECK-NEXT: vmov r8, s22
+; CHECK-NEXT: vmov r5, s28
+; CHECK-NEXT: vcvtt.f32.f16 s18, s18
+; CHECK-NEXT: vmov r4, s24
+; CHECK-NEXT: vmov r6, s26
+; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: vcmp.f32 s30, #0
+; CHECK-NEXT: mov r7, r3
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s30, #0
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: str.w r2, [r9, #83]
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: vcmp.f32 s30, #0
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: str.w r1, [r9, #79]
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: it gt
+; CHECK-NEXT: movgt.w r0, #-1
+; CHECK-NEXT: str.w r0, [r9, #75]
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s28, #0
-; CHECK-NEXT: vcvtt.f32.f16 s30, s19
+; CHECK-NEXT: mov r5, r3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: it lt
@@ -4217,7 +4272,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s28, s20
-; CHECK-NEXT: str.w r2, [r4, #83]
+; CHECK-NEXT: str.w r2, [r9, #58]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -4225,23 +4280,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #79]
+; CHECK-NEXT: str.w r1, [r9, #54]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s28, s20
-; CHECK-NEXT: vcvtb.f32.f16 s22, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: vcvtb.f32.f16 s24, s17
-; CHECK-NEXT: str.w r0, [r4, #75]
-; CHECK-NEXT: vmov r9, s30
-; CHECK-NEXT: vmov r8, s22
-; CHECK-NEXT: vmov r6, s24
-; CHECK-NEXT: mov r0, r3
+; CHECK-NEXT: str.w r0, [r9, #50]
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s26, #0
-; CHECK-NEXT: mov r7, r3
+; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: it lt
@@ -4252,7 +4302,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s26, s20
-; CHECK-NEXT: str.w r2, [r4, #58]
+; CHECK-NEXT: str.w r2, [r9, #33]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -4260,18 +4310,18 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #54]
+; CHECK-NEXT: str.w r1, [r9, #29]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: str.w r0, [r4, #50]
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: str.w r0, [r9, #25]
+; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s24, #0
-; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: mov r4, r3
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s24, s20
; CHECK-NEXT: it lt
@@ -4282,7 +4332,7 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: movgt.w r2, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s24, s20
-; CHECK-NEXT: str.w r2, [r4, #33]
+; CHECK-NEXT: str.w r2, [r9, #8]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -4290,227 +4340,200 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r1, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r1, [r4, #29]
+; CHECK-NEXT: str.w r1, [r9, #4]
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s24, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: str.w r0, [r4, #25]
+; CHECK-NEXT: str.w r0, [r9]
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s22, #0
-; CHECK-NEXT: mov r8, r3
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s22, s20
-; CHECK-NEXT: str r2, [r4, #8]
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s22, #0
+; CHECK-NEXT: vcmp.f32 s30, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str r1, [r4, #4]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: str r0, [r4]
-; CHECK-NEXT: mov r0, r9
+; CHECK-NEXT: movgt r7, #15
+; CHECK-NEXT: and r0, r7, #15
+; CHECK-NEXT: mov r11, r1
+; CHECK-NEXT: orr.w r1, r0, r6, lsl #4
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: mov r10, r2
+; CHECK-NEXT: str.w r1, [r9, #87]
; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: vcmp.f32 s30, #0
+; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: mov r7, r3
+; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt.w r8, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, #0
+; CHECK-NEXT: vcmp.f32 s28, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r8, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt r5, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r6, r1, #28
-; CHECK-NEXT: vcmp.f32 s30, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
-; CHECK-NEXT: orr.w r6, r6, r2, lsl #4
+; CHECK-NEXT: movgt r5, #15
+; CHECK-NEXT: and r0, r5, #15
+; CHECK-NEXT: vcvtt.f32.f16 s28, s17
+; CHECK-NEXT: orr.w r0, r0, r8, lsl #4
+; CHECK-NEXT: str.w r0, [r9, #62]
+; CHECK-NEXT: vmov r0, s28
+; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: vcmp.f32 s28, #0
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r6, [r4, #95]
+; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: vcvtt.f32.f16 s16, s16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: lsrs r6, r0, #28
-; CHECK-NEXT: orr.w r1, r6, r1, lsl #4
-; CHECK-NEXT: vcmp.f32 s30, #0
-; CHECK-NEXT: str.w r1, [r4, #91]
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #0
-; CHECK-NEXT: lsrs r1, r2, #28
-; CHECK-NEXT: vcvtt.f32.f16 s30, s18
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #15
-; CHECK-NEXT: orr.w r2, r1, r3, lsl #4
-; CHECK-NEXT: vmov r1, s30
-; CHECK-NEXT: strb.w r2, [r4, #99]
-; CHECK-NEXT: vcmp.f32 s28, #0
+; CHECK-NEXT: vcmp.f32 s26, #0
+; CHECK-NEXT: mov r1, r0
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vcmp.f32 s26, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r5, #0
-; CHECK-NEXT: vcmp.f32 s28, s20
+; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r5, #15
-; CHECK-NEXT: and r2, r5, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #87]
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: movgt r0, #15
+; CHECK-NEXT: and r0, r0, #15
+; CHECK-NEXT: orr.w r0, r0, r1, lsl #4
+; CHECK-NEXT: str.w r0, [r9, #37]
+; CHECK-NEXT: vmov r0, s16
; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: vcmp.f32 s30, #0
-; CHECK-NEXT: vcvtt.f32.f16 s18, s17
+; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, #0
+; CHECK-NEXT: vcmp.f32 s24, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movlt r4, #0
+; CHECK-NEXT: vcmp.f32 s24, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r6, r1, #28
-; CHECK-NEXT: vcmp.f32 s30, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
-; CHECK-NEXT: orr.w r6, r6, r2, lsl #4
+; CHECK-NEXT: movgt r4, #15
+; CHECK-NEXT: and r5, r4, #15
+; CHECK-NEXT: vcmp.f32 s22, #0
+; CHECK-NEXT: orr.w r5, r5, r0, lsl #4
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r6, [r4, #70]
+; CHECK-NEXT: str.w r5, [r9, #12]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
-; CHECK-NEXT: vcmp.f32 s30, s20
+; CHECK-NEXT: movlt.w r11, #0
+; CHECK-NEXT: vcmp.f32 s22, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: lsrs r6, r0, #28
-; CHECK-NEXT: orr.w r1, r6, r1, lsl #4
-; CHECK-NEXT: str.w r1, [r4, #66]
-; CHECK-NEXT: vmov r1, s18
-; CHECK-NEXT: vcmp.f32 s30, #0
-; CHECK-NEXT: lsrs r2, r2, #28
+; CHECK-NEXT: movgt.w r11, #-1
+; CHECK-NEXT: vcmp.f32 s22, #0
+; CHECK-NEXT: lsrl r6, r11, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s30, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #0
+; CHECK-NEXT: movlt.w r10, #0
+; CHECK-NEXT: vcmp.f32 s22, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s26, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #15
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #4
+; CHECK-NEXT: movgt.w r10, #-1
+; CHECK-NEXT: orr.w r5, r11, r10, lsl #4
+; CHECK-NEXT: str.w r5, [r9, #95]
+; CHECK-NEXT: str.w r6, [r9, #91]
+; CHECK-NEXT: vcmp.f32 s22, #0
+; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb.w r2, [r4, #74]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r7, #0
-; CHECK-NEXT: vcmp.f32 s26, s20
-; CHECK-NEXT: vcvtt.f32.f16 s16, s16
+; CHECK-NEXT: movlt r6, #0
+; CHECK-NEXT: vcmp.f32 s22, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r7, #15
-; CHECK-NEXT: and r2, r7, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #62]
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: movgt r6, #15
+; CHECK-NEXT: and r5, r6, #15
; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: lsrl r10, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: strb.w r10, [r9, #99]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: movlt r7, #0
; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r7, r1, #28
-; CHECK-NEXT: vcmp.f32 s18, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
+; CHECK-NEXT: movgt r7, #15
+; CHECK-NEXT: vcmp.f32 s18, #0
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: str.w r7, [r4, #45]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: movlt r6, #0
; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: lsrs r7, r0, #28
+; CHECK-NEXT: movgt.w r6, #-1
+; CHECK-NEXT: vmov q0[3], q0[1], r6, r7
; CHECK-NEXT: vcmp.f32 s18, #0
-; CHECK-NEXT: orr.w r7, r7, r1, lsl #4
-; CHECK-NEXT: vmov r1, s16
+; CHECK-NEXT: vmov r5, s1
+; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s18, s20
-; CHECK-NEXT: str.w r7, [r4, #41]
+; CHECK-NEXT: lsrl r8, r5, #28
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #0
+; CHECK-NEXT: movlt r4, #0
+; CHECK-NEXT: vcmp.f32 s18, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r2, r2, #28
-; CHECK-NEXT: vcmp.f32 s24, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #15
-; CHECK-NEXT: orr.w r2, r2, r3, lsl #4
+; CHECK-NEXT: movgt.w r4, #-1
+; CHECK-NEXT: orr.w r6, r5, r4, lsl #4
+; CHECK-NEXT: and r5, r7, #15
+; CHECK-NEXT: lsrl r4, r5, #28
+; CHECK-NEXT: str.w r6, [r9, #70]
+; CHECK-NEXT: str.w r8, [r9, #66]
+; CHECK-NEXT: vcmp.f32 s28, #0
+; CHECK-NEXT: strb.w r4, [r9, #74]
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb.w r2, [r4, #49]
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r10, #0
-; CHECK-NEXT: vcmp.f32 s24, s20
+; CHECK-NEXT: movlt r7, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r10, #15
-; CHECK-NEXT: and r2, r10, #15
-; CHECK-NEXT: orr.w r0, r2, r0, lsl #4
-; CHECK-NEXT: str.w r0, [r4, #37]
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: movgt r7, #15
+; CHECK-NEXT: mov r12, r7
+; CHECK-NEXT: vcmp.f32 s28, #0
+; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: movlt r7, #0
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r1, #-1
-; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: movgt.w r7, #-1
; CHECK-NEXT: b.w .LBB50_2
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.1:
@@ -4518,47 +4541,61 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: .long 0x717fffff @ float 1.26765052E+30
; CHECK-NEXT: .p2align 1
; CHECK-NEXT: .LBB50_2:
+; CHECK-NEXT: vmov q0[3], q0[1], r7, r12
+; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload
+; CHECK-NEXT: vmov r5, s1
+; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: vcmp.f32 s28, #0
+; CHECK-NEXT: lsrl r4, r5, #28
+; CHECK-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEXT: it lt
+; CHECK-NEXT: movlt r6, #0
+; CHECK-NEXT: vcmp.f32 s28, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r2, #-1
-; CHECK-NEXT: lsrs r7, r1, #28
+; CHECK-NEXT: movgt.w r6, #-1
+; CHECK-NEXT: orr.w r7, r5, r6, lsl #4
+; CHECK-NEXT: and r5, r12, #15
+; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: lsrl r6, r5, #28
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: orr.w r7, r7, r2, lsl #4
-; CHECK-NEXT: str r7, [r4, #20]
+; CHECK-NEXT: str.w r7, [r9, #45]
+; CHECK-NEXT: str.w r4, [r9, #41]
+; CHECK-NEXT: strb.w r6, [r9, #49]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r0, #0
+; CHECK-NEXT: movlt r3, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vcmp.f32 s16, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r0, #-1
-; CHECK-NEXT: lsrs r7, r0, #28
-; CHECK-NEXT: orr.w r1, r7, r1, lsl #4
+; CHECK-NEXT: movgt r3, #15
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s16, s20
-; CHECK-NEXT: str r1, [r4, #16]
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt r3, #0
+; CHECK-NEXT: movlt r1, #0
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: lsr.w r1, r2, #28
-; CHECK-NEXT: vcmp.f32 s22, #0
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt r3, #15
-; CHECK-NEXT: orr.w r1, r1, r3, lsl #4
+; CHECK-NEXT: movgt.w r1, #-1
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
+; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: vmov r1, s1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: strb r1, [r4, #24]
+; CHECK-NEXT: lsrl r0, r1, #28
; CHECK-NEXT: it lt
-; CHECK-NEXT: movlt.w r8, #0
-; CHECK-NEXT: vcmp.f32 s22, s20
+; CHECK-NEXT: movlt r2, #0
+; CHECK-NEXT: vcmp.f32 s16, s20
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
-; CHECK-NEXT: movgt.w r8, #15
-; CHECK-NEXT: and r1, r8, #15
-; CHECK-NEXT: orr.w r0, r1, r0, lsl #4
-; CHECK-NEXT: str r0, [r4, #12]
+; CHECK-NEXT: movgt.w r2, #-1
+; CHECK-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT: strd r0, r1, [r9, #16]
+; CHECK-NEXT: and r1, r3, #15
+; CHECK-NEXT: lsrl r2, r1, #28
+; CHECK-NEXT: strb.w r2, [r9, #24]
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: @ %bb.3:
%x = call <8 x i100> @llvm.fptoui.sat.v8f16.v8i100(<8 x half> %f)
ret <8 x i100> %x
diff --git a/llvm/test/CodeGen/X86/GlobalISel/br.ll b/llvm/test/CodeGen/X86/GlobalISel/br.ll
deleted file mode 100644
index 878fe981c988..000000000000
--- a/llvm/test/CodeGen/X86/GlobalISel/br.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s
-
-define void @uncondbr() {
-; CHECK-LABEL: uncondbr:
-; CHECK: # %bb.1: # %entry
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_2: # %end
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB0_3: # %bb2
-; CHECK-NEXT: jmp .LBB0_2
-entry:
- br label %bb2
-end:
- ret void
-bb2:
- br label %end
-}
-
diff --git a/llvm/test/CodeGen/X86/GlobalISel/brcond.ll b/llvm/test/CodeGen/X86/GlobalISel/brcond.ll
deleted file mode 100644
index b38fbfdcc83c..000000000000
--- a/llvm/test/CodeGen/X86/GlobalISel/brcond.ll
+++ /dev/null
@@ -1,91 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X86
-
-define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) {
-; X64-LABEL: test_1:
-; X64: # %bb.0: # %entry
-; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: setl %al
-; X64-NEXT: testb $1, %al
-; X64-NEXT: je .LBB0_2
-; X64-NEXT: # %bb.1: # %if.then
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB0_2: # %if.else
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: retq
-;
-; X86-LABEL: test_1:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %eax
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: setl %al
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB0_2
-; X86-NEXT: # %bb.1: # %if.then
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: jmp .LBB0_3
-; X86-NEXT: .LBB0_2: # %if.else
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: .LBB0_3: # %return
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: popl %ecx
-; X86-NEXT: .cfi_def_cfa_offset 4
-; X86-NEXT: retl
-entry:
- %retval = alloca i32, align 4
- %cmp = icmp slt i32 %a, %b
- br i1 %cmp, label %if.then, label %if.else
-
-if.then:
- store i32 %tValue, ptr %retval, align 4
- br label %return
-
-if.else:
- store i32 %fValue, ptr %retval, align 4
- br label %return
-
-return:
- %0 = load i32, ptr %retval, align 4
- ret i32 %0
-}
-
-define i32 @test_2(i32 %a) {
-; X64-LABEL: test_2:
-; X64: # %bb.0: # %entry
-; X64-NEXT: testb $1, %dil
-; X64-NEXT: je .LBB1_2
-; X64-NEXT: # %bb.1: # %if.then
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB1_2: # %if.else
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: retq
-;
-; X86-LABEL: test_2:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testb $1, %al
-; X86-NEXT: je .LBB1_2
-; X86-NEXT: # %bb.1: # %if.then
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB1_2: # %if.else
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: retl
-entry:
- %cmp = trunc i32 %a to i1
- br i1 %cmp, label %if.then, label %if.else
-
-if.then:
- ret i32 0
-if.else:
- ret i32 1
-}
-
diff --git a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll
index 119821e91b38..dae27ffe6c5b 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll
@@ -44,7 +44,7 @@ define fp128 @roundeven_f128(fp128 %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: callq roundevenl
+; CHECK-NEXT: callq roundevenf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll
new file mode 100644
index 000000000000..0ce346f7c2e0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64 -- | FileCheck %s
+
+define i32 @h(i1 %arg, i32 %arg1) {
+; CHECK-LABEL: h:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movabsq $9166129423, %rcx # imm = 0x22258090F
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: jmp .LBB0_1
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_9: # %bb18
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne .LBB0_10
+; CHECK-NEXT: .LBB0_1: # %bb4
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: # %bb.7: # %bb16
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne .LBB0_9
+; CHECK-NEXT: # %bb.8: # %bb17
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: movq %rcx, %rdx
+; CHECK-NEXT: jmp .LBB0_9
+; CHECK-NEXT: .LBB0_2: # %bb9
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB0_4
+; CHECK-NEXT: # %bb.3: # %bb13
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .LBB0_4: # %bb14
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: cmpl $1, %esi
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # %bb.5: # %bb14
+; CHECK-NEXT: movl %eax, %r8d
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: jne .LBB0_6
+; CHECK-NEXT: .LBB0_10: # %bb22
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_6: # %bb22.loopexit1
+; CHECK-NEXT: movl %r8d, %eax
+; CHECK-NEXT: retq
+bb:
+ br label %bb2
+
+bb2: ; preds = %bb14, %bb
+ %i = phi i64 [ %i5, %bb14 ], [ 0, %bb ]
+ %i3 = phi i32 [ %i15, %bb14 ], [ 1, %bb ]
+ br label %bb4
+
+bb4: ; preds = %bb18, %bb2
+ %i5 = phi i64 [ %i19, %bb18 ], [ %i, %bb2 ]
+ %i6 = phi i64 [ %i20, %bb18 ], [ %i, %bb2 ]
+ %i7 = phi i32 [ 0, %bb18 ], [ %i3, %bb2 ]
+ %i8 = icmp eq i64 %i6, 0
+ br i1 %i8, label %bb16, label %bb9
+
+bb9: ; preds = %bb4
+ br i1 %arg, label %bb12, label %bb10
+
+bb10: ; preds = %bb9
+ %i11 = sdiv i64 0, 0
+ br label %bb12
+
+bb12: ; preds = %bb10, %bb9
+ br i1 %arg, label %bb13, label %bb14
+
+bb13: ; preds = %bb12
+ br label %bb14
+
+bb14: ; preds = %bb13, %bb12
+ %i15 = phi i32 [ 0, %bb13 ], [ %i7, %bb12 ]
+ switch i32 %arg1, label %bb22 [
+ i32 0, label %bb21
+ i32 1, label %bb2
+ ]
+
+bb16: ; preds = %bb4
+ br i1 %arg, label %bb18, label %bb17
+
+bb17: ; preds = %bb16
+ br label %bb18
+
+bb18: ; preds = %bb17, %bb16
+ %i19 = phi i64 [ 9166129423, %bb17 ], [ %i5, %bb16 ]
+ %i20 = phi i64 [ 9166129423, %bb17 ], [ %i6, %bb16 ]
+ br i1 %arg, label %bb22, label %bb4
+
+bb21: ; preds = %bb14
+ br label %bb22
+
+bb22: ; preds = %bb21, %bb18, %bb14
+ %i23 = phi i32 [ %arg1, %bb21 ], [ %i15, %bb14 ], [ 0, %bb18 ]
+ ret i32 %i23
+}
diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index e0a96a9f9887..97c634a2a133 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X86-POPCOUNT
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s -check-prefixes=X64-POPCOUNT
; RUN: llc < %s -mtriple=i686-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 -mattr=-popcnt | FileCheck %s -check-prefixes=X86-NO-POPCOUNT
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-popcnt | FileCheck %s -check-prefixes=X64-NO-POPCOUNT
declare i8 @llvm.ctpop.i8(i8) nounwind readnone
@@ -662,11 +663,12 @@ define i64 @ctpop_shifted_mask16(i64 %x) nounwind readnone {
; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-POPCOUNT-NEXT: movl %ecx, %eax
; X86-NO-POPCOUNT-NEXT: andl $524280, %eax # imm = 0x7FFF8
-; X86-NO-POPCOUNT-NEXT: shrl %ecx
-; X86-NO-POPCOUNT-NEXT: andl $87380, %ecx # imm = 0x15554
+; X86-NO-POPCOUNT-NEXT: shrl $4, %ecx
+; X86-NO-POPCOUNT-NEXT: andl $21845, %ecx # imm = 0x5555
+; X86-NO-POPCOUNT-NEXT: shrl $3, %eax
; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax
; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx
-; X86-NO-POPCOUNT-NEXT: andl $858993456, %ecx # imm = 0x33333330
+; X86-NO-POPCOUNT-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X86-NO-POPCOUNT-NEXT: shrl $2, %eax
; X86-NO-POPCOUNT-NEXT: andl $858993459, %eax # imm = 0x33333333
; X86-NO-POPCOUNT-NEXT: addl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll
deleted file mode 100644
index 475d8fcf7f35..000000000000
--- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll
+++ /dev/null
@@ -1,293 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s
-
-define i32 @fcmp_oeq(float %x, float %y) {
-; CHECK-LABEL: fcmp_oeq
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jne {{LBB.+_1}}
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp oeq float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ogt(float %x, float %y) {
-; CHECK-LABEL: fcmp_ogt
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jbe {{LBB.+_1}}
- %1 = fcmp ogt float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_oge(float %x, float %y) {
-; CHECK-LABEL: fcmp_oge
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jb {{LBB.+_1}}
- %1 = fcmp oge float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_olt(float %x, float %y) {
-; CHECK-LABEL: fcmp_olt
-; CHECK: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: jbe {{LBB.+_1}}
- %1 = fcmp olt float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ole(float %x, float %y) {
-; CHECK-LABEL: fcmp_ole
-; CHECK: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: jb {{LBB.+_1}}
- %1 = fcmp ole float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_one(float %x, float %y) {
-; CHECK-LABEL: fcmp_one
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: je {{LBB.+_1}}
- %1 = fcmp one float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ord(float %x, float %y) {
-; CHECK-LABEL: fcmp_ord
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp ord float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_uno(float %x, float %y) {
-; CHECK-LABEL: fcmp_uno
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jp {{LBB.+_2}}
- %1 = fcmp uno float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ueq(float %x, float %y) {
-; CHECK-LABEL: fcmp_ueq
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: je {{LBB.+_2}}
- %1 = fcmp ueq float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ugt(float %x, float %y) {
-; CHECK-LABEL: fcmp_ugt
-; CHECK: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: jae {{LBB.+_1}}
- %1 = fcmp ugt float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_uge(float %x, float %y) {
-; CHECK-LABEL: fcmp_uge
-; CHECK: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: ja {{LBB.+_1}}
- %1 = fcmp uge float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ult(float %x, float %y) {
-; CHECK-LABEL: fcmp_ult
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jae {{LBB.+_1}}
- %1 = fcmp ult float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ule(float %x, float %y) {
-; CHECK-LABEL: fcmp_ule
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: ja {{LBB.+_1}}
- %1 = fcmp ule float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_une(float %x, float %y) {
-; CHECK-LABEL: fcmp_une
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jne {{LBB.+_2}}
-; CHECK-NEXT: jnp {{LBB.+_1}}
- %1 = fcmp une float %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_eq(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_eq
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jne {{LBB.+_1}}
- %1 = icmp eq i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ne(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_ne
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: je {{LBB.+_1}}
- %1 = icmp ne i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ugt(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_ugt
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jbe {{LBB.+_1}}
- %1 = icmp ugt i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_uge(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_uge
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jb {{LBB.+_1}}
- %1 = icmp uge i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ult(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_ult
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jae {{LBB.+_1}}
- %1 = icmp ult i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ule(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_ule
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: ja {{LBB.+_1}}
- %1 = icmp ule i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_sgt(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_sgt
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jle {{LBB.+_1}}
- %1 = icmp sgt i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_sge(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_sge
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jl {{LBB.+_1}}
- %1 = icmp sge i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_slt(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_slt
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jge {{LBB.+_1}}
- %1 = icmp slt i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_sle(i32 %x, i32 %y) {
-; CHECK-LABEL: icmp_sle
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jg {{LBB.+_1}}
- %1 = icmp sle i32 %x, %y
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll
deleted file mode 100644
index 8f09b2e38356..000000000000
--- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll
+++ /dev/null
@@ -1,469 +0,0 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s
-
-define i32 @fcmp_oeq1(float %x) {
-; CHECK-LABEL: fcmp_oeq1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp oeq float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_oeq2(float %x) {
-; CHECK-LABEL: fcmp_oeq2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jne {{LBB.+_1}}
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp oeq float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ogt1(float %x) {
-; CHECK-LABEL: fcmp_ogt1
-; CHECK-NOT: ucomiss
-; CHECK: movl $1, %eax
- %1 = fcmp ogt float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ogt2(float %x) {
-; CHECK-LABEL: fcmp_ogt2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jbe {{LBB.+_1}}
- %1 = fcmp ogt float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_oge1(float %x) {
-; CHECK-LABEL: fcmp_oge1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp oge float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_oge2(float %x) {
-; CHECK-LABEL: fcmp_oge2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jb {{LBB.+_1}}
- %1 = fcmp oge float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_olt1(float %x) {
-; CHECK-LABEL: fcmp_olt1
-; CHECK-NOT: ucomiss
-; CHECK: movl $1, %eax
- %1 = fcmp olt float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_olt2(float %x) {
-; CHECK-LABEL: fcmp_olt2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: jbe {{LBB.+_1}}
- %1 = fcmp olt float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ole1(float %x) {
-; CHECK-LABEL: fcmp_ole1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp ole float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ole2(float %x) {
-; CHECK-LABEL: fcmp_ole2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: jb {{LBB.+_1}}
- %1 = fcmp ole float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_one1(float %x) {
-; CHECK-LABEL: fcmp_one1
-; CHECK-NOT: ucomiss
-; CHECK: movl $1, %eax
- %1 = fcmp one float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_one2(float %x) {
-; CHECK-LABEL: fcmp_one2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: je {{LBB.+_1}}
- %1 = fcmp one float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ord1(float %x) {
-; CHECK-LABEL: fcmp_ord1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp ord float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ord2(float %x) {
-; CHECK-LABEL: fcmp_ord2
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_1}}
- %1 = fcmp ord float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_uno1(float %x) {
-; CHECK-LABEL: fcmp_uno1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_2}}
- %1 = fcmp uno float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_uno2(float %x) {
-; CHECK-LABEL: fcmp_uno2
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jp {{LBB.+_2}}
- %1 = fcmp uno float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ueq1(float %x) {
-; CHECK-LABEL: fcmp_ueq1
-; CHECK-NOT: ucomiss
- %1 = fcmp ueq float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ueq2(float %x) {
-; CHECK-LABEL: fcmp_ueq2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: je {{LBB.+_2}}
- %1 = fcmp ueq float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ugt1(float %x) {
-; CHECK-LABEL: fcmp_ugt1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jnp {{LBB.+_1}}
- %1 = fcmp ugt float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ugt2(float %x) {
-; CHECK-LABEL: fcmp_ugt2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: jae {{LBB.+_1}}
- %1 = fcmp ugt float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_uge1(float %x) {
-; CHECK-LABEL: fcmp_uge1
-; CHECK-NOT: ucomiss
- %1 = fcmp uge float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_uge2(float %x) {
-; CHECK-LABEL: fcmp_uge2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm0, %xmm1
-; CHECK-NEXT: ja {{LBB.+_1}}
- %1 = fcmp uge float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ult1(float %x) {
-; CHECK-LABEL: fcmp_ult1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jnp {{LBB.+_1}}
- %1 = fcmp ult float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ult2(float %x) {
-; CHECK-LABEL: fcmp_ult2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jae {{LBB.+_1}}
- %1 = fcmp ult float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ule1(float %x) {
-; CHECK-LABEL: fcmp_ule1
-; CHECK-NOT: ucomiss
- %1 = fcmp ule float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_ule2(float %x) {
-; CHECK-LABEL: fcmp_ule2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: ja {{LBB.+_1}}
- %1 = fcmp ule float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_une1(float %x) {
-; CHECK-LABEL: fcmp_une1
-; CHECK: ucomiss %xmm0, %xmm0
-; CHECK-NEXT: jnp {{LBB.+_1}}
- %1 = fcmp une float %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @fcmp_une2(float %x) {
-; CHECK-LABEL: fcmp_une2
-; CHECK: xorps %xmm1, %xmm1
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jne {{LBB.+_2}}
-; CHECK-NEXT: jnp {{LBB.+_1}}
- %1 = fcmp une float %x, 0.000000e+00
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_eq(i32 %x) {
-; CHECK-LABEL: icmp_eq
-; CHECK-NOT: cmpl
-; CHECK: xorl %eax, %eax
- %1 = icmp eq i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ne(i32 %x) {
-; CHECK-LABEL: icmp_ne
-; CHECK-NOT: cmpl
-; CHECK: movl $1, %eax
- %1 = icmp ne i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ugt(i32 %x) {
-; CHECK-LABEL: icmp_ugt
-; CHECK-NOT: cmpl
-; CHECK: movl $1, %eax
- %1 = icmp ugt i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_uge(i32 %x) {
-; CHECK-LABEL: icmp_uge
-; CHECK-NOT: cmpl
-; CHECK: xorl %eax, %eax
- %1 = icmp uge i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ult(i32 %x) {
-; CHECK-LABEL: icmp_ult
-; CHECK-NOT: cmpl
-; CHECK: movl $1, %eax
- %1 = icmp ult i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_ule(i32 %x) {
-; CHECK-LABEL: icmp_ule
-; CHECK-NOT: cmpl
-; CHECK: xorl %eax, %eax
- %1 = icmp ule i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_sgt(i32 %x) {
-; CHECK-LABEL: icmp_sgt
-; CHECK-NOT: cmpl
-; CHECK: movl $1, %eax
- %1 = icmp sgt i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_sge(i32 %x) {
-; CHECK-LABEL: icmp_sge
-; CHECK-NOT: cmpl
-; CHECK: xorl %eax, %eax
- %1 = icmp sge i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_slt(i32 %x) {
-; CHECK-LABEL: icmp_slt
-; CHECK-NOT: cmpl
-; CHECK: movl $1, %eax
- %1 = icmp slt i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
-define i32 @icmp_sle(i32 %x) {
-; CHECK-LABEL: icmp_sle
-; CHECK-NOT: cmpl
-; CHECK: xorl %eax, %eax
- %1 = icmp sle i32 %x, %x
- br i1 %1, label %bb1, label %bb2
-bb2:
- ret i32 1
-bb1:
- ret i32 0
-}
-
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 4722ce62f6e5..47234c365cad 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -163,7 +163,7 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
; CHECK-LABEL: fma:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq fmal@PLT
+; CHECK-NEXT: callq fmaf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -204,7 +204,7 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
; CHECK-LABEL: frem:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: callq fmodf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -241,7 +241,7 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
; CHECK-LABEL: ceil:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq ceill@PLT
+; CHECK-NEXT: callq ceilf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -274,7 +274,7 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
; CHECK-LABEL: cos:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq cosl@PLT
+; CHECK-NEXT: callq cosf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -307,7 +307,7 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
; CHECK-LABEL: exp:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq expl@PLT
+; CHECK-NEXT: callq expf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -340,7 +340,7 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
; CHECK-LABEL: exp2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq exp2l@PLT
+; CHECK-NEXT: callq exp2f128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -373,7 +373,7 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
; CHECK-LABEL: floor:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq floorl@PLT
+; CHECK-NEXT: callq floorf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -406,7 +406,7 @@ define fp128 @log(fp128 %x) nounwind strictfp {
; CHECK-LABEL: log:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq logl@PLT
+; CHECK-NEXT: callq logf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -439,7 +439,7 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
; CHECK-LABEL: log10:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq log10l@PLT
+; CHECK-NEXT: callq log10f128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -472,7 +472,7 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
; CHECK-LABEL: log2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq log2l@PLT
+; CHECK-NEXT: callq log2f128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -505,7 +505,7 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
; CHECK-LABEL: maxnum:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq fmaxl@PLT
+; CHECK-NEXT: callq fmaxf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -542,7 +542,7 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
; CHECK-LABEL: minnum:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq fminl@PLT
+; CHECK-NEXT: callq fminf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -579,7 +579,7 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
; CHECK-LABEL: nearbyint:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq nearbyintl@PLT
+; CHECK-NEXT: callq nearbyintf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -612,7 +612,7 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
; CHECK-LABEL: pow:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq powl@PLT
+; CHECK-NEXT: callq powf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -683,7 +683,7 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
; CHECK-LABEL: rint:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq rintl@PLT
+; CHECK-NEXT: callq rintf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -716,7 +716,7 @@ define fp128 @round(fp128 %x) nounwind strictfp {
; CHECK-LABEL: round:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq roundl@PLT
+; CHECK-NEXT: callq roundf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -749,7 +749,7 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
; CHECK-LABEL: roundeven:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq roundevenl@PLT
+; CHECK-NEXT: callq roundevenf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -782,7 +782,7 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
; CHECK-LABEL: sin:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq sinl@PLT
+; CHECK-NEXT: callq sinf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -815,7 +815,7 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
; CHECK-LABEL: sqrt:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq sqrtl@PLT
+; CHECK-NEXT: callq sqrtf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -848,7 +848,7 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
; CHECK-LABEL: trunc:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq truncl@PLT
+; CHECK-NEXT: callq truncf128
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
;
@@ -881,7 +881,7 @@ define i32 @lrint(fp128 %x) nounwind strictfp {
; CHECK-LABEL: lrint:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq lrintl@PLT
+; CHECK-NEXT: callq lrintf128
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
;
@@ -904,7 +904,7 @@ define i64 @llrint(fp128 %x) nounwind strictfp {
; CHECK-LABEL: llrint:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq llrintl@PLT
+; CHECK-NEXT: callq llrintf128
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
;
@@ -927,7 +927,7 @@ define i32 @lround(fp128 %x) nounwind strictfp {
; CHECK-LABEL: lround:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq lroundl@PLT
+; CHECK-NEXT: callq lroundf128
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
;
@@ -950,7 +950,7 @@ define i64 @llround(fp128 %x) nounwind strictfp {
; CHECK-LABEL: llround:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq llroundl@PLT
+; CHECK-NEXT: callq llroundf128
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
index 4e7e6b4d0194..6946ca2d575b 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -299,7 +299,7 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
; CHECK-LABEL: Test128Rem:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: callq fmodf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -335,7 +335,7 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: callq fmodf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -370,7 +370,7 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Sqrt:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq sqrtl@PLT
+; CHECK-NEXT: callq sqrtf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -401,7 +401,7 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Sin:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq sinl@PLT
+; CHECK-NEXT: callq sinf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -432,7 +432,7 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Cos:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq cosl@PLT
+; CHECK-NEXT: callq cosf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -463,7 +463,7 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Ceil:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq ceill@PLT
+; CHECK-NEXT: callq ceilf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -494,7 +494,7 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Floor:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq floorl@PLT
+; CHECK-NEXT: callq floorf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -525,7 +525,7 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Trunc:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq truncl@PLT
+; CHECK-NEXT: callq truncf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -556,7 +556,7 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Nearbyint:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq nearbyintl@PLT
+; CHECK-NEXT: callq nearbyintf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -587,7 +587,7 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Rint:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq rintl@PLT
+; CHECK-NEXT: callq rintf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -618,7 +618,7 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
; CHECK-LABEL: Test128Round:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq roundl@PLT
+; CHECK-NEXT: callq roundf128
; CHECK-NEXT: movaps %xmm0, vf128(%rip)
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -648,7 +648,7 @@ declare fp128 @llvm.round.f128(fp128)
define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
; CHECK-LABEL: Test128FMA:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: jmp fmal@PLT # TAILCALL
+; CHECK-NEXT: jmp fmaf128@PLT # TAILCALL
;
; X86-LABEL: Test128FMA:
; X86: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
index d91d4289a599..35d16c3bac70 100644
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -82,7 +82,7 @@ define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: callq fmodf128
; CHECK-NEXT: vmovaps %xmm0, (%rbx)
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/isel-br.ll b/llvm/test/CodeGen/X86/isel-br.ll
new file mode 100644
index 000000000000..5388c89e1819
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-br.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -global-isel=0 -verify-machineinstrs | FileCheck %s --check-prefix=DAG
+; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=DAG
+; RUN: llc < %s -O0 -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
+; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -global-isel=0 | FileCheck %s --check-prefix=DAG
+; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=DAG
+; RUN: llc < %s -O0 -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=GISEL
+
+define void @uncondbr() {
+; DAG-LABEL: uncondbr:
+; DAG: # %bb.0: # %entry
+; DAG-NEXT: jmp .LBB0_2
+; DAG-NEXT: .LBB0_1: # %end
+; DAG-NEXT: ret{{[l|q]}}
+; DAG-NEXT: .LBB0_2: # %bb2
+; DAG-NEXT: jmp .LBB0_1
+;
+; GISEL-LABEL: uncondbr:
+; GISEL: # %bb.1: # %entry
+; GISEL-NEXT: jmp .LBB0_3
+; GISEL-NEXT: .LBB0_2: # %end
+; GISEL-NEXT: ret{{[l|q]}}
+; GISEL-NEXT: .LBB0_3: # %bb2
+; GISEL-NEXT: jmp .LBB0_2
+entry:
+ br label %bb2
+end:
+ ret void
+bb2:
+ br label %end
+}
diff --git a/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll b/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll
new file mode 100644
index 000000000000..5a28e094f8a3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-brcond-fcmp.ll
@@ -0,0 +1,1341 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -global-isel=0 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X64
+
+define i32 @fcmp_oeq(float %x, float %y) {
+; X64-LABEL: fcmp_oeq:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jne LBB0_1
+; X64-NEXT: jp LBB0_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB0_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_oeq:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: setnp %cl
+; GISEL-X64-NEXT: andb %al, %cl
+; GISEL-X64-NEXT: testb $1, %cl
+; GISEL-X64-NEXT: je LBB0_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB0_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp oeq float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt(float %x, float %y) {
+; X64-LABEL: fcmp_ogt:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jbe LBB1_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB1_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ogt:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB1_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB1_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ogt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge(float %x, float %y) {
+; X64-LABEL: fcmp_oge:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jb LBB2_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB2_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_oge:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB2_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB2_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp oge float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt(float %x, float %y) {
+; X64-LABEL: fcmp_olt:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: jbe LBB3_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB3_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_olt:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB3_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB3_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp olt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole(float %x, float %y) {
+; X64-LABEL: fcmp_ole:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: jb LBB4_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB4_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ole:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB4_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB4_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ole float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one(float %x, float %y) {
+; X64-LABEL: fcmp_one:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: je LBB5_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB5_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_one:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB5_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB5_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp one float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord(float %x, float %y) {
+; X64-LABEL: fcmp_ord:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jp LBB6_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB6_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ord:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setnp %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB6_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB6_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ord float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno(float %x, float %y) {
+; X64-LABEL: fcmp_uno:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jp LBB7_2
+; X64-NEXT: ## %bb.1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB7_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_uno:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setp %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: jne LBB7_2
+; GISEL-X64-NEXT: ## %bb.1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB7_2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp uno float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq(float %x, float %y) {
+; X64-LABEL: fcmp_ueq:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: je LBB8_2
+; X64-NEXT: ## %bb.1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB8_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ueq:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: jne LBB8_2
+; GISEL-X64-NEXT: ## %bb.1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB8_2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ueq float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt(float %x, float %y) {
+; X64-LABEL: fcmp_ugt:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: jae LBB9_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB9_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ugt:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB9_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB9_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ugt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge(float %x, float %y) {
+; X64-LABEL: fcmp_uge:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: ja LBB10_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB10_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_uge:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB10_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB10_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp uge float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult(float %x, float %y) {
+; X64-LABEL: fcmp_ult:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jae LBB11_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB11_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ult:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB11_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB11_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ult float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule(float %x, float %y) {
+; X64-LABEL: fcmp_ule:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: ja LBB12_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB12_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ule:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB12_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB12_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ule float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une(float %x, float %y) {
+; X64-LABEL: fcmp_une:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jne LBB13_2
+; X64-NEXT: jnp LBB13_1
+; X64-NEXT: LBB13_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB13_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_une:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: setp %cl
+; GISEL-X64-NEXT: orb %al, %cl
+; GISEL-X64-NEXT: testb $1, %cl
+; GISEL-X64-NEXT: je LBB13_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB13_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp une float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oeq1(float %x) {
+; X64-LABEL: fcmp_oeq1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB14_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB14_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_oeq1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: setnp %cl
+; GISEL-X64-NEXT: andb %al, %cl
+; GISEL-X64-NEXT: testb $1, %cl
+; GISEL-X64-NEXT: je LBB14_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB14_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp oeq float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oeq2(float %x) {
+; X64-LABEL: fcmp_oeq2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jne LBB15_1
+; X64-NEXT: jp LBB15_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB15_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_oeq2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: setnp %cl
+; GISEL-X64-NEXT: andb %al, %cl
+; GISEL-X64-NEXT: testb $1, %cl
+; GISEL-X64-NEXT: je LBB15_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB15_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp oeq float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt1(float %x) {
+; SDAG-X64-LABEL: fcmp_ogt1:
+; SDAG-X64: ## %bb.0:
+; SDAG-X64-NEXT: xorl %eax, %eax
+; SDAG-X64-NEXT: testb %al, %al
+; SDAG-X64-NEXT: je LBB16_1
+; SDAG-X64-NEXT: ## %bb.2: ## %bb1
+; SDAG-X64-NEXT: xorl %eax, %eax
+; SDAG-X64-NEXT: retq
+; SDAG-X64-NEXT: LBB16_1: ## %bb2
+; SDAG-X64-NEXT: movl $1, %eax
+; SDAG-X64-NEXT: retq
+
+; FASTISEL-X64-LABEL: fcmp_ogt1:
+; FASTISEL-X64: ## %bb.0:
+; FASTISEL-X64: movl $1, %eax
+; FASTISEL-X64: retq
+
+; GISEL-X64-LABEL: fcmp_ogt1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB16_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB16_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ogt float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt2(float %x) {
+; X64-LABEL: fcmp_ogt2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jbe LBB17_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB17_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ogt2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB17_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB17_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ogt float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge1(float %x) {
+; X64-LABEL: fcmp_oge1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB18_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB18_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_oge1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB18_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB18_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp oge float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge2(float %x) {
+; X64-LABEL: fcmp_oge2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jb LBB19_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB19_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_oge2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB19_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB19_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp oge float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt1(float %x) {
+; GISEL-X64-LABEL: fcmp_olt1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB20_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB20_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp olt float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt2(float %x) {
+; X64-LABEL: fcmp_olt2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: jbe LBB21_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB21_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_olt2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB21_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB21_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp olt float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole1(float %x) {
+; X64-LABEL: fcmp_ole1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB22_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB22_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ole1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB22_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB22_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ole float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole2(float %x) {
+; X64-LABEL: fcmp_ole2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: jb LBB23_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB23_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ole2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB23_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB23_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ole float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one1(float %x) {
+; GISEL-X64-LABEL: fcmp_one1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB24_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB24_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp one float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one2(float %x) {
+; X64-LABEL: fcmp_one2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: je LBB25_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB25_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_one2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB25_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB25_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp one float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord1(float %x) {
+; X64-LABEL: fcmp_ord1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB26_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB26_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ord1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setnp %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB26_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB26_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ord float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord2(float %x) {
+; X64-LABEL: fcmp_ord2:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB27_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB27_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ord2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setnp %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB27_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB27_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ord float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno1(float %x) {
+; X64-LABEL: fcmp_uno1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB28_2
+; X64-NEXT: ## %bb.1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB28_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_uno1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setp %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: jne LBB28_2
+; GISEL-X64-NEXT: ## %bb.1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB28_2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp uno float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno2(float %x) {
+; X64-LABEL: fcmp_uno2:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jp LBB29_2
+; X64-NEXT: ## %bb.1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB29_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_uno2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setp %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: jne LBB29_2
+; GISEL-X64-NEXT: ## %bb.1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB29_2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp uno float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq1(float %x) {
+; GISEL-X64-LABEL: fcmp_ueq1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: jne LBB30_2
+; GISEL-X64-NEXT: ## %bb.1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB30_2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ueq float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq2(float %x) {
+; X64-LABEL: fcmp_ueq2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: je LBB31_2
+; X64-NEXT: ## %bb.1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB31_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ueq2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: jne LBB31_2
+; GISEL-X64-NEXT: ## %bb.1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB31_2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ueq float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt1(float %x) {
+; X64-LABEL: fcmp_ugt1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jnp LBB32_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB32_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ugt1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB32_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB32_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ugt float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt2(float %x) {
+; X64-LABEL: fcmp_ugt2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: jae LBB33_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB33_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ugt2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB33_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB33_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ugt float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge1(float %x) {
+; GISEL-X64-LABEL: fcmp_uge1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB34_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB34_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp uge float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge2(float %x) {
+; X64-LABEL: fcmp_uge2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: ja LBB35_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB35_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_uge2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm1
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB35_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB35_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp uge float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult1(float %x) {
+; X64-LABEL: fcmp_ult1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jnp LBB36_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB36_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ult1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB36_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB36_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ult float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult2(float %x) {
+; X64-LABEL: fcmp_ult2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jae LBB37_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB37_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ult2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB37_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB37_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ult float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule1(float %x) {
+; GISEL-X64-LABEL: fcmp_ule1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB38_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB38_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ule float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule2(float %x) {
+; X64-LABEL: fcmp_ule2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: ja LBB39_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB39_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_ule2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB39_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB39_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp ule float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une1(float %x) {
+; X64-LABEL: fcmp_une1:
+; X64: ## %bb.0:
+; X64-NEXT: ucomiss %xmm0, %xmm0
+; X64-NEXT: jnp LBB40_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB40_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_une1:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: ucomiss %xmm0, %xmm0
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: setp %cl
+; GISEL-X64-NEXT: orb %al, %cl
+; GISEL-X64-NEXT: testb $1, %cl
+; GISEL-X64-NEXT: je LBB40_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB40_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp une float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une2(float %x) {
+; X64-LABEL: fcmp_une2:
+; X64: ## %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: jne LBB41_2
+; X64-NEXT: jnp LBB41_1
+; X64-NEXT: LBB41_2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB41_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: fcmp_une2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: ucomiss %xmm1, %xmm0
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: setp %cl
+; GISEL-X64-NEXT: orb %al, %cl
+; GISEL-X64-NEXT: testb $1, %cl
+; GISEL-X64-NEXT: je LBB41_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB41_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+ %1 = fcmp une float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/isel-brcond-icmp.ll b/llvm/test/CodeGen/X86/isel-brcond-icmp.ll
new file mode 100644
index 000000000000..59a45d9d72f5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-brcond-icmp.ll
@@ -0,0 +1,1107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel=0 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X64,SDAG
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X64,FASTISEL
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -global-isel=0 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X86,SDAG
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=X86,FASTISEL
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X86
+
+define i32 @icmp_eq_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_eq_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jne LBB0_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB0_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_eq_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB0_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB0_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_eq_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jne LBB0_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB0_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_eq_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: sete %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB0_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB0_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp eq i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ne_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_ne_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: je LBB1_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB1_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_ne_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB1_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB1_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_ne_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: je LBB1_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB1_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_ne_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setne %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB1_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB1_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ne i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ugt_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_ugt_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jbe LBB2_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB2_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_ugt_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB2_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB2_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_ugt_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jbe LBB2_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB2_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_ugt_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: seta %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB2_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB2_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ugt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_uge_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_uge_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jb LBB3_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB3_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_uge_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB3_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB3_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_uge_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jb LBB3_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB3_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_uge_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setae %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB3_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB3_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp uge i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ult_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_ult_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jae LBB4_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB4_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_ult_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB4_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB4_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_ult_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jae LBB4_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB4_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_ult_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setb %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB4_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB4_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ult i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ule_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_ule_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: ja LBB5_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB5_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_ule_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB5_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB5_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_ule_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: ja LBB5_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB5_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_ule_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setbe %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB5_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB5_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ule i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sgt_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_sgt_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jle LBB6_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB6_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_sgt_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setg %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB6_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB6_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_sgt_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jle LBB6_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB6_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_sgt_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setg %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB6_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB6_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp sgt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sge_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_sge_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jl LBB7_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB7_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_sge_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setge %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB7_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB7_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_sge_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jl LBB7_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB7_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_sge_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setge %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB7_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB7_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp sge i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_slt_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_slt_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jge LBB8_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB8_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_slt_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setl %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB8_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB8_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_slt_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jge LBB8_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB8_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_slt_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setl %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB8_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB8_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp slt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sle_2(i32 %x, i32 %y) {
+; X64-LABEL: icmp_sle_2:
+; X64: ## %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: jg LBB9_1
+; X64-NEXT: ## %bb.2: ## %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: LBB9_1: ## %bb2
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_sle_2:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %esi, %edi
+; GISEL-X64-NEXT: setle %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB9_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB9_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; X86-LABEL: icmp_sle_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+\(%esp\), %eax|%eax, [0-9]+\(%esp\)}}
+; X86-NEXT: jg LBB9_1
+; X86-NEXT: ## %bb.2: ## %bb1
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: LBB9_1: ## %bb2
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: retl
+;
+; GISEL-X86-LABEL: icmp_sle_2:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setle %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB9_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB9_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp sle i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq(i32 %x) {
+; SDAG-LABEL: icmp_eq:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: movb $1, %al
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB10_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB10_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_eq:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: xorl %eax, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_eq:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: sete %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB10_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB10_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_eq:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: sete %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB10_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB10_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp eq i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ne(i32 %x) {
+; SDAG-LABEL: icmp_ne:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB11_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB11_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_ne:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: movl $1, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_ne:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setne %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB11_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB11_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_ne:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setne %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB11_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB11_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ne i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ugt(i32 %x) {
+; SDAG-LABEL: icmp_ugt:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB12_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB12_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_ugt:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: movl $1, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_ugt:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: seta %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB12_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB12_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_ugt:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: seta %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB12_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB12_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ugt i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_uge(i32 %x) {
+; SDAG-LABEL: icmp_uge:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: movb $1, %al
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB13_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB13_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-X64-LABEL: icmp_uge:
+; FASTISEL-X64: ## %bb.0:
+; FASTISEL-X64-NEXT: xorl %eax, %eax
+; FASTISEL-X64-NEXT: retq
+;
+; GISEL-X64-LABEL: icmp_uge:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setae %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB13_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB13_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_uge:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setae %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB13_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB13_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp uge i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ult(i32 %x) {
+; SDAG-LABEL: icmp_ult:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB14_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB14_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-X64-LABEL: icmp_ult:
+; FASTISEL-X64: ## %bb.0:
+; FASTISEL-X64-NEXT: movl $1, %eax
+; FASTISEL-X64-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_ult:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setb %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB14_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB14_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_ult:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setb %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB14_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB14_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ult i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ule(i32 %x) {
+; SDAG-LABEL: icmp_ule:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: movb $1, %al
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB15_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB15_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_ule:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: xorl %eax, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_ule:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setbe %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB15_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB15_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_ule:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setbe %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB15_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB15_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp ule i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sgt(i32 %x) {
+; SDAG-LABEL: icmp_sgt:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB16_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB16_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_sgt:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: movl $1, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_sgt:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setg %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB16_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB16_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_sgt:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setg %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB16_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB16_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp sgt i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sge(i32 %x) {
+; SDAG-LABEL: icmp_sge:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: movb $1, %al
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB17_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB17_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_sge:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: xorl %eax, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_sge:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setge %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB17_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB17_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_sge:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setge %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB17_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB17_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp sge i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_slt(i32 %x) {
+; SDAG-LABEL: icmp_slt:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB18_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB18_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_slt:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: movl $1, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_slt:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setl %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB18_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB18_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_slt:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setl %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB18_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB18_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp slt i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sle(i32 %x) {
+; SDAG-LABEL: icmp_sle:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: movb $1, %al
+; SDAG-NEXT: testb %al, %al
+; SDAG-NEXT: je LBB19_1
+; SDAG-NEXT: ## %bb.2: ## %bb1
+; SDAG-NEXT: xorl %eax, %eax
+; SDAG-NEXT: ret{{q|l}}
+; SDAG-NEXT: LBB19_1: ## %bb2
+; SDAG-NEXT: movl $1, %eax
+; SDAG-NEXT: ret{{q|l}}
+;
+; FASTISEL-LABEL: icmp_sle:
+; FASTISEL: ## %bb.0:
+; FASTISEL-NEXT: xorl %eax, %eax
+; FASTISEL-NEXT: ret{{q|l}}
+;
+; GISEL-X64-LABEL: icmp_sle:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: cmpl %edi, %edi
+; GISEL-X64-NEXT: setle %al
+; GISEL-X64-NEXT: testb $1, %al
+; GISEL-X64-NEXT: je LBB19_1
+; GISEL-X64-NEXT: ## %bb.2: ## %bb1
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: retq
+; GISEL-X64-NEXT: LBB19_1: ## %bb2
+; GISEL-X64-NEXT: movl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; GISEL-X86-LABEL: icmp_sle:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT: cmpl %eax, %eax
+; GISEL-X86-NEXT: setle %al
+; GISEL-X86-NEXT: testb $1, %al
+; GISEL-X86-NEXT: je LBB19_1
+; GISEL-X86-NEXT: ## %bb.2: ## %bb1
+; GISEL-X86-NEXT: xorl %eax, %eax
+; GISEL-X86-NEXT: retl
+; GISEL-X86-NEXT: LBB19_1: ## %bb2
+; GISEL-X86-NEXT: movl $1, %eax
+; GISEL-X86-NEXT: retl
+ %1 = icmp sle i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
index 7f8115dc1ce3..b5f3e7899188 100644
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -1282,3 +1282,35 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(ptr %arg) {
%tmp8 = or i32 %tmp7, %tmp30
ret i32 %tmp8
}
+
+define i32 @pr80911_vector_load_multiuse(ptr %ptr, ptr %clobber) nounwind {
+; CHECK-LABEL: pr80911_vector_load_multiuse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl (%edx), %esi
+; CHECK-NEXT: movzwl (%edx), %eax
+; CHECK-NEXT: movl $0, (%ecx)
+; CHECK-NEXT: movl %esi, (%edx)
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: pr80911_vector_load_multiuse:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movl (%rdi), %ecx
+; CHECK64-NEXT: movzwl (%rdi), %eax
+; CHECK64-NEXT: movl $0, (%rsi)
+; CHECK64-NEXT: movl %ecx, (%rdi)
+; CHECK64-NEXT: retq
+ %load = load <4 x i8>, ptr %ptr, align 16
+ store i32 0, ptr %clobber
+ store <4 x i8> %load, ptr %ptr, align 16
+ %e1 = extractelement <4 x i8> %load, i64 1
+ %e1.ext = zext i8 %e1 to i32
+ %e1.ext.shift = shl nuw nsw i32 %e1.ext, 8
+ %e0 = extractelement <4 x i8> %load, i64 0
+ %e0.ext = zext i8 %e0 to i32
+ %res = or i32 %e1.ext.shift, %e0.ext
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll
new file mode 100644
index 000000000000..b4ac3fc783e0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr81136.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
+
+define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) {
+; CHECK-LABEL: PR81136:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovd %edi, %xmm0
+; CHECK-NEXT: vmovd %esi, %xmm1
+; CHECK-NEXT: vmovdqa (%rdx), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm4 = [128,1]
+; CHECK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
+; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vandnpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovmskpd %ymm0, %eax
+; CHECK-NEXT: popcntl %eax, %eax
+; CHECK-NEXT: negq %rax
+; CHECK-NEXT: retq
+ %v0 = bitcast i32 %a0 to <2 x i16>
+ %v1 = bitcast i32 %a1 to <2 x i16>
+ %cmp15 = icmp sle <2 x i16> %v1, %v0
+ %conv16 = sext <2 x i1> %cmp15 to <2 x i64>
+ %shuffle29 = shufflevector <2 x i64> %conv16, <2 x i64> <i64 128, i64 1>, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ %data = load volatile <4 x i64>, ptr %a2, align 32
+ %cmp65 = icmp ne <4 x i64> %data, <i64 -2071602529, i64 -1537047284, i64 717942021, i64 597457239>
+ %cmp67 = icmp ne <4 x i64> %shuffle29, zeroinitializer
+ %and = and <4 x i1> %cmp65, %cmp67
+ %mask = bitcast <4 x i1> %and to i4
+ %cnt = tail call i4 @llvm.ctpop.i4(i4 %mask)
+ %cntz = zext i4 %cnt to i64
+ %res = sub nsw i64 0, %cntz
+ ret i64 %res
+}
+declare i4 @llvm.ctpop.i4(i4)
diff --git a/llvm/test/CodeGen/X86/statepoint-relocate-undef.ll b/llvm/test/CodeGen/X86/statepoint-relocate-undef.ll
new file mode 100644
index 000000000000..69e6976c3203
--- /dev/null
+++ b/llvm/test/CodeGen/X86/statepoint-relocate-undef.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare void @use(...)
+declare void @f()
+declare token @llvm.experimental.gc.statepoint.p0(i64, i32, ptr, i32, i32, ...)
+declare ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token, i32, i32)
+
+;; Check that llvm doesn't crash if relocate with undef base/derived ptr survives until isel
+define void @test_gcrelocate_undef(ptr addrspace(1) %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test_gcrelocate_undef:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movq %rdi, (%rsp)
+; CHECK-NEXT: callq f@PLT
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: movl $4278124286, %edi # imm = 0xFEFEFEFE
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %tok = tail call token (i64, i32, ptr, i32, i32, ...)
+ @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) @f, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %ptr, ptr addrspace(1) undef), "deopt" (ptr addrspace(1) %ptr, i32 undef)]
+ %a = call ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %tok, i32 0, i32 1)
+ call void (...) @use(ptr addrspace(1) %a)
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
index 75bbae1050d6..401ed9f7bc5a 100644
--- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
+++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -184,3 +184,186 @@ return:
%retval = phi ptr [ %ptr, %if.then ], [ %obj, %entry ]
ret ptr %retval
}
+
+define ptr @memset_tailc(ptr %ret_val, i64 %sz) nounwind {
+; CHECK-LABEL: memset_tailc:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: je LBB4_1
+; CHECK-NEXT: ## %bb.2: ## %if.then
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: jmp _memset ## TAILCALL
+; CHECK-NEXT: LBB4_1: ## %return
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %cmp = icmp eq ptr %ret_val, null
+ br i1 %cmp, label %return, label %if.then
+
+if.then:
+ tail call void @llvm.memset.p0.i64(ptr nonnull align 1 %ret_val, i8 0, i64 %sz, i1 false)
+ br label %return
+
+return:
+ ret ptr %ret_val
+}
+
+define ptr @memcpy_tailc(ptr %ret_val, i64 %sz, ptr %src) nounwind {
+; CHECK-LABEL: memcpy_tailc:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: je LBB5_1
+; CHECK-NEXT: ## %bb.2: ## %if.then
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: movq %rdx, %rsi
+; CHECK-NEXT: movq %rax, %rdx
+; CHECK-NEXT: jmp _memcpy ## TAILCALL
+; CHECK-NEXT: LBB5_1: ## %return
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
+entry:
+ %cmp = icmp eq i64 %sz, 0
+ br i1 %cmp, label %return, label %if.then
+
+if.then:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 %ret_val, ptr align 1 %src, i64 %sz, i1 false)
+ br label %return
+
+return:
+ %phi = phi ptr [ %ret_val, %if.then ], [ %src, %entry ]
+ ret ptr %phi
+}
+
+define ptr @strcpy_legal_and_baz_illegal(ptr %arg, i64 %sz, ptr %2) nounwind {
+; CHECK-LABEL: strcpy_legal_and_baz_illegal:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rsi, %r15
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: callq _malloc
+; CHECK-NEXT: testq %r15, %r15
+; CHECK-NEXT: je LBB6_1
+; CHECK-NEXT: ## %bb.2: ## %if.then
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: jmp _strcpy ## TAILCALL
+; CHECK-NEXT: LBB6_1: ## %if.else
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: callq _baz
+; CHECK-NEXT: movq %r14, %rax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: retq
+entry:
+ %strcpy_ret_val = tail call noalias ptr @malloc(i64 %sz)
+ %cmp = icmp eq i64 %sz, 0
+ br i1 %cmp, label %if.else, label %if.then
+
+if.then:
+ %rv_unused = tail call ptr @strcpy(ptr dereferenceable(1) %strcpy_ret_val, ptr dereferenceable(1) %2)
+ br label %return
+
+if.else:
+ %rv_unused_2 = tail call ptr @baz(ptr %arg, ptr %2)
+ br label %return
+
+return:
+ %phi = phi ptr [ %strcpy_ret_val, %if.then ], [ %arg, %if.else ]
+ ret ptr %phi
+}
+
+define ptr @baz_illegal_tailc(ptr %ret_val, ptr %arg) nounwind {
+; CHECK-LABEL: baz_illegal_tailc:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: je LBB7_2
+; CHECK-NEXT: ## %bb.1: ## %if.then
+; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: callq _baz
+; CHECK-NEXT: LBB7_2: ## %return
+; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %cmp = icmp eq ptr %ret_val, null
+ br i1 %cmp, label %return, label %if.then
+
+if.then:
+ %rv = tail call ptr @baz(ptr %ret_val, ptr %arg)
+ br label %return
+
+return:
+ ret ptr %ret_val
+}
+
+define ptr @memset_illegal_tailc(ptr %arg, i64 %sz, ptr %ret_val_1, ptr %ret_val_2) nounwind {
+; CHECK-LABEL: memset_illegal_tailc:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: je LBB8_2
+; CHECK-NEXT: ## %bb.1: ## %if.then
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rcx, %rbx
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: callq _memset
+; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: LBB8_2: ## %return
+; CHECK-NEXT: retq
+entry:
+ %cmp = icmp eq i64 %sz, 0
+ br i1 %cmp, label %return, label %if.then
+
+if.then:
+ tail call void @llvm.memset.p0.i64(ptr align 1 %arg, i8 0, i64 %sz, i1 false)
+ br label %return
+
+return:
+ %phi = phi ptr [ %ret_val_2, %if.then ], [ %ret_val_1, %entry ]
+ ret ptr %phi
+}
+
+define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) nounwind {
+; CHECK-LABEL: strcpy_illegal_tailc:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: je LBB9_2
+; CHECK-NEXT: ## %bb.1: ## %if.then
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: callq _strcpy
+; CHECK-NEXT: LBB9_2: ## %return
+; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ %cmp = icmp eq i64 %sz, 0
+ br i1 %cmp, label %return, label %if.then
+
+if.then:
+ %6 = tail call ptr @strcpy(ptr dereferenceable(1) %dest, ptr dereferenceable(1) %src)
+ br label %return
+
+return:
+ ret ptr %src
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1)
+declare noalias ptr @malloc(i64)
+declare ptr @strcpy(ptr noalias returned writeonly, ptr noalias nocapture readonly)
+declare ptr @baz(ptr, ptr)
diff --git a/llvm/test/CodeGen/Xtensa/lit.local.cfg b/llvm/test/CodeGen/Xtensa/lit.local.cfg
new file mode 100644
index 000000000000..e81bfa773f36
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "Xtensa" in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/CodeGen/Xtensa/simple.ll b/llvm/test/CodeGen/Xtensa/simple.ll
new file mode 100644
index 000000000000..a2c55aa51383
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/simple.ll
@@ -0,0 +1,5 @@
+; RUN: llc -mtriple=xtensa -filetype=asm %s -o - | FileCheck %s
+; RUN: llc -mtriple=xtensa -filetype=obj %s -o - | llvm-objdump --arch=xtensa -d - | FileCheck %s --check-prefix=DUMP
+
+; CHECK: .text
+; DUMP: file format elf32-xtensa
diff --git a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll
index b3bdf966f887..b795ad1b61a9 100644
--- a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll
+++ b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll
@@ -36,15 +36,15 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
define amdgpu_kernel void @kernel1(
; CHECK: {{.*}}DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +4, DW_OP_lit1, DW_OP_swap, DW_OP_xderef)
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +0, DW_OP_lit1, DW_OP_swap, DW_OP_xderef)
; CHECK-NEXT: DW_AT_name {{.*}}"ArgN"
i32 %ArgN,
; CHECK: {{.*}}DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +8, DW_OP_lit1, DW_OP_swap, DW_OP_xderef)
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +4, DW_OP_lit1, DW_OP_swap, DW_OP_xderef)
; CHECK-NEXT: DW_AT_name {{.*}}"ArgA"
ptr addrspace(1) %ArgA,
; CHECK: {{.*}}DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +16, DW_OP_lit1, DW_OP_swap, DW_OP_xderef)
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (DW_OP_fbreg +12, DW_OP_lit1, DW_OP_swap, DW_OP_xderef)
; CHECK-NEXT: DW_AT_name {{.*}}"ArgB"
ptr addrspace(1) %ArgB) !dbg !13 {
entry:
diff --git a/llvm/test/DebugInfo/COFF/swift.ll b/llvm/test/DebugInfo/COFF/swift.ll
index 81922cc4a374..710e12979519 100644
--- a/llvm/test/DebugInfo/COFF/swift.ll
+++ b/llvm/test/DebugInfo/COFF/swift.ll
@@ -2,12 +2,12 @@
; RUN: llc -filetype=obj < %s | llvm-readobj --codeview - | FileCheck %s --check-prefix=OBJ
; ASM: .short 4412 # Record kind: S_COMPILE3
-; ASM-NEXT: .long 83 # Flags and language
+; ASM-NEXT: .long 19 # Flags and language
; ASM-NEXT: .short 208 # CPUType
; OBJ-LABEL: Compile3Sym {
; OBJ-NEXT: Kind: S_COMPILE3 (0x113C)
-; OBJ-NEXT: Language: Swift (0x53)
+; OBJ-NEXT: Language: Swift (0x13)
; OBJ-NEXT: Flags [ (0x0)
; OBJ-NEXT: ]
; OBJ-NEXT: Machine: X64 (0xD0)
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll
index 705484658280..8b226aa66330 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/codegenprepare/sunk-addr.ll
@@ -3,6 +3,11 @@
; RUN: -mtriple=x86_64-unknown-unknown %s -o - \
; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg."
+;; Test with RemoveDIs non-intrinsic debug-info too.
+; RUN: llc -start-before=codegenprepare -stop-after=codegenprepare \
+; RUN: -mtriple=x86_64-unknown-unknown %s -o - --try-experimental-debuginfo-iterators \
+; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg."
+
;; Check that when CodeGenPrepare moves an address computation to a block it's
;; used in its dbg.assign uses are updated.
;;
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll
index c4b209de7701..f7f126cf3c51 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll
@@ -1,6 +1,7 @@
; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=declare-to-assign -o - | FileCheck %s
-; CHECK: call void @llvm.dbg.declare
+; CHECK: call void @llvm.dbg.assign
define dso_local void @f() sanitize_hwaddress !dbg !9 {
entry:
diff --git a/llvm/test/DebugInfo/Generic/ipsccp-remap-assign-id.ll b/llvm/test/DebugInfo/Generic/ipsccp-remap-assign-id.ll
new file mode 100644
index 000000000000..13ac88de5cde
--- /dev/null
+++ b/llvm/test/DebugInfo/Generic/ipsccp-remap-assign-id.ll
@@ -0,0 +1,59 @@
+; RUN: opt -passes=ipsccp %s -S -o - | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators -passes=ipsccp %s -S -o - | FileCheck %s
+
+;; Check the dbg.assign DIAssignID operand gets remapped after cloning.
+
+; CHECK: %tmp = alloca [4096 x i32], i32 0, align 16, !DIAssignID ![[ID1:[0-9]+]]
+; CHECK-NEXT: dbg.assign(metadata i1 undef, metadata !{{.*}}, metadata !DIExpression(), metadata ![[ID1]], metadata ptr %tmp, metadata !DIExpression())
+;
+; CHECK: %tmp = alloca [4096 x i32], i32 0, align 16, !DIAssignID ![[ID2:[0-9]+]]
+; CHECK-NEXT: dbg.assign(metadata i1 undef, metadata !{{.*}}, metadata !DIExpression(), metadata ![[ID2]], metadata ptr %tmp, metadata !DIExpression())
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+define void @inv_txfm_add_dct_dct_4x4_c() {
+entry:
+ call void @inv_txfm_add_c(ptr @dav1d_inv_dct4_1d_c)
+ ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+
+; Function Attrs: noinline
+define void @inv_txfm_add_c(ptr %first_1d_fn) #2 {
+entry:
+ %tmp = alloca [4096 x i32], i32 0, align 16, !DIAssignID !5
+ tail call void @llvm.dbg.assign(metadata i1 undef, metadata !6, metadata !DIExpression(), metadata !5, metadata ptr %tmp, metadata !DIExpression()), !dbg !16
+ call void @llvm.memset.p0.i64(ptr %tmp, i8 0, i64 0, i1 false), !DIAssignID !17
+ call void %first_1d_fn(ptr null, i64 0, i32 0, i32 0)
+ ret void
+}
+
+declare void @dav1d_inv_dct4_1d_c(ptr, i64, i32, i32)
+
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+attributes #2 = { noinline }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, splitDebugInlining: false, nameTableKind: GNU)
+!1 = !DIFile(filename: "itx_tmpl.c", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!5 = distinct !DIAssignID()
+!6 = !DILocalVariable(name: "tmp", scope: !7, file: !1, line: 78, type: !10)
+!7 = distinct !DISubprogram(name: "inv_txfm_add_c", scope: !1, file: !1, line: 41, type: !8, scopeLine: 45, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = distinct !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 131072, elements: !2)
+!11 = !DIDerivedType(tag: DW_TAG_typedef, name: "int32_t", file: !12, line: 26, baseType: !13)
+!12 = !DIFile(filename: "stdint-intn.h", directory: ".")
+!13 = !DIDerivedType(tag: DW_TAG_typedef, name: "__int32_t", file: !14, line: 41, baseType: !15)
+!14 = !DIFile(filename: "types.h", directory: ".")
+!15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!16 = !DILocation(line: 0, scope: !7)
+!17 = distinct !DIAssignID()
diff --git a/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll b/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll
new file mode 100644
index 000000000000..77c9aa5764fb
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/dont-drop-dbg-assigns-in-isels.ll
@@ -0,0 +1,46 @@
+; RUN: llc %s -fast-isel -start-after=codegenprepare -stop-before=finalize-isel -o - | FileCheck %s
+; RUN: llc %s -fast-isel -start-after=codegenprepare -stop-before=finalize-isel -o - --try-experimental-debuginfo-iterators | FileCheck %s
+; RUN: llc %s -global-isel -start-after=codegenprepare -stop-before=finalize-isel -o - | FileCheck %s
+; RUN: llc %s -global-isel -start-after=codegenprepare -stop-before=finalize-isel -o - --try-experimental-debuginfo-iterators | FileCheck %s
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; CHECK: DBG_VALUE
+
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+define dso_local i32 @foo(i32 %a, i32 %b) local_unnamed_addr !dbg !8 {
+entry:
+ call void @llvm.dbg.assign(metadata !DIArgList(i32 %a, i32 %b), metadata !16, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), metadata !21, metadata ptr undef, metadata !DIExpression()), !dbg !17
+ %mul = mul nsw i32 %b, %a, !dbg !18
+ ret i32 %mul, !dbg !18
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !19, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "debug_value_list_selectiondag.cpp", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 11.0.0"}
+!8 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13)
+!9 = !DIFile(filename: ".\\debug_value_list.cpp", directory: "/tmp")
+!10 = !DISubroutineType(types: !11)
+!11 = !{!12, !12, !12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{!14, !15, !16}
+!14 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !9, line: 1, type: !12)
+!15 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !9, line: 1, type: !12)
+!16 = !DILocalVariable(name: "c", scope: !8, file: !9, line: 2, type: !12)
+!17 = !DILocation(line: 0, scope: !8)
+!18 = !DILocation(line: 3, scope: !8)
+!19 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!20 = !DILocalVariable(name: "d", scope: !8, file: !9, line: 2, type: !12)
+!21 = distinct !DIAssignID()
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll
new file mode 100644
index 000000000000..ec8d0340de45
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll
@@ -0,0 +1,60 @@
+; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators -passes=hwasan -S -o - %s | FileCheck %s
+
+source_filename = "test.ll"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android"
+
+declare void @g(ptr, ptr, ptr, ptr, ptr, ptr)
+
+; Function Attrs: sanitize_hwaddress
+define void @f() #0 !dbg !7 {
+entry:
+ %nodebug0 = alloca ptr, align 8
+ %nodebug1 = alloca ptr, align 8
+ %nodebug2 = alloca ptr, align 8
+ %nodebug3 = alloca ptr, align 8
+ ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]]
+ %a = alloca ptr, align 8, !DIAssignID !13
+ ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32)
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15
+ ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]]
+ %b = alloca ptr, align 8, !DIAssignID !16
+ ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96)
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15
+ call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b)
+ ret void, !dbg !18
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1
+
+attributes #0 = { sanitize_hwaddress }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "x.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!6 = !{!"clang"}
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12)
+!12 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!13 = distinct !DIAssignID()
+!14 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !10)
+!15 = !DILocation(line: 0, scope: !7)
+!16 = distinct !DIAssignID()
+!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10)
+!18 = !DILocation(line: 1, column: 37, scope: !7)
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-declare-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-declare-tag-offset.ll
index d5e362bc800a..50bad1925026 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-declare-tag-offset.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-declare-tag-offset.ll
@@ -1,5 +1,10 @@
; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s
+;; Also test with RemoveDIs to verify that debug intrinsics immediately
+;; preceding an alloca (or other instruction of interest to stack tagging) will
+;; be correctly processed.
+; RUN: opt --try-experimental-debuginfo-iterators -passes=hwasan -S -o - %s | FileCheck %s
+
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android"
@@ -12,11 +17,11 @@ entry:
%nodebug2 = alloca ptr
%nodebug3 = alloca ptr
%a = alloca ptr
- %b = alloca ptr
; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32)
call void @llvm.dbg.declare(metadata ptr %a, metadata !12, metadata !DIExpression()), !dbg !14
; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32)
call void @llvm.dbg.declare(metadata ptr %a, metadata !12, metadata !DIExpression()), !dbg !14
+ %b = alloca ptr
; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96)
call void @llvm.dbg.declare(metadata ptr %b, metadata !13, metadata !DIExpression()), !dbg !14
; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96)
diff --git a/llvm/test/Instrumentation/PGOForceFunctionAttrs/basic.ll b/llvm/test/Instrumentation/PGOForceFunctionAttrs/basic.ll
new file mode 100644
index 000000000000..29ebc0366040
--- /dev/null
+++ b/llvm/test/Instrumentation/PGOForceFunctionAttrs/basic.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -passes=pgo-force-function-attrs -pgo-kind=pgo-instr-use-pipeline -S -pgo-cold-func-opt=default | FileCheck %s --check-prefixes=NONE,CHECK
+; RUN: opt < %s -passes=pgo-force-function-attrs -pgo-kind=pgo-instr-use-pipeline -S -pgo-cold-func-opt=optsize | FileCheck %s --check-prefixes=OPTSIZE,CHECK
+; RUN: opt < %s -passes=pgo-force-function-attrs -pgo-kind=pgo-instr-use-pipeline -S -pgo-cold-func-opt=minsize | FileCheck %s --check-prefixes=MINSIZE,CHECK
+; RUN: opt < %s -passes=pgo-force-function-attrs -pgo-kind=pgo-instr-use-pipeline -S -pgo-cold-func-opt=optnone | FileCheck %s --check-prefixes=OPTNONE,CHECK
+
+; Should be no changes without profile data
+; RUN: opt < %s -passes=pgo-force-function-attrs -S -pgo-cold-func-opt=minsize | FileCheck %s --check-prefixes=NONE,CHECK
+
+; NONE-NOT: Function Attrs:
+; OPTSIZE: Function Attrs: optsize{{$}}
+; MINSIZE: Function Attrs: minsize{{$}}
+; OPTNONE: Function Attrs: noinline optnone{{$}}
+; CHECK: define void @cold()
+
+; CHECK: Function Attrs: optsize{{$}}
+; CHECK-NEXT: define void @cold_optsize()
+
+; CHECK: Function Attrs: minsize{{$}}
+; CHECK-NEXT: define void @cold_minsize()
+
+; CHECK: Function Attrs: noinline optnone{{$}}
+; CHECK-NEXT: define void @cold_optnone()
+
+; NONE: Function Attrs: cold{{$}}
+; OPTSIZE: Function Attrs: cold optsize{{$}}
+; MINSIZE: Function Attrs: cold minsize{{$}}
+; OPTNONE: Function Attrs: cold noinline optnone{{$}}
+; CHECK-NEXT: define void @cold_attr()
+
+; CHECK-NOT: Function Attrs: {{.*}}optsize
+; CHECK-NOT: Function Attrs: {{.*}}minsize
+; CHECK-NOT: Function Attrs: {{.*}}optnone
+
+@s = global i32 0
+
+define void @cold() !prof !27 {
+ store i32 1, ptr @s, align 4
+ ret void
+}
+
+define void @cold_optsize() optsize !prof !27 {
+ store i32 1, ptr @s, align 4
+ ret void
+}
+
+define void @cold_minsize() minsize !prof !27 {
+ store i32 1, ptr @s, align 4
+ ret void
+}
+
+define void @cold_optnone() noinline optnone !prof !27 {
+ store i32 1, ptr @s, align 4
+ ret void
+}
+
+define void @cold_attr() cold {
+ store i32 1, ptr @s, align 4
+ ret void
+}
+
+define void @hot() !prof !28 {
+ %l = load i32, ptr @s, align 4
+ %add = add nsw i32 %l, 4
+ store i32 %add, ptr @s, align 4
+ ret void
+}
+
+attributes #0 = { optsize }
+attributes #1 = { minsize }
+attributes #2 = { noinline optnone }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 9040}
+!4 = !{!"MaxCount", i64 9000}
+!5 = !{!"MaxInternalCount", i64 0}
+!6 = !{!"MaxFunctionCount", i64 9000}
+!7 = !{!"NumCounts", i64 5}
+!8 = !{!"NumFunctions", i64 5}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26}
+!11 = !{i32 10000, i64 9000, i32 1}
+!12 = !{i32 100000, i64 9000, i32 1}
+!13 = !{i32 200000, i64 9000, i32 1}
+!14 = !{i32 300000, i64 9000, i32 1}
+!15 = !{i32 400000, i64 9000, i32 1}
+!16 = !{i32 500000, i64 9000, i32 1}
+!17 = !{i32 600000, i64 9000, i32 1}
+!18 = !{i32 700000, i64 9000, i32 1}
+!19 = !{i32 800000, i64 9000, i32 1}
+!20 = !{i32 900000, i64 9000, i32 1}
+!21 = !{i32 950000, i64 9000, i32 1}
+!22 = !{i32 990000, i64 9000, i32 1}
+!23 = !{i32 999000, i64 10, i32 5}
+!24 = !{i32 999900, i64 10, i32 5}
+!25 = !{i32 999990, i64 10, i32 5}
+!26 = !{i32 999999, i64 10, i32 5}
+!27 = !{!"function_entry_count", i64 10}
+!28 = !{!"function_entry_count", i64 9000}
diff --git a/llvm/test/MC/AArch64/arm64-adr.s b/llvm/test/MC/AArch64/arm64-adr.s
index 131e545d3bb5..d1a91b6acf39 100644
--- a/llvm/test/MC/AArch64/arm64-adr.s
+++ b/llvm/test/MC/AArch64/arm64-adr.s
@@ -23,6 +23,16 @@ adrp x0, foo
// CHECK: adrp x0, foo // encoding: [A,A,A,0x90'A']
// CHECK-NEXT: // fixup A - offset: 0, value: foo, kind: fixup_aarch64_pcrel_adrp_imm21
+// CHECK: adrp x0, lsl // encoding: [A,A,A,0x90'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: lsl, kind: fixup_aarch64_pcrel_adrp_imm21
+// CHECK-NEXT: adrp x0, ror // encoding: [A,A,A,0x90'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: ror, kind: fixup_aarch64_pcrel_adrp_imm21
+// CHECK-NEXT: adr x0, uxtb // encoding: [A,A,A,0x10'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: uxtb, kind: fixup_aarch64_pcrel_adr_imm21
+adrp x0, lsl
+adrp x0, ror
+adr x0, uxtb
+
adr x0, #0xffffffff
adrp x0, #0xffffffff
adrp x0, #1
diff --git a/llvm/test/MC/AArch64/arm64-branch-encoding.s b/llvm/test/MC/AArch64/arm64-branch-encoding.s
index 48c2099012f6..17fdbd9e43fb 100644
--- a/llvm/test/MC/AArch64/arm64-branch-encoding.s
+++ b/llvm/test/MC/AArch64/arm64-branch-encoding.s
@@ -157,3 +157,13 @@ L1:
; CHECK: dcps2 ; encoding: [0x02,0x00,0xa0,0xd4]
; CHECK: dcps3 ; encoding: [0x03,0x00,0xa0,0xd4]
+;; Test "bad" names
+ bl lsl
+ b.eq lsr
+ b.ne uxth
+; CHECK: bl lsl ; encoding: [A,A,A,0b100101AA]
+; CHECK-NEXT: fixup A - offset: 0, value: lsl, kind: fixup_aarch64_pcrel_call26
+; CHECK-NEXT: b.eq lsr ; encoding: [0bAAA00000,A,A,0x54]
+; CHECK-NEXT: fixup A - offset: 0, value: lsr, kind: fixup_aarch64_pcrel_branch19
+; CHECK-NEXT: b.ne uxth ; encoding: [0bAAA00001,A,A,0x54]
+; CHECK-NEXT: fixup A - offset: 0, value: uxth, kind: fixup_aarch64_pcrel_branch19
diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s
index a49ed14d8677..4d96409067a2 100644
--- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s
+++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s
@@ -15,6 +15,7 @@
// RUN: llvm-mc -triple aarch64 -mattr=+v8r,+dotprod -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
// RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
// RUN: llvm-mc -triple aarch64 -mcpu=ampere1a -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
+// RUN: llvm-mc -triple aarch64 -mcpu=ampere1b -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
// RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s
@@ -42,6 +43,8 @@
// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s
// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1a -mattr=-dotprod -show-encoding < %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s
+// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1b -mattr=-dotprod -show-encoding < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s
udot v0.2s, v1.8b, v2.8b
sdot v0.2s, v1.8b, v2.8b
diff --git a/llvm/test/MC/AArch64/basic-a64-diagnostics.s b/llvm/test/MC/AArch64/basic-a64-diagnostics.s
index a59861e13472..028700f4d11d 100644
--- a/llvm/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/llvm/test/MC/AArch64/basic-a64-diagnostics.s
@@ -1149,6 +1149,10 @@
// CHECK-ERROR-NEXT: cbz x29, #1
// CHECK-ERROR-NEXT: ^
+/// Test "bad" names
+cbz w1, lsl
+// CHECK-ERROR: [[#@LINE-1]]:12: error: expected #imm after shift specifier
+
//------------------------------------------------------------------------------
// Conditional branch (immediate)
//------------------------------------------------------------------------------
@@ -1343,6 +1347,7 @@
csel sp, x2, x3, ne
csel x10, x11, sp, ge
csel x1, x2, x3, #3
+ csel x1, x2, x3, lsl #1, eq
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: csel w4, wsp, w9, eq
// CHECK-ERROR-NEXT: ^
@@ -1367,6 +1372,9 @@
// CHECK-ERROR-NEXT: error: expected AArch64 condition code
// CHECK-ERROR-NEXT: csel x1, x2, x3, #3
// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected AArch64 condition code
+// CHECK-ERROR-NEXT: csel x1, x2, x3, lsl #1, eq
+// CHECK-ERROR-NEXT: ^
csinc w20, w21, wsp, mi
csinc sp, x30, x29, eq
diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions.s b/llvm/test/MC/AMDGPU/gfx11-promotions.s
index 6be33afb5671..0bd90266457e 100644
--- a/llvm/test/MC/AMDGPU/gfx11-promotions.s
+++ b/llvm/test/MC/AMDGPU/gfx11-promotions.s
@@ -11,343 +11,343 @@
//===----------------------------------------------------------------------===//
v_mov_b32 v0, v1
-// GFX11: encoding: [0x01,0x03,0x00,0x7e]
+// GFX11: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e]
v_mov_b32_e32 v0, v1
-// GFX11: encoding: [0x01,0x03,0x00,0x7e]
+// GFX11: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e]
//===----------------------------------------------------------------------===//
// VOP2.
//===----------------------------------------------------------------------===//
v_add_f16 v5, v1, v2
-// GFX11: encoding: [0x01,0x05,0x0a,0x64]
+// GFX11: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64]
v_add_f16_e32 v5, v1, v2
-// GFX11: encoding: [0x01,0x05,0x0a,0x64]
+// GFX11: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64]
//===----------------------------------------------------------------------===//
// VOPC.
//===----------------------------------------------------------------------===//
v_cmp_lt_f32 vcc_lo, v1, v2
-// GFX11: encoding: [0x01,0x05,0x22,0x7c]
+// GFX11: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c]
v_cmp_lt_f32_e32 vcc_lo, v1, v2
-// GFX11: encoding: [0x01,0x05,0x22,0x7c]
+// GFX11: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x22,0x7c]
//===----------------------------------------------------------------------===//
// VOPCX.
//===----------------------------------------------------------------------===//
v_cmpx_class_f16 v1, v2
-// GFX11: encoding: [0x01,0x05,0xfa,0x7d]
+// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d]
v_cmpx_class_f16_e32 v1, v2
-// GFX11: encoding: [0x01,0x05,0xfa,0x7d]
+// GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d]
//===----------------------------------------------------------------------===//
// VOP1.DPP8.
//===----------------------------------------------------------------------===//
v_bfrev_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX11: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX11: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOP1.DPP16.
//===----------------------------------------------------------------------===//
v_bfrev_b32 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX11: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX11: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
//===----------------------------------------------------------------------===//
// VOP2.DPP8.
//===----------------------------------------------------------------------===//
v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05]
+// GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05]
v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05]
+// GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOP2.DPP16.
//===----------------------------------------------------------------------===//
v_add_f16 v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff]
+// GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff]
v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff]
+// GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff]
//===----------------------------------------------------------------------===//
// VOPC.DPP8.
//===----------------------------------------------------------------------===//
v_cmp_le_u16 v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1
-// GFX11: encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa]
+// GFX11: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa]
v_cmp_le_u16_dpp v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1
-// GFX11: encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa]
+// GFX11: v_cmp_le_u16 vcc_lo, v1, v2 dpp8:[7,7,7,3,4,4,6,7] fi:1 ; encoding: [0xea,0x04,0x76,0x7c,0x01,0xff,0x47,0xfa]
//===----------------------------------------------------------------------===//
// VOPC.DPP16.
//===----------------------------------------------------------------------===//
v_cmp_gt_u16 v1, v2 row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00]
+// GFX11: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00]
v_cmp_gt_u16_dpp v1, v2 row_shl:0x7 row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00]
+// GFX11: v_cmp_gt_u16 vcc_lo, v1, v2 row_shl:7 row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x78,0x7c,0x01,0x07,0x05,0x00]
//===----------------------------------------------------------------------===//
// VOPCX.DPP8.
//===----------------------------------------------------------------------===//
v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
+// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
+// GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOPCX.DPP16.
//===----------------------------------------------------------------------===//
v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff]
+// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff]
v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff]
+// GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff]
//===----------------------------------------------------------------------===//
// VOP1 -> VOP3.
//===----------------------------------------------------------------------===//
v_sin_f32 v5, 0.5 mul:2
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08]
+// GFX11: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08]
v_sin_f32_e64 v5, 0.5 mul:2
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08]
+// GFX11: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08]
v_sin_f32_e64 v5, v1
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
+// GFX11: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
v_sin_f32 v5, v1
-// GFX11: encoding: [0x01,0x6b,0x0a,0x7e]
+// GFX11: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e]
//===----------------------------------------------------------------------===//
// VOP2 -> VOP3.
//===----------------------------------------------------------------------===//
v_add_f32 v5, v1, -v2
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x40]
+// GFX11: v_add_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x40]
v_add_f32_e64 v5, v1, -v2
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x40]
+// GFX11: v_add_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x40]
v_add_f32_e64 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x00]
+// GFX11: v_add_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd5,0x01,0x05,0x02,0x00]
v_add_f32 v5, v1, v2
-// GFX11: encoding: [0x01,0x05,0x0a,0x06]
+// GFX11: v_add_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x06]
//===----------------------------------------------------------------------===//
// VOPC -> VOP3.
//===----------------------------------------------------------------------===//
v_cmp_f_f32 s10, -v1, v2
-// GFX11: encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x20]
+// GFX11: v_cmp_f_f32_e64 s10, -v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x20]
v_cmp_f_f32_e64 s10, -v1, v2
-// GFX11: encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x20]
+// GFX11: v_cmp_f_f32_e64 s10, -v1, v2 ; encoding: [0x0a,0x00,0x10,0xd4,0x01,0x05,0x02,0x20]
v_cmp_f_f32_e64 vcc_lo, v1, v2
-// GFX11: encoding: [0x6a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00]
+// GFX11: v_cmp_f_f32_e64 vcc_lo, v1, v2 ; encoding: [0x6a,0x00,0x10,0xd4,0x01,0x05,0x02,0x00]
v_cmp_f_f32 vcc_lo, v1, v2
-// GFX11: encoding: [0x01,0x05,0x20,0x7c]
+// GFX11: v_cmp_f_f32_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0x20,0x7c]
//===----------------------------------------------------------------------===//
// VOPCX -> VOP3.
//===----------------------------------------------------------------------===//
v_cmpx_f_f32 -v1, v2
-// GFX11: encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x20]
+// GFX11: v_cmpx_f_f32_e64 -v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x20]
v_cmpx_f_f32_e64 -v1, v2
-// GFX11: encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x20]
+// GFX11: v_cmpx_f_f32_e64 -v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x20]
v_cmpx_f_f32_e64 v1, v2
-// GFX11: encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00]
+// GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00]
v_cmpx_f_f32 v1, v2
-// GFX11: encoding: [0x01,0x05,0x20,0x7d]
+// GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d]
//===----------------------------------------------------------------------===//
// VOP3.
//===----------------------------------------------------------------------===//
v_add3_u32 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00]
+// GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00]
v_add3_u32_e64 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00]
+// GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00]
//===----------------------------------------------------------------------===//
// VOP1 -> VOP3.DPP8.
//===----------------------------------------------------------------------===//
v_sin_f32 v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x18,0x01,0x00,0x00,0x00]
+// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x18,0x01,0x00,0x00,0x00]
v_sin_f32_e64_dpp v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x18,0x01,0x00,0x00,0x00]
+// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x18,0x01,0x00,0x00,0x00]
v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+// GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOP2 -> VOP3.DPP8.
//===----------------------------------------------------------------------===//
v_add_f32 v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
+// GFX11: v_add_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
v_add_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
+// GFX11: v_add_f32_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX11: v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
v_add_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05]
+// GFX11: v_add_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOPC -> VOP3.DPP8.
//===----------------------------------------------------------------------===//
v_cmp_class_f32 s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX11: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
v_cmp_class_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05]
+// GFX11: v_cmp_class_f32 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7c,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOPCX -> VOP3.DPP8.
//===----------------------------------------------------------------------===//
v_cmpx_class_f32 -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+// GFX11: v_cmpx_class_f32_e64_dpp -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
v_cmpx_class_f32_e64_dpp -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+// GFX11: v_cmpx_class_f32_e64_dpp -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+// GFX11: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05]
+// GFX11: v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfc,0x7d,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOP1 -> VOP3.DPP16.
//===----------------------------------------------------------------------===//
v_sin_f32 v5, v1 div:2 row_xmask:15
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x18,0x01,0x6f,0x01,0xff]
+// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 row_xmask:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x18,0x01,0x6f,0x01,0xff]
v_sin_f32_e64_dpp v5, v1 div:2 row_xmask:15
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x18,0x01,0x6f,0x01,0xff]
+// GFX11: v_sin_f32_e64_dpp v5, v1 div:2 row_xmask:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x18,0x01,0x6f,0x01,0xff]
v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
//===----------------------------------------------------------------------===//
// VOP2 -> VOP3.DPP16.
//===----------------------------------------------------------------------===//
v_add_f32 v5, v1, v2 div:2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x18,0x01,0x1b,0x00,0xff]
+// GFX11: v_add_f32_e64_dpp v5, v1, v2 div:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x18,0x01,0x1b,0x00,0xff]
v_add_f32_e64_dpp v5, v1, v2 div:2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x18,0x01,0x1b,0x00,0xff]
+// GFX11: v_add_f32_e64_dpp v5, v1, v2 div:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x18,0x01,0x1b,0x00,0xff]
v_add_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+// GFX11: v_add_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
v_add_f32 v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff]
+// GFX11: v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff]
//===----------------------------------------------------------------------===//
// VOPC -> VOP3.DPP16.
//===----------------------------------------------------------------------===//
v_cmp_class_f32 s5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+// GFX11: v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+// GFX11: v_cmp_class_f32_e64_dpp vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x6a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
v_cmp_class_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff]
+// GFX11: v_cmp_class_f32 vcc_lo, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7c,0x01,0x50,0x01,0xff]
//===----------------------------------------------------------------------===//
// VOPCX -> VOP3.DPP16.
//===----------------------------------------------------------------------===//
v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+// GFX11: v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff]
+// GFX11: v_cmpx_class_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfc,0x7d,0x01,0x1b,0x00,0xff]
//===----------------------------------------------------------------------===//
// VOP3P.
//===----------------------------------------------------------------------===//
v_dot2_f32_f16 v0, v1, v2, v3
-// GFX11: encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
v_dot2_f32_f16_e64 v0, v1, v2, v3
-// GFX11: encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
+// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
//===----------------------------------------------------------------------===//
// VOP3P.DPP8.
//===----------------------------------------------------------------------===//
v_dot2_f32_f16 v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
//===----------------------------------------------------------------------===//
// VOP3P.DPP16.
//===----------------------------------------------------------------------===//
v_dot2_f32_f16 v0, v1, v2, v3 quad_perm:[1,2,3,0]
-// GFX11: encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff]
v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0]
-// GFX11: encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff]
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s
new file mode 100644
index 000000000000..1871a41ec598
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s
@@ -0,0 +1,85 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
+
+v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_mirror
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_half_mirror
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_shl:1
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_shl:15
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_shr:1
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_shr:15
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_ror:1
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_ror:15
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_mirror
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_half_mirror
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_shl:1
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_shl:15
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_shr:1
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_shr:15
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_ror:1
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_ror:15
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
index 9a65c6687f3f..701a72597e45 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -42,46 +42,52 @@ v_bfrev_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f
v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
-v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
-v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+v_ceil_f16_e64_dpp v5.l, v1.h quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5.h, v1.l quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x40,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_mirror
+v_ceil_f16_e64_dpp v5.l, v1.l row_mirror
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_half_mirror
+v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_shl:1
+v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_shl:15
+v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_shr:1
+v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_shr:15
+v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_ror:1
+v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_ror:15
+v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
-v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_ceil_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
@@ -1512,46 +1518,46 @@ v_ffbl_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi
v_ffbl_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
-v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
-v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
-v_floor_f16_e64_dpp v5, v1 row_mirror
+v_floor_f16_e64_dpp v5.l, v1.l row_mirror
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_half_mirror
+v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_shl:1
+v_floor_f16_e64_dpp v5.l, v1.l row_shl:1
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_shl:15
+v_floor_f16_e64_dpp v5.l, v1.l row_shl:15
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_shr:1
+v_floor_f16_e64_dpp v5.l, v1.l row_shr:1
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_shr:15
+v_floor_f16_e64_dpp v5.l, v1.l row_shr:15
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_ror:1
+v_floor_f16_e64_dpp v5.l, v1.l row_ror:1
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_ror:15
+v_floor_f16_e64_dpp v5.l, v1.l row_ror:15
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
-v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
// GFX11: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s
new file mode 100644
index 000000000000..1bef1fe215ac
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s
@@ -0,0 +1,25 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
+
+v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
index 3897b82785f6..043e0f9334ad 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -9,16 +9,22 @@ v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
-v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
-v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+v_ceil_f16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_ceil_f16_e64_dpp v5.h, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x40,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX11: [0x05,0x00,0xdc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x81,0xdc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
@@ -375,16 +381,16 @@ v_ffbl_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_ffbl_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
-v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
-v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX11: [0x05,0x00,0xdb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x81,0xdb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
index f6c7c99847d6..73d04f18e860 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
@@ -270,9 +270,6 @@ s_cbranch_execnz 0x0
s_cbranch_execnz 0x1234
// GFX12: s_cbranch_execnz 4660 ; encoding: [0x34,0x12,0xa6,0xbf]
-s_barrier
-// GFX12: s_barrier ; encoding: [0x00,0x00,0xbd,0xbf]
-
s_setkill 0x0
// GFX12: s_setkill 0 ; encoding: [0x00,0x00,0x81,0xbf]
diff --git a/llvm/test/MC/AMDGPU/gfx12_unsupported.s b/llvm/test/MC/AMDGPU/gfx12_unsupported.s
index 9d52a5dcb1a0..6cd0fe2ad008 100644
--- a/llvm/test/MC/AMDGPU/gfx12_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx12_unsupported.s
@@ -106,6 +106,9 @@ s_cmpk_le_u32 s0, 0
s_inst_prefetch 1
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+s_barrier
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
buffer_atomic_cmpswap_f32 v[5:6], off, s[96:99], s3
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/BPF/insn-unit.s b/llvm/test/MC/BPF/insn-unit.s
index 58342cda7cc0..224eb7381aa2 100644
--- a/llvm/test/MC/BPF/insn-unit.s
+++ b/llvm/test/MC/BPF/insn-unit.s
@@ -61,6 +61,9 @@
// CHECK-32: c3 92 10 00 00 00 00 00 lock *(u32 *)(r2 + 16) += w9
// CHECK: db a3 e2 ff 00 00 00 00 lock *(u64 *)(r3 - 30) += r10
+ callx r2
+// CHECK: 8d 02 00 00 00 00 00 00 callx r2
+
// ======== BPF_JMP Class ========
if r1 & r2 goto Llabel0 // BPF_JSET | BPF_X
if r1 & 0xffff goto Llabel0 // BPF_JSET | BPF_K
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt
index 907d0c319efd..259cb9dbc52a 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt
@@ -14,6 +14,7 @@
# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n2 --disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1 --disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1a --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1b --disassemble < %s | FileCheck %s
# CHECK: ldaprb w0, [x0]
# CHECK: ldaprh w0, [x0]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
index cf29efa5ff56..fe5084539aed 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
# GFX11: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
@@ -42,48 +43,74 @@
# GFX11: v_bfrev_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
0xff,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x48,0xdc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x08,0xdc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+
+# GFX11-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
0xff,0x81,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_ceil_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0xff,0xc1,0xdc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+
# GFX11: v_ceil_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0xa2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
@@ -1302,48 +1329,74 @@
# GFX11: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
0xff,0x81,0xa5,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x48,0xdb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x08,0xdb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+
+# GFX11-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
0xff,0x81,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_floor_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0xff,0xc1,0xdb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+
# GFX11: v_floor_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0xa4,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
index bfda6d10c2f6..c1b500e495fe 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
# GFX11: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
@@ -6,18 +7,34 @@
# GFX11: v_bfrev_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0xb8,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x48,0xdc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_ceil_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x08,0xdc,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+
+# GFX11-REAL16: v_ceil_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_ceil_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
0xff,0x81,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_ceil_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0xff,0xc1,0xdc,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+
# GFX11: v_ceil_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0xa2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
@@ -288,18 +305,34 @@
# GFX11: v_exp_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
0xff,0x81,0xa5,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x48,0xdb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
0x05,0x00,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_floor_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0x05,0x08,0xdb,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+
+# GFX11-REAL16: v_floor_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_floor_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
0xff,0x81,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_floor_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0xff,0xc1,0xdb,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+
# GFX11: v_floor_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0xa4,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
index ea547fcd5d0e..e9371d14864a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
@@ -78,9 +78,6 @@
# GFX12: s_barrier_leave ; encoding: [0x00,0x00,0x95,0xbf]
0x00,0x00,0x95,0xbf
-# GFX12: s_barrier ; encoding: [0x00,0x00,0xbd,0xbf]
-0x00,0x00,0xbd,0xbf
-
# GFX12: s_branch 0 ; encoding: [0x00,0x00,0xa0,0xbf]
0x00,0x00,0xa0,0xbf
diff --git a/llvm/test/MC/WebAssembly/alias-offset.s b/llvm/test/MC/WebAssembly/alias-offset.s
index e45b17d074f6..4899922c8d10 100644
--- a/llvm/test/MC/WebAssembly/alias-offset.s
+++ b/llvm/test/MC/WebAssembly/alias-offset.s
@@ -12,10 +12,10 @@ sym_a:
.set sym_b, sym_a + 4
# CHECK-LABEL: SYMBOL TABLE:
-# CHECK-NEXT: 00000000 l O DATA foo
-# CHECK-NEXT: 00000004 l O DATA sym_a
-# CHECK-NEXT: 00000008 l O DATA sym_b
-# CHECK-NEXT: 00000001 l F CODE main
+# CHECK-NEXT: 00000000 l O DATA 00000004 foo
+# CHECK-NEXT: 00000004 l O DATA 00000008 sym_a
+# CHECK-NEXT: 00000008 l O DATA 00000004 sym_b
+# CHECK-NEXT: 00000001 l F CODE 00000012 main
.text
.section .text,"",@
diff --git a/llvm/test/MC/WebAssembly/alias.s b/llvm/test/MC/WebAssembly/alias.s
index b0a75399b7ad..8ed46f5fdf33 100644
--- a/llvm/test/MC/WebAssembly/alias.s
+++ b/llvm/test/MC/WebAssembly/alias.s
@@ -10,6 +10,6 @@ sym_a:
.set sym_b, sym_a
-# CHECK: 00000000 l O DATA foo
-# CHECK: 00000004 l O DATA sym_a
-# CHECK: 00000004 l O DATA sym_b
+# CHECK: 00000000 l O DATA 00000004 foo
+# CHECK: 00000004 l O DATA 00000004 sym_a
+# CHECK: 00000004 l O DATA 00000004 sym_b
diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
index 7fb33ca662b1..4c2b4479a4aa 100644
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -238,6 +238,23 @@
# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1201 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1201 %s
# RUN: obj2yaml %t.o.AMDGCN_GFX1201 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1201 %s
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX9_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX9_GENERIC
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_GENERIC %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_GENERIC %s
+
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX10_1_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX10_1_GENERIC
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX10_1_GENERIC %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX10_1_GENERIC %s
+
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX10_3_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX10_3_GENERIC
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX10_3_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX10_3_GENERIC %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX10_3_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX10_3_GENERIC %s
+
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX11_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX11_GENERIC
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX11_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX11_GENERIC %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX11_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX11_GENERIC %s
+
+
# ELF-R600-ALL: Format: elf32-amdgpu
# ELF-R600-ALL: Arch: r600
# ELF-R600-ALL: AddressSize: 32bit
@@ -435,6 +452,18 @@
# ELF-AMDGCN-GFX1201: EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E)
# YAML-AMDGCN-GFX1201: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1201 ]
+# ELF-AMDGCN-GFX9_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
+# YAML-AMDGCN-GFX9_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC ]
+
+# ELF-AMDGCN-GFX10_1_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52)
+# YAML-AMDGCN-GFX10_1_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC ]
+
+# ELF-AMDGCN-GFX10_3_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC (0x53)
+# YAML-AMDGCN-GFX10_3_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC ]
+
+# ELF-AMDGCN-GFX11_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC (0x54)
+# YAML-AMDGCN-GFX11_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC ]
+
# ELF-AMDGCN-ALL: ]
diff --git a/llvm/test/Object/wasm-bad-relocation.yaml b/llvm/test/Object/wasm-bad-relocation.yaml
new file mode 100644
index 000000000000..aed405c328b6
--- /dev/null
+++ b/llvm/test/Object/wasm-bad-relocation.yaml
@@ -0,0 +1,35 @@
+# RUN: yaml2obj %s | not llvm-objdump -s - 2>&1 | FileCheck %s
+
+# Check for invalid relocations. In this case we have a relocations of type
+# R_WASM_FUNCTION_INDEX_LEB against a symbol (foo) which is not a function
+# symbol but a data symbol.
+
+# CHECK: invalid function relocation: foo
+
+--- !WASM
+FileHeader:
+ Version: 0x00000001
+Sections:
+ - Type: DATA
+ Segments:
+ - SectionOffset: 0
+ InitFlags: 0
+ Offset:
+ Opcode: I32_CONST
+ Value: 0
+ Content: '6401020304'
+ Relocations:
+ - Type: R_WASM_FUNCTION_INDEX_LEB
+ Index: 0
+ Offset: 0x00000000
+ - Type: CUSTOM
+ Name: linking
+ Version: 2
+ SymbolTable:
+ - Index: 0
+ Kind: DATA
+ Name: foo
+ Flags: [ ]
+ Segment: 0
+ Offset: 0
+ Size: 1
diff --git a/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml
new file mode 100644
index 000000000000..5dfa3945c8d9
--- /dev/null
+++ b/llvm/test/Object/wasm-linked-namesec-with-linkingsec.yaml
@@ -0,0 +1,40 @@
+# RUN: yaml2obj %s -o %t.wasm
+# RUN: llvm-nm -P %t.wasm | FileCheck %s
+#
+# Test that names from the linking section override those from the name section
+# CHECK: foo T 1 3
+# CHECK-NOT: my_func_local_name
+
+--- !WASM
+FileHeader:
+ Version: 0x1
+Sections:
+ - Type: TYPE
+ Signatures:
+ - Index: 0
+ ParamTypes: []
+ ReturnTypes: []
+ - Type: FUNCTION
+ FunctionTypes: [ 0, 0 ]
+ - Type: CODE
+ Functions:
+ - Index: 0
+ Locals:
+ Body: 00
+ - Index: 1
+ Locals:
+ Body: 00
+ - Type: CUSTOM
+ Name: linking
+ Version: 2
+ SymbolTable:
+ - Index: 0
+ Kind: FUNCTION
+ Name: foo
+ Flags: [ VISIBILITY_HIDDEN ]
+ Function: 0
+ - Type: CUSTOM
+ Name: name
+ FunctionNames:
+ - Index: 1
+ Name: my_func_local_name
diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml b/llvm/test/Object/wasm-linked-symbol-table.yaml
index 6dd949a44149..eccdc2c611a7 100644
--- a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table.yaml
+++ b/llvm/test/Object/wasm-linked-symbol-table.yaml
@@ -2,9 +2,9 @@
# RUN: llvm-objdump -t %t.wasm | FileCheck %s
#
# CHECK: SYMBOL TABLE:
-# CHECK-NEXT: 0000009f g F CODE my_func_export
-# CHECK-NEXT: 0000002a g O DATA my_global_export
-# CHECK-NEXT: 00000000 g TABLE my_table_export
+# CHECK-NEXT: 0000009f g F CODE 00000003 my_func_export
+# CHECK-NEXT: 0000002a g O DATA 00000000 my_global_export
+# CHECK-NEXT: 00000000 g TABLE 00000000 my_table_export
--- !WASM
FileHeader:
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ecdb5a5e010d..51fb93daa4df 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -72,6 +72,10 @@
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ
; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
+; RUN: -passes='default<O3>' -enable-jump-table-to-switch -S %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext
+
+; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \
; RUN: -passes='default<O3>' -enable-matrix -S %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX
@@ -151,6 +155,7 @@
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
+; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td
index cf57a247bc79..622d1df7b381 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td
@@ -26,13 +26,13 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
ReplaceTemp
]>;
-// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
+// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
// CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(65), GIMT_Encode2(181), /*)*//*default:*//*Label 2*/ GIMT_Encode4(558),
-// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(474), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4(526),
-// CHECK-NEXT: // Label 0: @474
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(525), // Rule ID 1 //
+// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(65), GIMT_Encode2(182), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562),
+// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(478), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4(530),
+// CHECK-NEXT: // Label 0: @478
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(529), // Rule ID 1 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled),
// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
// CHECK-NEXT: // MIs[0] a
@@ -57,10 +57,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_ReplaceRegWithTempReg, /*OldInsnID*/0, /*OldOpIdx*/1, /*TempRegID*/0,
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 3: @525
+// CHECK-NEXT: // Label 3: @529
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 1: @526
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(557), // Rule ID 0 //
+// CHECK-NEXT: // Label 1: @530
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(561), // Rule ID 0 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled),
// CHECK-NEXT: // MIs[0] dst
// CHECK-NEXT: // No operand predicates
@@ -75,10 +75,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_ReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1,
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 4: @557
+// CHECK-NEXT: // Label 4: @561
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 2: @558
+// CHECK-NEXT: // Label 2: @562
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: }; // Size: 559 bytes
+// CHECK-NEXT: }; // Size: 563 bytes
// CHECK-NEXT: return MatchTable0;
// CHECK-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td
index d0c0eba9e397..f0ca65a87b76 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td
@@ -32,14 +32,14 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
CImmInstTest1
]>;
-// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
+// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
// CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(127), /*)*//*default:*//*Label 3*/ GIMT_Encode4(559),
-// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(442), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(473), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(519),
-// CHECK-NEXT: // Label 0: @442
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(472), // Rule ID 0 //
+// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(128), /*)*//*default:*//*Label 3*/ GIMT_Encode4(563),
+// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(446), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(477), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(523),
+// CHECK-NEXT: // Label 0: @446
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(476), // Rule ID 0 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled),
// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
// CHECK-NEXT: // MIs[0] a
@@ -51,10 +51,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0,
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 4: @472
+// CHECK-NEXT: // Label 4: @476
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 1: @473
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(518), // Rule ID 2 //
+// CHECK-NEXT: // Label 1: @477
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(522), // Rule ID 2 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled),
// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
// CHECK-NEXT: // MIs[0] a
@@ -66,10 +66,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/GIMT_Encode8(42),
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 5: @518
+// CHECK-NEXT: // Label 5: @522
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 2: @519
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(558), // Rule ID 1 //
+// CHECK-NEXT: // Label 2: @523
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(562), // Rule ID 1 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled),
// CHECK-NEXT: // MIs[0] a
// CHECK-NEXT: // No operand predicates
@@ -83,10 +83,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 6: @558
+// CHECK-NEXT: // Label 6: @562
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 3: @559
+// CHECK-NEXT: // Label 3: @563
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: }; // Size: 560 bytes
+// CHECK-NEXT: }; // Size: 564 bytes
// CHECK-NEXT: return MatchTable0;
// CHECK-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td
index 94cc3e58dfc9..a446fb72298c 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td
@@ -27,10 +27,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
SpecialIntrins
]>;
-
-// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
+// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
// CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(114), GIMT_Encode2(116), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132),
+// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(115), GIMT_Encode2(117), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132),
// CHECK-NEXT: /*TargetOpcode::G_INTRINSIC*//*Label 0*/ GIMT_Encode4(18),
// CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4(73),
// CHECK-NEXT: // Label 0: @18
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td
index fdcb31e96797..d3c202c4cb01 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td
@@ -26,9 +26,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
Test0
]>;
-// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
+// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
// CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(119), GIMT_Encode2(182), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380),
+// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(120), GIMT_Encode2(183), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380),
// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4(262), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4(298), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
// CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4(344),
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td
index 5ec44b5e08d8..57ad0009b5bd 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td
@@ -132,15 +132,15 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// Verify match table.
// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
// CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(127), /*)*//*default:*//*Label 6*/ GIMT_Encode4(657),
-// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(442), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(484), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(537), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(579), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(604), GIMT_Encode4(0),
-// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(617),
-// CHECK-NEXT: // Label 0: @442
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(471), // Rule ID 4 //
+// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(128), /*)*//*default:*//*Label 6*/ GIMT_Encode4(661),
+// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(446), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(541), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(583), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(608), GIMT_Encode4(0),
+// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(621),
+// CHECK-NEXT: // Label 0: @446
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(475), // Rule ID 4 //
// CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything),
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled),
// CHECK-NEXT: // MIs[0] a
@@ -155,8 +155,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: // Combiner Rule #3: InstTest1
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0),
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 7: @471
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(483), // Rule ID 3 //
+// CHECK-NEXT: // Label 7: @475
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(487), // Rule ID 3 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled),
// CHECK-NEXT: // MIs[0] a
// CHECK-NEXT: // No operand predicates
@@ -165,10 +165,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: // Combiner Rule #2: InstTest0
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner1),
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 8: @483
+// CHECK-NEXT: // Label 8: @487
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 1: @484
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(536), // Rule ID 6 //
+// CHECK-NEXT: // Label 1: @488
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(540), // Rule ID 6 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled),
// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
// CHECK-NEXT: // MIs[0] dst
@@ -186,10 +186,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 9: @536
+// CHECK-NEXT: // Label 9: @540
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 2: @537
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(578), // Rule ID 5 //
+// CHECK-NEXT: // Label 2: @541
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(582), // Rule ID 5 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled),
// CHECK-NEXT: // MIs[0] tmp
// CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1]
@@ -207,32 +207,32 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner2),
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 10: @578
+// CHECK-NEXT: // Label 10: @582
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 3: @579
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(591), // Rule ID 0 //
+// CHECK-NEXT: // Label 3: @583
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(595), // Rule ID 0 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled),
// CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC'
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0),
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 11: @591
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(603), // Rule ID 1 //
+// CHECK-NEXT: // Label 11: @595
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(607), // Rule ID 1 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled),
// CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC'
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0),
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 12: @603
+// CHECK-NEXT: // Label 12: @607
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 4: @604
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(616), // Rule ID 2 //
+// CHECK-NEXT: // Label 4: @608
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(620), // Rule ID 2 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled),
// CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT'
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0),
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 13: @616
+// CHECK-NEXT: // Label 13: @620
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 5: @617
-// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(656), // Rule ID 7 //
+// CHECK-NEXT: // Label 5: @621
+// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(660), // Rule ID 7 //
// CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled),
// CHECK-NEXT: // MIs[0] dst
// CHECK-NEXT: // No operand predicates
@@ -247,10 +247,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
// CHECK-NEXT: GIR_Done,
-// CHECK-NEXT: // Label 14: @656
+// CHECK-NEXT: // Label 14: @660
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: // Label 6: @657
+// CHECK-NEXT: // Label 6: @661
// CHECK-NEXT: GIM_Reject,
-// CHECK-NEXT: }; // Size: 658 bytes
+// CHECK-NEXT: }; // Size: 662 bytes
// CHECK-NEXT: return MatchTable0;
// CHECK-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td
index 3e651265b352..f79b792b37a3 100644
--- a/llvm/test/TableGen/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter.td
@@ -518,7 +518,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3),
// R00O-NEXT: GIM_Reject,
// R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]]
// R00O-NEXT: GIM_Reject,
-// R00O-NEXT: }; // Size: 2019 bytes
+// R00O-NEXT: }; // Size: 2023 bytes
def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
[(set GPR32:$dst,
diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
index ee8ce72c3132..6e57cc069358 100644
--- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
+++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
@@ -2,7 +2,7 @@
; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-linaro-linux-gnueabi"
+target triple = "aarch64"
;CHECK-LABEL: @test_eq_eq
diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll
index 78b8f5b2793e..256261d0dd11 100644
--- a/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll
+++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-linaro-linux-gnueabi"
+target triple = "aarch64"
%struct.bitmap = type { i32, ptr }
diff --git a/llvm/test/Transforms/CalledValuePropagation/simple-arguments.ll b/llvm/test/Transforms/CalledValuePropagation/simple-arguments.ll
index d9739afe3a13..150c56ca8450 100644
--- a/llvm/test/Transforms/CalledValuePropagation/simple-arguments.ll
+++ b/llvm/test/Transforms/CalledValuePropagation/simple-arguments.ll
@@ -1,6 +1,6 @@
; RUN: opt -passes=called-value-propagation -S < %s | FileCheck %s
-target triple = "aarch64-unknown-linux-gnueabi"
+target triple = "aarch64"
; This test checks that we propagate the functions through arguments and attach
diff --git a/llvm/test/Transforms/CalledValuePropagation/simple-memory.ll b/llvm/test/Transforms/CalledValuePropagation/simple-memory.ll
index 899bed8fc500..106ed849385c 100644
--- a/llvm/test/Transforms/CalledValuePropagation/simple-memory.ll
+++ b/llvm/test/Transforms/CalledValuePropagation/simple-memory.ll
@@ -1,6 +1,6 @@
; RUN: opt -passes=called-value-propagation -S < %s | FileCheck %s
-target triple = "aarch64-unknown-linux-gnueabi"
+target triple = "aarch64"
@global_function = internal unnamed_addr global ptr null, align 8
@global_array = common unnamed_addr global ptr null, align 8
diff --git a/llvm/test/Transforms/CalledValuePropagation/simple-select.ll b/llvm/test/Transforms/CalledValuePropagation/simple-select.ll
index e4d35b8169e8..b2ea20a9449e 100644
--- a/llvm/test/Transforms/CalledValuePropagation/simple-select.ll
+++ b/llvm/test/Transforms/CalledValuePropagation/simple-select.ll
@@ -1,6 +1,6 @@
; RUN: opt -passes=called-value-propagation -S < %s | FileCheck %s
-target triple = "aarch64-unknown-linux-gnueabi"
+target triple = "aarch64"
@global_function = internal unnamed_addr global ptr null, align 8
@global_scalar = internal unnamed_addr global i64 zeroinitializer
diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-gep.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-gep.ll
index da0b2e68313c..f74dc3e5032a 100644
--- a/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-gep.ll
+++ b/llvm/test/Transforms/ConstantHoisting/AArch64/const-hoist-gep.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -passes=consthoist -consthoist-gep -S -o - %s | FileCheck %s
-target triple = "aarch64-none--musleabi"
+target triple = "aarch64"
%0 = type { %1, %2, [9 x i16], %6, %7 }
%1 = type { i32, i32, i32, i32, i32, i32, i16, i16, i8, i8, i16, i32, i32, i16, i8, i8 }
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
index 93ed008022e2..8dce9ef9fa43 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
@@ -442,7 +442,7 @@ define i32 @switch_range(i32 %cond) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[S:%.*]] = urem i32 [[COND:%.*]], 3
; CHECK-NEXT: [[S1:%.*]] = add nuw nsw i32 [[S]], 1
-; CHECK-NEXT: switch i32 [[S1]], label [[DEFAULT_UNREACHABLE:%.*]] [
+; CHECK-NEXT: switch i32 [[S1]], label [[UNREACHABLE:%.*]] [
; CHECK-NEXT: i32 1, label [[EXIT1:%.*]]
; CHECK-NEXT: i32 2, label [[EXIT2:%.*]]
; CHECK-NEXT: i32 3, label [[EXIT1]]
@@ -451,8 +451,6 @@ define i32 @switch_range(i32 %cond) {
; CHECK-NEXT: ret i32 1
; CHECK: exit2:
; CHECK-NEXT: ret i32 2
-; CHECK: default.unreachable:
-; CHECK-NEXT: unreachable
; CHECK: unreachable:
; CHECK-NEXT: ret i32 0
;
@@ -515,9 +513,10 @@ define i8 @switch_defaultdest_multipleuse(i8 %t0) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[O:%.*]] = or i8 [[T0:%.*]], 1
; CHECK-NEXT: [[R:%.*]] = srem i8 1, [[O]]
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: default.unreachable:
-; CHECK-NEXT: unreachable
+; CHECK-NEXT: switch i8 [[R]], label [[EXIT:%.*]] [
+; CHECK-NEXT: i8 0, label [[EXIT]]
+; CHECK-NEXT: i8 1, label [[EXIT]]
+; CHECK-NEXT: ]
; CHECK: exit:
; CHECK-NEXT: ret i8 0
;
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll b/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll
deleted file mode 100644
index a0794d5efe93..000000000000
--- a/llvm/test/Transforms/CorrelatedValuePropagation/switch.ll
+++ /dev/null
@@ -1,301 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s
-
-define i32 @test_unreachable_default(i32 noundef %num) {
-; CHECK-LABEL: define i32 @test_unreachable_default(
-; CHECK-SAME: i32 noundef [[NUM:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SUB:%.*]] = add i32 [[NUM]], -120
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SUB]], 3
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 2
-; CHECK-NEXT: switch i32 [[COND]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 1, label [[SW_BB2:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB4:%.*]]
-; CHECK-NEXT: ]
-; CHECK: sw.bb:
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @call0()
-; CHECK-NEXT: br label [[CLEANUP:%.*]]
-; CHECK: sw.bb2:
-; CHECK-NEXT: [[CALL3:%.*]] = call i32 @call1()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.bb4:
-; CHECK-NEXT: [[CALL5:%.*]] = call i32 @call2()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: default.unreachable:
-; CHECK-NEXT: unreachable
-; CHECK: sw.default:
-; CHECK-NEXT: [[CALL6:%.*]] = call i32 @call3()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: cleanup:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[CALL6]], [[SW_DEFAULT:%.*]] ], [ [[CALL5]], [[SW_BB4]] ], [ [[CALL3]], [[SW_BB2]] ], [ [[CALL]], [[SW_BB]] ]
-; CHECK-NEXT: ret i32 [[RETVAL_0]]
-;
-entry:
- %sub = add i32 %num, -120
- %cmp = icmp ult i32 %sub, 3
- %cond = select i1 %cmp, i32 %sub, i32 2
- switch i32 %cond, label %sw.default [
- i32 0, label %sw.bb
- i32 1, label %sw.bb2
- i32 2, label %sw.bb4
- ]
-
-sw.bb:
- %call = call i32 @call0()
- br label %cleanup
-
-sw.bb2:
- %call3 = call i32 @call1()
- br label %cleanup
-
-sw.bb4:
- %call5 = call i32 @call2()
- br label %cleanup
-
-sw.default:
- %call6 = call i32 @call3()
- br label %cleanup
-
-cleanup:
- %retval.0 = phi i32 [ %call6, %sw.default ], [ %call5, %sw.bb4 ], [ %call3, %sw.bb2 ], [ %call, %sw.bb ]
- ret i32 %retval.0
-}
-
-define i32 @test_unreachable_default_shared_edge(i32 noundef %num) {
-; CHECK-LABEL: define i32 @test_unreachable_default_shared_edge(
-; CHECK-SAME: i32 noundef [[NUM:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SUB:%.*]] = add i32 [[NUM]], -120
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SUB]], 3
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 2
-; CHECK-NEXT: switch i32 [[COND]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 1, label [[SW_BB2:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB4:%.*]]
-; CHECK-NEXT: ]
-; CHECK: sw.bb:
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @call0()
-; CHECK-NEXT: br label [[CLEANUP:%.*]]
-; CHECK: sw.bb2:
-; CHECK-NEXT: [[CALL3:%.*]] = call i32 @call1()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: default.unreachable:
-; CHECK-NEXT: unreachable
-; CHECK: sw.bb4:
-; CHECK-NEXT: [[CALL5:%.*]] = call i32 @call4(i32 [[SUB]])
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: cleanup:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[CALL5]], [[SW_BB4]] ], [ [[CALL3]], [[SW_BB2]] ], [ [[CALL]], [[SW_BB]] ]
-; CHECK-NEXT: ret i32 [[RETVAL_0]]
-;
-entry:
- %sub = add i32 %num, -120
- %cmp = icmp ult i32 %sub, 3
- %cond = select i1 %cmp, i32 %sub, i32 2
- switch i32 %cond, label %sw.bb4 [
- i32 0, label %sw.bb
- i32 1, label %sw.bb2
- i32 2, label %sw.bb4
- ]
-
-sw.bb:
- %call = call i32 @call0()
- br label %cleanup
-
-sw.bb2:
- %call3 = call i32 @call1()
- br label %cleanup
-
-sw.bb4:
- %val = phi i32 [ %sub, %entry ], [ %sub, %entry ]
- %call5 = call i32 @call4(i32 %val)
- br label %cleanup
-
-cleanup:
- %retval.0 = phi i32 [ %call5, %sw.bb4 ], [ %call3, %sw.bb2 ], [ %call, %sw.bb ]
- ret i32 %retval.0
-}
-
-; Negative tests
-
-define i32 @test_reachable_default(i32 noundef %num) {
-; CHECK-LABEL: define i32 @test_reachable_default(
-; CHECK-SAME: i32 noundef [[NUM:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SUB:%.*]] = add i32 [[NUM]], -120
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SUB]], 3
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 4
-; CHECK-NEXT: switch i32 [[COND]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 1, label [[SW_BB2:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB4:%.*]]
-; CHECK-NEXT: ]
-; CHECK: sw.bb:
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @call0()
-; CHECK-NEXT: br label [[CLEANUP:%.*]]
-; CHECK: sw.bb2:
-; CHECK-NEXT: [[CALL3:%.*]] = call i32 @call1()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.bb4:
-; CHECK-NEXT: [[CALL5:%.*]] = call i32 @call2()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.default:
-; CHECK-NEXT: [[CALL6:%.*]] = call i32 @call3()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: cleanup:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[CALL6]], [[SW_DEFAULT]] ], [ [[CALL5]], [[SW_BB4]] ], [ [[CALL3]], [[SW_BB2]] ], [ [[CALL]], [[SW_BB]] ]
-; CHECK-NEXT: ret i32 [[RETVAL_0]]
-;
-entry:
- %sub = add i32 %num, -120
- %cmp = icmp ult i32 %sub, 3
- %cond = select i1 %cmp, i32 %sub, i32 4
- switch i32 %cond, label %sw.default [
- i32 0, label %sw.bb
- i32 1, label %sw.bb2
- i32 2, label %sw.bb4
- ]
-
-sw.bb:
- %call = call i32 @call0()
- br label %cleanup
-
-sw.bb2:
- %call3 = call i32 @call1()
- br label %cleanup
-
-sw.bb4:
- %call5 = call i32 @call2()
- br label %cleanup
-
-sw.default:
- %call6 = call i32 @call3()
- br label %cleanup
-
-cleanup:
- %retval.0 = phi i32 [ %call6, %sw.default ], [ %call5, %sw.bb4 ], [ %call3, %sw.bb2 ], [ %call, %sw.bb ]
- ret i32 %retval.0
-}
-
-define i32 @test_unreachable_default_cond_may_be_undef(i32 %num) {
-; CHECK-LABEL: define i32 @test_unreachable_default_cond_may_be_undef(
-; CHECK-SAME: i32 [[NUM:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SUB:%.*]] = add i32 [[NUM]], -120
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SUB]], 3
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 2
-; CHECK-NEXT: switch i32 [[COND]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 1, label [[SW_BB2:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB4:%.*]]
-; CHECK-NEXT: ]
-; CHECK: sw.bb:
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @call0()
-; CHECK-NEXT: br label [[CLEANUP:%.*]]
-; CHECK: sw.bb2:
-; CHECK-NEXT: [[CALL3:%.*]] = call i32 @call1()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.bb4:
-; CHECK-NEXT: [[CALL5:%.*]] = call i32 @call2()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.default:
-; CHECK-NEXT: [[CALL6:%.*]] = call i32 @call3()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: cleanup:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[CALL6]], [[SW_DEFAULT]] ], [ [[CALL5]], [[SW_BB4]] ], [ [[CALL3]], [[SW_BB2]] ], [ [[CALL]], [[SW_BB]] ]
-; CHECK-NEXT: ret i32 [[RETVAL_0]]
-;
-entry:
- %sub = add i32 %num, -120
- %cmp = icmp ult i32 %sub, 3
- %cond = select i1 %cmp, i32 %sub, i32 2
- switch i32 %cond, label %sw.default [
- i32 0, label %sw.bb
- i32 1, label %sw.bb2
- i32 2, label %sw.bb4
- ]
-
-sw.bb:
- %call = call i32 @call0()
- br label %cleanup
-
-sw.bb2:
- %call3 = call i32 @call1()
- br label %cleanup
-
-sw.bb4:
- %call5 = call i32 @call2()
- br label %cleanup
-
-sw.default:
- %call6 = call i32 @call3()
- br label %cleanup
-
-cleanup:
- %retval.0 = phi i32 [ %call6, %sw.default ], [ %call5, %sw.bb4 ], [ %call3, %sw.bb2 ], [ %call, %sw.bb ]
- ret i32 %retval.0
-}
-
-define i32 @test_default_is_already_unreachable(i32 %num) {
-; CHECK-LABEL: define i32 @test_default_is_already_unreachable(
-; CHECK-SAME: i32 [[NUM:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SUB:%.*]] = add i32 [[NUM]], -120
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SUB]], 3
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 2
-; CHECK-NEXT: switch i32 [[COND]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 1, label [[SW_BB2:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB4:%.*]]
-; CHECK-NEXT: ]
-; CHECK: sw.bb:
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @call0()
-; CHECK-NEXT: br label [[CLEANUP:%.*]]
-; CHECK: sw.bb2:
-; CHECK-NEXT: [[CALL3:%.*]] = call i32 @call1()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.bb4:
-; CHECK-NEXT: [[CALL5:%.*]] = call i32 @call2()
-; CHECK-NEXT: br label [[CLEANUP]]
-; CHECK: sw.default:
-; CHECK-NEXT: unreachable
-; CHECK: cleanup:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[CALL5]], [[SW_BB4]] ], [ [[CALL3]], [[SW_BB2]] ], [ [[CALL]], [[SW_BB]] ]
-; CHECK-NEXT: ret i32 [[RETVAL_0]]
-;
-entry:
- %sub = add i32 %num, -120
- %cmp = icmp ult i32 %sub, 3
- %cond = select i1 %cmp, i32 %sub, i32 2
- switch i32 %cond, label %sw.default [
- i32 0, label %sw.bb
- i32 1, label %sw.bb2
- i32 2, label %sw.bb4
- ]
-
-sw.bb:
- %call = call i32 @call0()
- br label %cleanup
-
-sw.bb2:
- %call3 = call i32 @call1()
- br label %cleanup
-
-sw.bb4:
- %call5 = call i32 @call2()
- br label %cleanup
-
-sw.default:
- unreachable
-
-cleanup:
- %retval.0 = phi i32 [ %call5, %sw.bb4 ], [ %call3, %sw.bb2 ], [ %call, %sw.bb ]
- ret i32 %retval.0
-}
-
-declare i32 @call0()
-declare i32 @call1()
-declare i32 @call2()
-declare i32 @call3()
-declare i32 @call4(i32)
diff --git a/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll b/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll
index 18dc038fce66..5d6cc7db5a41 100644
--- a/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll
+++ b/llvm/test/Transforms/GlobalOpt/localize-constexpr-debuginfo.ll
@@ -1,4 +1,9 @@
-; RUN: opt -S < %s -passes=globalopt | FileCheck %s
+; RUN: opt -S < %s -passes=globalopt --experimental-debuginfo-iterators=false | FileCheck %s
+;; FIXME: this test is pinned to not use RemoveDIs non-intrinsic debug-info.
+;; Constant-deletion takes a slightly different path and (correctly) replaces
+;; the operand of the debug-info record with poison instead of a null pointer.
+;; This is a spurious test difference that we'll suppress for turning RemoveDIs
+;; on.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll
index 6efe86d75147..b7d070045ea6 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll
@@ -11,7 +11,7 @@ define i32 @fn2() personality ptr @__CxxFrameHandler3 {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[INDVARS1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: invoke void @fn1(i64 [[INDVARS_IV]])
-; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]]
; CHECK: catch.dispatch:
; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i32 [ [[INDVARS1]], [[FOR_COND]] ]
; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller
@@ -21,7 +21,7 @@ define i32 @fn2() personality ptr @__CxxFrameHandler3 {
; CHECK: exit:
; CHECK-NEXT: ret i32 [[C_0_LCSSA]]
; CHECK: for.inc:
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: br label [[FOR_COND]]
;
entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll
index 8aa698a4cb51..7409fc8db0ca 100644
--- a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll
@@ -148,7 +148,7 @@ define void @guardedloop(ptr %matrix, ptr %vector,
; CHECK-NEXT: [[VECTORP:%.*]] = getelementptr inbounds [0 x double], ptr [[VECTOR:%.*]], i32 0, i64 [[INDVARS_IV2]]
; CHECK-NEXT: [[V2:%.*]] = load double, ptr [[VECTORP]], align 8
; CHECK-NEXT: call void @use(double [[V2]])
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[RETURN_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll b/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll
index b45f0946399f..6a2bbfa5447a 100644
--- a/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr30806-phi-scev.ll
@@ -43,7 +43,7 @@ define void @foo(ptr %buf, i32 %denominator, ptr %flag) local_unnamed_addr {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_LR_PH]] ]
; CHECK-NEXT: [[BUF_ADDR_07:%.*]] = phi ptr [ [[BUF]], [[WHILE_BODY_LR_PH]] ], [ [[CALL:%.*]], [[WHILE_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[DIV]] to i64
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP2]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @theSize, align 4
; CHECK-NEXT: store i32 [[TMP3]], ptr [[I]], align 4
; CHECK-NEXT: call void @bar(ptr nonnull [[I]], i64 [[INDVARS_IV_NEXT]])
diff --git a/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll b/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll
index 9c2237cff837..080bc9b42bbe 100644
--- a/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll
+++ b/llvm/test/Transforms/IndVarSimplify/preserve-nsw-during-expansion.ll
@@ -23,7 +23,7 @@ define void @test_s172(i32 noundef %xa, i32 noundef %xb, ptr nocapture noundef %
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP1]]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.end.loopexit:
diff --git a/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll b/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll
index 17ce13d83487..35e6ca6c2cde 100644
--- a/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll
+++ b/llvm/test/Transforms/IndVarSimplify/widen-i32-i8ptr.ll
@@ -15,7 +15,7 @@ define dso_local void @Widen_i32_i8ptr() local_unnamed_addr {
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[GID_0]], i64 1
; CHECK-NEXT: [[ARRAYIDX2115:%.*]] = getelementptr inbounds [15 x ptr], ptr [[PTRIDS]], i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: store ptr [[GID_0]], ptr [[ARRAYIDX2115]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: br label [[FOR_COND2106]]
;
entry:
diff --git a/llvm/test/Transforms/Inline/inline-switch-default-2.ll b/llvm/test/Transforms/Inline/inline-switch-default-2.ll
new file mode 100644
index 000000000000..8d3e24c798df
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-switch-default-2.ll
@@ -0,0 +1,317 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes=inline -inline-threshold=21 | FileCheck %s
+
+; Check for scenarios without TTI.
+
+define i64 @foo1(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @foo1(
+; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0.i:
+; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT:%.*]]
+; LOOKUPTABLE: branch_2.i:
+; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]]
+; LOOKUPTABLE: branch_4.i:
+; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]]
+; LOOKUPTABLE: branch_6.i:
+; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]]
+; LOOKUPTABLE: default_branch.i:
+; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]]
+; LOOKUPTABLE: bar1.exit:
+; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[TMP2]]
+;
+; SWITCH-LABEL: define i64 @foo1(
+; SWITCH-SAME: i64 [[TMP0:%.*]]) {
+; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0.i:
+; SWITCH-NEXT: br label [[BAR1_EXIT:%.*]]
+; SWITCH: branch_2.i:
+; SWITCH-NEXT: br label [[BAR1_EXIT]]
+; SWITCH: branch_4.i:
+; SWITCH-NEXT: br label [[BAR1_EXIT]]
+; SWITCH: branch_6.i:
+; SWITCH-NEXT: br label [[BAR1_EXIT]]
+; SWITCH: default_branch.i:
+; SWITCH-NEXT: br label [[BAR1_EXIT]]
+; SWITCH: bar1.exit:
+; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ]
+; SWITCH-NEXT: ret i64 [[TMP2]]
+;
+; CHECK-LABEL: define i64 @foo1(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]])
+; CHECK-NEXT: ret i64 [[B]]
+;
+ %b = call i64 @bar1(i64 %a)
+ ret i64 %b
+}
+
+define i64 @foo2(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @foo2(
+; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]]
+; LOOKUPTABLE: branch_2.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]]
+; LOOKUPTABLE: branch_4.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]]
+; LOOKUPTABLE: branch_6.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]]
+; LOOKUPTABLE: unreachabledefault.i:
+; LOOKUPTABLE-NEXT: unreachable
+; LOOKUPTABLE: bar2.exit:
+; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[TMP2]]
+;
+; SWITCH-LABEL: define i64 @foo2(
+; SWITCH-SAME: i64 [[TMP0:%.*]]) {
+; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]]
+; SWITCH: branch_2.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT]]
+; SWITCH: branch_4.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT]]
+; SWITCH: branch_6.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT]]
+; SWITCH: unreachabledefault.i:
+; SWITCH-NEXT: unreachable
+; SWITCH: bar2.exit:
+; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; SWITCH-NEXT: ret i64 [[TMP2]]
+;
+; CHECK-LABEL: define i64 @foo2(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [
+; CHECK-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; CHECK-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; CHECK-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; CHECK-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; CHECK-NEXT: ]
+; CHECK: branch_0.i:
+; CHECK-NEXT: br label [[BAR2_EXIT:%.*]]
+; CHECK: branch_2.i:
+; CHECK-NEXT: br label [[BAR2_EXIT]]
+; CHECK: branch_4.i:
+; CHECK-NEXT: br label [[BAR2_EXIT]]
+; CHECK: branch_6.i:
+; CHECK-NEXT: br label [[BAR2_EXIT]]
+; CHECK: unreachabledefault.i:
+; CHECK-NEXT: unreachable
+; CHECK: bar2.exit:
+; CHECK-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; CHECK-NEXT: ret i64 [[B_I]]
+;
+ %b = call i64 @bar2(i64 %a)
+ ret i64 %b
+}
+
+define i64 @bar1(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @bar1(
+; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0:
+; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]]
+; LOOKUPTABLE: branch_2:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_4:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_6:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: default_branch:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: exit:
+; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[TMP2]]
+;
+; SWITCH-LABEL: define i64 @bar1(
+; SWITCH-SAME: i64 [[TMP0:%.*]]) {
+; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0:
+; SWITCH-NEXT: br label [[EXIT:%.*]]
+; SWITCH: branch_2:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_4:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_6:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: default_branch:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: exit:
+; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ]
+; SWITCH-NEXT: ret i64 [[TMP2]]
+;
+; CHECK-LABEL: define i64 @bar1(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [
+; CHECK-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; CHECK-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; CHECK-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; CHECK-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; CHECK-NEXT: ]
+; CHECK: branch_0:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: branch_2:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: branch_4:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: branch_6:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: default_branch:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ]
+; CHECK-NEXT: ret i64 [[B]]
+;
+ switch i64 %a, label %default_branch [
+ i64 0, label %branch_0
+ i64 2, label %branch_2
+ i64 4, label %branch_4
+ i64 6, label %branch_6
+ ]
+
+branch_0:
+ br label %exit
+
+branch_2:
+ br label %exit
+
+branch_4:
+ br label %exit
+
+branch_6:
+ br label %exit
+
+default_branch:
+ br label %exit
+
+exit:
+ %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ], [ 3, %default_branch ]
+ ret i64 %b
+}
+
+define i64 @bar2(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @bar2(
+; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0:
+; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]]
+; LOOKUPTABLE: branch_2:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_4:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_6:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: unreachabledefault:
+; LOOKUPTABLE-NEXT: unreachable
+; LOOKUPTABLE: exit:
+; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[TMP2]]
+;
+; SWITCH-LABEL: define i64 @bar2(
+; SWITCH-SAME: i64 [[TMP0:%.*]]) {
+; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0:
+; SWITCH-NEXT: br label [[EXIT:%.*]]
+; SWITCH: branch_2:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_4:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_6:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: unreachabledefault:
+; SWITCH-NEXT: unreachable
+; SWITCH: exit:
+; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ]
+; SWITCH-NEXT: ret i64 [[TMP2]]
+;
+; CHECK-LABEL: define i64 @bar2(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [
+; CHECK-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; CHECK-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; CHECK-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; CHECK-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; CHECK-NEXT: ]
+; CHECK: branch_0:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: branch_2:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: branch_4:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: branch_6:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: unreachabledefault:
+; CHECK-NEXT: unreachable
+; CHECK: exit:
+; CHECK-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ]
+; CHECK-NEXT: ret i64 [[B]]
+;
+ switch i64 %a, label %unreachabledefault [
+ i64 0, label %branch_0
+ i64 2, label %branch_2
+ i64 4, label %branch_4
+ i64 6, label %branch_6
+ ]
+
+branch_0:
+ br label %exit
+
+branch_2:
+ br label %exit
+
+branch_4:
+ br label %exit
+
+branch_6:
+ br label %exit
+
+unreachabledefault:
+ unreachable
+
+exit:
+ %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ]
+ ret i64 %b
+}
diff --git a/llvm/test/Transforms/Inline/inline-switch-default.ll b/llvm/test/Transforms/Inline/inline-switch-default.ll
new file mode 100644
index 000000000000..44f1304e82df
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-switch-default.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes=inline -inline-threshold=26 -min-jump-table-entries=4 | FileCheck %s -check-prefix=LOOKUPTABLE
+; RUN: opt %s -S -passes=inline -inline-threshold=21 -min-jump-table-entries=5 | FileCheck %s -check-prefix=SWITCH
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The `bar1` should not be inlined since there is a default branch.
+
+define i64 @foo1(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @foo1(
+; LOOKUPTABLE-SAME: i64 [[A:%.*]]) {
+; LOOKUPTABLE-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]])
+; LOOKUPTABLE-NEXT: ret i64 [[B]]
+;
+; SWITCH-LABEL: define i64 @foo1(
+; SWITCH-SAME: i64 [[A:%.*]]) {
+; SWITCH-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]])
+; SWITCH-NEXT: ret i64 [[B]]
+;
+ %b = call i64 @bar1(i64 %a)
+ ret i64 %b
+}
+
+define i64 @foo2(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @foo2(
+; LOOKUPTABLE-SAME: i64 [[A:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]]
+; LOOKUPTABLE: branch_2.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]]
+; LOOKUPTABLE: branch_4.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]]
+; LOOKUPTABLE: branch_6.i:
+; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]]
+; LOOKUPTABLE: unreachabledefault.i:
+; LOOKUPTABLE-NEXT: unreachable
+; LOOKUPTABLE: bar2.exit:
+; LOOKUPTABLE-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[B_I]]
+;
+; SWITCH-LABEL: define i64 @foo2(
+; SWITCH-SAME: i64 [[A:%.*]]) {
+; SWITCH-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]]
+; SWITCH: branch_2.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT]]
+; SWITCH: branch_4.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT]]
+; SWITCH: branch_6.i:
+; SWITCH-NEXT: br label [[BAR2_EXIT]]
+; SWITCH: unreachabledefault.i:
+; SWITCH-NEXT: unreachable
+; SWITCH: bar2.exit:
+; SWITCH-NEXT: [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; SWITCH-NEXT: ret i64 [[B_I]]
+;
+ %b = call i64 @bar2(i64 %a)
+ ret i64 %b
+}
+
+define i64 @bar1(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @bar1(
+; LOOKUPTABLE-SAME: i64 [[A:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0:
+; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]]
+; LOOKUPTABLE: branch_2:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_4:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_6:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: default_branch:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: exit:
+; LOOKUPTABLE-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[B]]
+;
+; SWITCH-LABEL: define i64 @bar1(
+; SWITCH-SAME: i64 [[A:%.*]]) {
+; SWITCH-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0:
+; SWITCH-NEXT: br label [[EXIT:%.*]]
+; SWITCH: branch_2:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_4:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_6:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: default_branch:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: exit:
+; SWITCH-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ]
+; SWITCH-NEXT: ret i64 [[B]]
+;
+ switch i64 %a, label %default_branch [
+ i64 0, label %branch_0
+ i64 2, label %branch_2
+ i64 4, label %branch_4
+ i64 6, label %branch_6
+ ]
+
+branch_0:
+ br label %exit
+
+branch_2:
+ br label %exit
+
+branch_4:
+ br label %exit
+
+branch_6:
+ br label %exit
+
+default_branch:
+ br label %exit
+
+exit:
+ %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ], [ 3, %default_branch ]
+ ret i64 %b
+}
+
+define i64 @bar2(i64 %a) {
+; LOOKUPTABLE-LABEL: define i64 @bar2(
+; LOOKUPTABLE-SAME: i64 [[A:%.*]]) {
+; LOOKUPTABLE-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [
+; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; LOOKUPTABLE-NEXT: ]
+; LOOKUPTABLE: branch_0:
+; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]]
+; LOOKUPTABLE: branch_2:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_4:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: branch_6:
+; LOOKUPTABLE-NEXT: br label [[EXIT]]
+; LOOKUPTABLE: unreachabledefault:
+; LOOKUPTABLE-NEXT: unreachable
+; LOOKUPTABLE: exit:
+; LOOKUPTABLE-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ]
+; LOOKUPTABLE-NEXT: ret i64 [[B]]
+;
+; SWITCH-LABEL: define i64 @bar2(
+; SWITCH-SAME: i64 [[A:%.*]]) {
+; SWITCH-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [
+; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]]
+; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]]
+; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]]
+; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]]
+; SWITCH-NEXT: ]
+; SWITCH: branch_0:
+; SWITCH-NEXT: br label [[EXIT:%.*]]
+; SWITCH: branch_2:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_4:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: branch_6:
+; SWITCH-NEXT: br label [[EXIT]]
+; SWITCH: unreachabledefault:
+; SWITCH-NEXT: unreachable
+; SWITCH: exit:
+; SWITCH-NEXT: [[B:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ]
+; SWITCH-NEXT: ret i64 [[B]]
+;
+ switch i64 %a, label %unreachabledefault [
+ i64 0, label %branch_0
+ i64 2, label %branch_2
+ i64 4, label %branch_4
+ i64 6, label %branch_6
+ ]
+
+branch_0:
+ br label %exit
+
+branch_2:
+ br label %exit
+
+branch_4:
+ br label %exit
+
+branch_6:
+ br label %exit
+
+unreachabledefault:
+ unreachable
+
+exit:
+ %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_2 ], [ 2, %branch_4 ], [ 7, %branch_6 ]
+ ret i64 %b
+}
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll
index 81f9fe4a3d18..d13129c1248a 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll
@@ -22,7 +22,7 @@ define i1 @p0(i8 %x, i8 %y) {
; CHECK-LABEL: @p0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: ret i1 [[RET]]
;
@@ -42,7 +42,7 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @p1_vec(
; CHECK-NEXT: [[T0:%.*]] = shl nsw <2 x i8> <i8 -1, i8 -1>, [[Y:%.*]]
; CHECK-NEXT: call void @use2i8(<2 x i8> [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <2 x i8> [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <2 x i8> <i8 -1, i8 -1>, [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp uge <2 x i8> [[T1]], [[X:%.*]]
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
@@ -58,7 +58,7 @@ define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
; CHECK-LABEL: @p2_vec_undef0(
; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> <i8 -1, i8 undef, i8 -1>, [[Y:%.*]]
; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <3 x i8> [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> <i8 -1, i8 -1, i8 -1>, [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[T1]], [[X:%.*]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
@@ -80,7 +80,7 @@ define i1 @c0(i8 %y) {
; CHECK-LABEL: @c0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[X:%.*]] = call i8 @gen8()
; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -98,7 +98,7 @@ define i1 @c1(i8 %y) {
; CHECK-LABEL: @c1(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[X:%.*]] = call i8 @gen8()
; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -116,7 +116,7 @@ define i1 @c2(i8 %y) {
; CHECK-LABEL: @c2(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[X:%.*]] = call i8 @gen8()
; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -138,7 +138,7 @@ define i1 @oneuse0(i8 %x, i8 %y) {
; CHECK-LABEL: @oneuse0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: call void @use8(i8 [[T1]])
; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -156,7 +156,7 @@ define i1 @oneuse1(i8 %x, i8 %y) {
; CHECK-LABEL: @oneuse1(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T2]])
; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]]
@@ -175,7 +175,7 @@ define i1 @oneuse2(i8 %x, i8 %y) {
; CHECK-LABEL: @oneuse2(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: call void @use8(i8 [[T1]])
; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T2]])
@@ -200,7 +200,7 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) {
; CHECK-LABEL: @n0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], [[NOTX:%.*]]
; CHECK-NEXT: ret i1 [[RET]]
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll
index 321a1159d0af..a1517b36d0b9 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll
@@ -22,7 +22,7 @@ define i1 @p0(i8 %x, i8 %y) {
; CHECK-LABEL: @p0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: ret i1 [[RET]]
;
@@ -42,7 +42,7 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @p1_vec(
; CHECK-NEXT: [[T0:%.*]] = shl nsw <2 x i8> <i8 -1, i8 -1>, [[Y:%.*]]
; CHECK-NEXT: call void @use2i8(<2 x i8> [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <2 x i8> [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <2 x i8> <i8 -1, i8 -1>, [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp ult <2 x i8> [[T1]], [[X:%.*]]
; CHECK-NEXT: ret <2 x i1> [[RET]]
;
@@ -58,7 +58,7 @@ define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
; CHECK-LABEL: @p2_vec_undef0(
; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> <i8 -1, i8 undef, i8 -1>, [[Y:%.*]]
; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <3 x i8> [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> <i8 -1, i8 -1, i8 -1>, [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[T1]], [[X:%.*]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
@@ -80,7 +80,7 @@ define i1 @c0(i8 %y) {
; CHECK-LABEL: @c0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[X:%.*]] = call i8 @gen8()
; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -98,7 +98,7 @@ define i1 @c1(i8 %y) {
; CHECK-LABEL: @c1(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[X:%.*]] = call i8 @gen8()
; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -116,7 +116,7 @@ define i1 @c2(i8 %y) {
; CHECK-LABEL: @c2(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[X:%.*]] = call i8 @gen8()
; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -138,7 +138,7 @@ define i1 @oneuse0(i8 %x, i8 %y) {
; CHECK-LABEL: @oneuse0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: call void @use8(i8 [[T1]])
; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: ret i1 [[RET]]
@@ -156,7 +156,7 @@ define i1 @oneuse1(i8 %x, i8 %y) {
; CHECK-LABEL: @oneuse1(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T2]])
; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]]
@@ -175,7 +175,7 @@ define i1 @oneuse2(i8 %x, i8 %y) {
; CHECK-LABEL: @oneuse2(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: call void @use8(i8 [[T1]])
; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T2]])
@@ -200,7 +200,7 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) {
; CHECK-LABEL: @n0(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
; CHECK-NEXT: call void @use8(i8 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i8 [[T0]], [[Y]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]]
; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], [[NOTX:%.*]]
; CHECK-NEXT: ret i1 [[RET]]
diff --git a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll
index 24dac97672f7..12c608cd6c2b 100644
--- a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll
@@ -1100,8 +1100,8 @@ define i1 @uge_smallest_normal_or_ord(half %x) #0 {
; -> nan | pnormal | pinf
define i1 @uge_smallest_normal_or_uno(half %x) #0 {
; CHECK-LABEL: @uge_smallest_normal_or_uno(
-; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.is.fpclass.f16(half [[X:%.*]], i32 771)
-; CHECK-NEXT: ret i1 [[CLASS]]
+; CHECK-NEXT: [[CMP_SMALLEST_NORMAL:%.*]] = fcmp uge half [[X:%.*]], 0xH0400
+; CHECK-NEXT: ret i1 [[CMP_SMALLEST_NORMAL]]
;
%uno = fcmp uno half %x, 0.0
%cmp.smallest.normal = fcmp uge half %x, 0xH0400
@@ -1307,8 +1307,8 @@ define i1 @oge_fabs_eq_inf_and_ord(half %x) #0 {
define i1 @oge_eq_inf_and_ord(half %x) #0 {
; CHECK-LABEL: @oge_eq_inf_and_ord(
-; CHECK-NEXT: [[AND:%.*]] = fcmp oeq half [[X:%.*]], 0xH7C00
-; CHECK-NEXT: ret i1 [[AND]]
+; CHECK-NEXT: [[OGE_FABS_INF:%.*]] = fcmp oeq half [[X:%.*]], 0xH7C00
+; CHECK-NEXT: ret i1 [[OGE_FABS_INF]]
;
%oge.fabs.inf = fcmp oge half %x, 0xH7C00
%ord = fcmp ord half %x, 0xH0000
@@ -1379,8 +1379,8 @@ define i1 @ult_fabs_eq_inf_or_uno(half %x) #0 {
define i1 @ult_eq_inf_or_uno(half %x) #0 {
; CHECK-LABEL: @ult_eq_inf_or_uno(
-; CHECK-NEXT: [[OR:%.*]] = fcmp une half [[X:%.*]], 0xH7C00
-; CHECK-NEXT: ret i1 [[OR]]
+; CHECK-NEXT: [[ULT_FABS_INF:%.*]] = fcmp une half [[X:%.*]], 0xH7C00
+; CHECK-NEXT: ret i1 [[ULT_FABS_INF]]
;
%ult.fabs.inf = fcmp ult half %x, 0xH7C00
%uno = fcmp uno half %x, 0xH0000
@@ -1465,8 +1465,8 @@ define i1 @oeq_neginfinity_or_ord(half %x) #0 {
; -> ninf
define i1 @oeq_neginfinity_and_ord(half %x) #0 {
; CHECK-LABEL: @oeq_neginfinity_and_ord(
-; CHECK-NEXT: [[CLASS:%.*]] = fcmp oeq half [[X:%.*]], 0xHFC00
-; CHECK-NEXT: ret i1 [[CLASS]]
+; CHECK-NEXT: [[OEQ_NEG_INFINITY:%.*]] = fcmp oeq half [[X:%.*]], 0xHFC00
+; CHECK-NEXT: ret i1 [[OEQ_NEG_INFINITY]]
;
%oeq.neg.infinity = fcmp oeq half %x, 0xHFC00
%ord = fcmp ord half %x, 0.0
@@ -1597,8 +1597,8 @@ define i1 @ueq_neginfinity_and_olt_smallest_normal(half %x) #0 {
; -> nan|ninf
define i1 @ueq_neginfinity_or_uno(half %x) #0 {
; CHECK-LABEL: @ueq_neginfinity_or_uno(
-; CHECK-NEXT: [[CLASS:%.*]] = fcmp ueq half [[X:%.*]], 0xHFC00
-; CHECK-NEXT: ret i1 [[CLASS]]
+; CHECK-NEXT: [[UEQ_NEG_INFINITY:%.*]] = fcmp ueq half [[X:%.*]], 0xHFC00
+; CHECK-NEXT: ret i1 [[UEQ_NEG_INFINITY]]
;
%ueq.neg.infinity = fcmp ueq half %x, 0xHFC00
%uno = fcmp uno half %x, 0.0
diff --git a/llvm/test/Transforms/InstCombine/dependent-ivs.ll b/llvm/test/Transforms/InstCombine/dependent-ivs.ll
new file mode 100644
index 000000000000..bd6679121dca
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/dependent-ivs.ll
@@ -0,0 +1,374 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define void @int_iv_nuw(i64 %base, i64 %end) {
+; CHECK-LABEL: define void @int_iv_nuw(
+; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV2]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.i64(i64 %iv2)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv2.next = add nuw i64 %iv.next, %base
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @int_iv_nsw(i64 %base, i64 %end) {
+; CHECK-LABEL: define void @int_iv_nsw(
+; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV2]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV2_NEXT]] = add nsw i64 [[IV_NEXT]], [[BASE]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.i64(i64 %iv2)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv2.next = add nsw i64 %iv.next, %base
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @int_iv_commuted(i64 %base, i64 %end) {
+; CHECK-LABEL: define void @int_iv_commuted(
+; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BASE2:%.*]] = mul i64 [[BASE]], 42
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE2]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV2]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[BASE2]], [[IV_NEXT]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %base2 = mul i64 %base, 42 ; thwart complexity-based canonicalization
+ br label %loop
+
+loop:
+ %iv2 = phi i64 [ %iv2.next, %loop ], [ %base2, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.i64(i64 %iv2)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv2.next = add i64 %base2, %iv.next
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @int_iv_vector(<2 x i64> %base) {
+; CHECK-LABEL: define void @int_iv_vector(
+; CHECK-SAME: <2 x i64> [[BASE:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV2:%.*]] = phi <2 x i64> [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.v2i64(<2 x i64> [[IV2]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], <i64 4, i64 4>
+; CHECK-NEXT: [[IV2_NEXT]] = add <2 x i64> [[IV_NEXT]], [[BASE]]
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1()
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv2 = phi <2 x i64> [ %iv2.next, %loop ], [ %base, %entry ]
+ %iv = phi <2 x i64> [ %iv.next, %loop ], [ zeroinitializer, %entry ]
+ call void @use.v2i64(<2 x i64> %iv2)
+ %iv.next = add nuw nsw <2 x i64> %iv, <i64 4, i64 4>
+ %iv2.next = add <2 x i64> %iv.next, %base
+ %cmp = call i1 @get.i1()
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @int_iv_loop_variant_step(i64 %base, i64 %end) {
+; CHECK-LABEL: define void @int_iv_loop_variant_step(
+; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV2]])
+; CHECK-NEXT: [[STEP:%.*]] = call i64 @get.i64()
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[STEP]]
+; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.i64(i64 %iv2)
+ %step = call i64 @get.i64()
+ %iv.next = add nuw nsw i64 %iv, %step
+ %iv2.next = add nuw i64 %iv.next, %base
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @ptr_iv_inbounds(ptr %base, i64 %end) {
+; CHECK-LABEL: define void @ptr_iv_inbounds(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv.ptr = phi ptr [ %iv.ptr.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.p0(ptr %iv.ptr)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv.ptr.next = getelementptr inbounds i8, ptr %base, i64 %iv.next
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @ptr_iv_no_inbounds(ptr %base, i64 %end) {
+; CHECK-LABEL: define void @ptr_iv_no_inbounds(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, ptr [[BASE]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv.ptr = phi ptr [ %iv.ptr.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.p0(ptr %iv.ptr)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv.ptr.next = getelementptr i8, ptr %base, i64 %iv.next
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @ptr_iv_vector(<2 x ptr> %base, i64 %end) {
+; CHECK-LABEL: define void @ptr_iv_vector(
+; CHECK-SAME: <2 x ptr> [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, <2 x ptr> [[BASE]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv.ptr = phi <2 x ptr> [ %iv.ptr.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ call void @use.v2p0(<2 x ptr> %iv.ptr)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv.ptr.next = getelementptr inbounds i8, <2 x ptr> %base, i64 %iv.next
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @ptr_iv_vector2(<2 x ptr> %base) {
+; CHECK-LABEL: define void @ptr_iv_vector2(
+; CHECK-SAME: <2 x ptr> [[BASE:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], <i64 4, i64 4>
+; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, <2 x ptr> [[BASE]], <2 x i64> [[IV_NEXT]]
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1()
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv.ptr = phi <2 x ptr> [ %iv.ptr.next, %loop ], [ %base, %entry ]
+ %iv = phi <2 x i64> [ %iv.next, %loop ], [ zeroinitializer, %entry ]
+ call void @use.v2p0(<2 x ptr> %iv.ptr)
+ %iv.next = add nuw nsw <2 x i64> %iv, <i64 4, i64 4>
+ %iv.ptr.next = getelementptr i8, <2 x ptr> %base, <2 x i64> %iv.next
+ %cmp = call i1 @get.i1()
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @wrong_start_value(i64 %base, i64 %end) {
+; CHECK-LABEL: define void @wrong_start_value(
+; CHECK-SAME: i64 [[BASE:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV2]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV_NEXT]], [[BASE]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv2 = phi i64 [ %iv2.next, %loop ], [ %base, %entry ]
+ %iv = phi i64 [ %iv.next, %loop ], [ 1, %entry ]
+ call void @use.i64(i64 %iv2)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %iv2.next = add i64 %base, %iv.next
+ %cmp = icmp eq i64 %iv.next, %end
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @different_loops(i64 %base) {
+; CHECK-LABEL: define void @different_loops(
+; CHECK-SAME: i64 [[BASE:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP1:%.*]]
+; CHECK: loop1:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1()
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP2:%.*]], label [[LOOP1]]
+; CHECK: loop2:
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP2]] ], [ [[BASE]], [[LOOP1]] ]
+; CHECK-NEXT: call void @use.i64(i64 [[IV2]])
+; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]]
+; CHECK-NEXT: [[CMP2:%.*]] = call i1 @get.i1()
+; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT:%.*]], label [[LOOP2]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop1
+
+loop1:
+ %iv = phi i64 [ %iv.next, %loop1 ], [ 0, %entry ]
+ call void @use.i64(i64 %iv)
+ %iv.next = add nuw nsw i64 %iv, 4
+ %cmp = call i1 @get.i1()
+ br i1 %cmp, label %loop2, label %loop1
+
+loop2:
+ %iv2 = phi i64 [ %iv2.next, %loop2 ], [ %base, %loop1 ]
+ call void @use.i64(i64 %iv2)
+ %iv2.next = add nuw i64 %base, %iv.next
+ %cmp2 = call i1 @get.i1()
+ br i1 %cmp2, label %exit, label %loop2
+
+exit:
+ ret void
+}
+
+declare void @use.p0(ptr)
+declare void @use.v2p0(<2 x ptr>)
+declare void @use.i64(i64)
+declare void @use.v2i64(<2 x i64>)
+declare i1 @get.i1()
+declare i64 @get.i64()
diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
new file mode 100644
index 000000000000..346271be7da7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+declare double @llvm.sqrt.f64(double)
+
+define double @sqrt_div_fast(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_fast(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv fast double %y, %z
+ %sqrt = call fast double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv fast double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv double %y, %z
+ %sqrt = call double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_reassoc_arcp(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv arcp reassoc double %y, %z
+ %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv arcp reassoc double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_reassoc_missing(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_reassoc_missing(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv arcp double %y, %z
+ %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv arcp reassoc double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_reassoc_missing2(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_reassoc_missing2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv arcp reassoc double %y, %z
+ %sqrt = call arcp double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv arcp reassoc double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_reassoc_missing3(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_reassoc_missing3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv arcp double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv arcp reassoc double %y, %z
+ %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv arcp double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_arcp_missing(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_arcp_missing(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv reassoc double %y, %z
+ %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv arcp reassoc double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_arcp_missing2(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_arcp_missing2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc arcp double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv arcp reassoc double %y, %z
+ %sqrt = call reassoc double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv arcp reassoc double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_arcp_missing3(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_arcp_missing3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc arcp double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv arcp reassoc double %y, %z
+ %sqrt = call arcp reassoc double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv reassoc double %x, %sqrt
+ ret double %div1
+}
+
+declare void @use(double)
+define double @sqrt_div_fast_multiple_uses_1(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_fast_multiple_uses_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: call void @use(double [[DIV]])
+; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv fast double %y, %z
+ call void @use(double %div)
+ %sqrt = call fast double @llvm.sqrt.f64(double %div)
+ %div1 = fdiv fast double %x, %sqrt
+ ret double %div1
+}
+
+define double @sqrt_div_fast_multiple_uses_2(double %x, double %y, double %z) {
+; CHECK-LABEL: @sqrt_div_fast_multiple_uses_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv fast double [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[DIV]])
+; CHECK-NEXT: call void @use(double [[SQRT]])
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast double [[X:%.*]], [[SQRT]]
+; CHECK-NEXT: ret double [[DIV1]]
+;
+entry:
+ %div = fdiv fast double %y, %z
+ %sqrt = call fast double @llvm.sqrt.f64(double %div)
+ call void @use(double %sqrt)
+ %div1 = fdiv fast double %x, %sqrt
+ ret double %div1
+}
+
diff --git a/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll b/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll
index 019db343c61c..42c6506e34cf 100644
--- a/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll
+++ b/llvm/test/Transforms/InstCombine/fpclass-check-idioms.ll
@@ -14,6 +14,18 @@ define i1 @f32_fcnan_fcinf(float %a) {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 519)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2139095040
+ %cmp = icmp eq i32 %and, 2139095040
+ ret i1 %cmp
+}
+
define i1 @f32_not_fcnan_fcinf(float %a) {
; CHECK-LABEL: define i1 @f32_not_fcnan_fcinf(
; CHECK-SAME: float [[A:%.*]]) {
@@ -27,6 +39,18 @@ define i1 @f32_not_fcnan_fcinf(float %a) {
ret i1 %cmp
}
+define i1 @f32_not_fcnan_fcinf_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_not_fcnan_fcinf_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 504)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2139095040
+ %cmp = icmp ne i32 %and, 2139095040
+ ret i1 %cmp
+}
+
define i1 @f64_fcnan_fcinf(double %a) {
; CHECK-LABEL: define i1 @f64_fcnan_fcinf(
; CHECK-SAME: double [[A:%.*]]) {
@@ -40,13 +64,35 @@ define i1 @f64_fcnan_fcinf(double %a) {
ret i1 %cmp
}
-; TODO: handle more fpclass check idioms
+define i1 @f64_fcnan_fcinf_strictfp(double %a) strictfp {
+; CHECK-LABEL: define i1 @f64_fcnan_fcinf_strictfp(
+; CHECK-SAME: double [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f64(double [[A]], i32 519)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i64 = bitcast double %a to i64
+ %and = and i64 %i64, 9218868437227405312
+ %cmp = icmp eq i64 %and, 9218868437227405312
+ ret i1 %cmp
+}
+
define i1 @f32_fcinf(float %a) {
; CHECK-LABEL: define i1 @f32_fcinf(
; CHECK-SAME: float [[A:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[A]])
-; CHECK-NEXT: [[AND:%.*]] = bitcast float [[TMP1]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 2139095040
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], 0x7FF0000000000000
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2147483647
+ %cmp = icmp eq i32 %and, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcinf_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcinf_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 516)
; CHECK-NEXT: ret i1 [[CMP]]
;
%i32 = bitcast float %a to i32
@@ -55,6 +101,119 @@ define i1 @f32_fcinf(float %a) {
ret i1 %cmp
}
+define i1 @f32_fcposinf(float %a) {
+; CHECK-LABEL: define i1 @f32_fcposinf(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0x7FF0000000000000
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposinf_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 512)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcneginf(float %a) {
+; CHECK-LABEL: define i1 @f32_fcneginf(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0xFFF0000000000000
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 4286578688
+ ret i1 %cmp
+}
+
+define i1 @f32_fcneginf_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcneginf_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 4)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 4286578688
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposzero(float %a) {
+; CHECK-LABEL: define i1 @f32_fcposzero(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 64)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 0
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposzero_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposzero_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 64)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 0
+ ret i1 %cmp
+}
+
+define i1 @f32_fcnegzero(float %a) {
+; CHECK-LABEL: define i1 @f32_fcnegzero(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 32)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2147483648
+ ret i1 %cmp
+}
+
+define i1 @f32_fcnegzero_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnegzero_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 32)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2147483648
+ ret i1 %cmp
+}
+
+define i1 @f32_fczero(float %a) {
+; CHECK-LABEL: define i1 @f32_fczero(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2147483647
+ %cmp = icmp eq i32 %and, 0
+ ret i1 %cmp
+}
+
+define i1 @f32_fczero_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fczero_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 96)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2147483647
+ %cmp = icmp eq i32 %and, 0
+ ret i1 %cmp
+}
+
+; TODO: handle more fpclass check idioms
define i1 @f32_fcnan(float %a) {
; CHECK-LABEL: define i1 @f32_fcnan(
; CHECK-SAME: float [[A:%.*]]) {
@@ -75,17 +234,24 @@ define i1 @f32_fcnan(float %a) {
ret i1 %res
}
-define i1 @f32_fcnan_fcinf_strictfp(float %a) strictfp {
-; CHECK-LABEL: define i1 @f32_fcnan_fcinf_strictfp(
-; CHECK-SAME: float [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[A]])
-; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq float [[TMP1]], 0x7FF0000000000000
-; CHECK-NEXT: ret i1 [[CMP]]
+define i1 @f32_fcnan_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[I32]], 2139095040
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND1]], 2139095040
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[I32]], 8388607
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[AND2]], 0
+; CHECK-NEXT: [[RES:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%i32 = bitcast float %a to i32
- %and = and i32 %i32, 2139095040
- %cmp = icmp eq i32 %and, 2139095040
- ret i1 %cmp
+ %and1 = and i32 %i32, 2139095040
+ %cmp1 = icmp eq i32 %and1, 2139095040
+ %and2 = and i32 %i32, 8388607
+ %cmp2 = icmp ne i32 %and2, 0
+ %res = and i1 %cmp1, %cmp2
+ ret i1 %res
}
define <2 x i1> @f32_fcnan_fcinf_vec(<2 x float> %a) {
@@ -101,6 +267,43 @@ define <2 x i1> @f32_fcnan_fcinf_vec(<2 x float> %a) {
ret <2 x i1> %cmp
}
+define <2 x i1> @f32_fcnan_fcinf_vec_strictfp(<2 x float> %a) strictfp {
+; CHECK-LABEL: define <2 x i1> @f32_fcnan_fcinf_vec_strictfp(
+; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[A]], i32 519)
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %i32 = bitcast <2 x float> %a to <2 x i32>
+ %and = and <2 x i32> %i32, <i32 2139095040, i32 2139095040>
+ %cmp = icmp eq <2 x i32> %and, <i32 2139095040, i32 2139095040>
+ ret <2 x i1> %cmp
+}
+
+define <2 x i1> @f32_fcinf_vec(<2 x float> %a) {
+; CHECK-LABEL: define <2 x i1> @f32_fcinf_vec(
+; CHECK-SAME: <2 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]])
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[TMP1]], <float 0x7FF0000000000000, float 0x7FF0000000000000>
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %i32 = bitcast <2 x float> %a to <2 x i32>
+ %and = and <2 x i32> %i32, <i32 2147483647, i32 2147483647>
+ %cmp = icmp eq <2 x i32> %and, <i32 2139095040, i32 2139095040>
+ ret <2 x i1> %cmp
+}
+
+define <2 x i1> @f32_fcinf_vec_strictfp(<2 x float> %a) strictfp {
+; CHECK-LABEL: define <2 x i1> @f32_fcinf_vec_strictfp(
+; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[A]], i32 516)
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %i32 = bitcast <2 x float> %a to <2 x i32>
+ %and = and <2 x i32> %i32, <i32 2147483647, i32 2147483647>
+ %cmp = icmp eq <2 x i32> %and, <i32 2139095040, i32 2139095040>
+ ret <2 x i1> %cmp
+}
+
; Negative tests
define i1 @f32_fcnan_fcinf_wrong_mask1(float %a) {
@@ -117,6 +320,20 @@ define i1 @f32_fcnan_fcinf_wrong_mask1(float %a) {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_wrong_mask1_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_mask1_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[I32]], 2139095041
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2139095041
+ %cmp = icmp eq i32 %and, 2139095040
+ ret i1 %cmp
+}
+
define i1 @f32_fcnan_fcinf_wrong_mask2(float %a) {
; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_mask2(
; CHECK-SAME: float [[A:%.*]]) {
@@ -131,6 +348,20 @@ define i1 @f32_fcnan_fcinf_wrong_mask2(float %a) {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_wrong_mask2_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_mask2_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[I32]], 2139095040
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 2130706432
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2139095040
+ %cmp = icmp eq i32 %and, 2130706432
+ ret i1 %cmp
+}
+
define i1 @f64_fcnan_fcinf_wrong_mask3(double %a) {
; CHECK-LABEL: define i1 @f64_fcnan_fcinf_wrong_mask3(
; CHECK-SAME: double [[A:%.*]]) {
@@ -145,6 +376,20 @@ define i1 @f64_fcnan_fcinf_wrong_mask3(double %a) {
ret i1 %cmp
}
+define i1 @f64_fcnan_fcinf_wrong_mask3_strictfp(double %a) strictfp {
+; CHECK-LABEL: define i1 @f64_fcnan_fcinf_wrong_mask3_strictfp(
+; CHECK-SAME: double [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I64:%.*]] = bitcast double [[A]] to i64
+; CHECK-NEXT: [[AND:%.*]] = and i64 [[I64]], 2139095040
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[AND]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i64 = bitcast double %a to i64
+ %and = and i64 %i64, 2139095040
+ %cmp = icmp eq i64 %and, 2139095040
+ ret i1 %cmp
+}
+
define i1 @f32_fcnan_fcinf_wrong_pred(float %a) {
; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_pred(
; CHECK-SAME: float [[A:%.*]]) {
@@ -158,6 +403,42 @@ define i1 @f32_fcnan_fcinf_wrong_pred(float %a) {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_wrong_pred_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_pred_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.is.fpclass.f32(float [[A]], i32 504)
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2139095040
+ %cmp = icmp slt i32 %and, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposzero_wrong_pred(float %a) {
+; CHECK-LABEL: define i1 @f32_fcposzero_wrong_pred(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I32]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp slt i32 %i32, 0
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposzero_wrong_pred_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposzero_wrong_pred_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I32]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp slt i32 %i32, 0
+ ret i1 %cmp
+}
+
define i1 @f32_fcnan_fcinf_wrong_type1(<2 x float> %a) {
; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type1(
; CHECK-SAME: <2 x float> [[A:%.*]]) {
@@ -172,6 +453,44 @@ define i1 @f32_fcnan_fcinf_wrong_type1(<2 x float> %a) {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_wrong_type1_strictfp(<2 x float> %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type1_strictfp(
+; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I64:%.*]] = bitcast <2 x float> [[A]] to i64
+; CHECK-NEXT: [[AND:%.*]] = and i64 [[I64]], 2139095040
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[AND]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i64 = bitcast <2 x float> %a to i64
+ %and = and i64 %i64, 2139095040
+ %cmp = icmp eq i64 %and, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_wrong_type1(<2 x float> %a) {
+; CHECK-LABEL: define i1 @f32_fcposinf_wrong_type1(
+; CHECK-SAME: <2 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[I64:%.*]] = bitcast <2 x float> [[A]] to i64
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[I64]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i64 = bitcast <2 x float> %a to i64
+ %cmp = icmp eq i64 %i64, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_wrong_type1_strictfp(<2 x float> %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposinf_wrong_type1_strictfp(
+; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I64:%.*]] = bitcast <2 x float> [[A]] to i64
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[I64]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i64 = bitcast <2 x float> %a to i64
+ %cmp = icmp eq i64 %i64, 2139095040
+ ret i1 %cmp
+}
+
define i1 @f32_fcnan_fcinf_wrong_type2(x86_fp80 %a) {
; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type2(
; CHECK-SAME: x86_fp80 [[A:%.*]]) {
@@ -186,6 +505,44 @@ define i1 @f32_fcnan_fcinf_wrong_type2(x86_fp80 %a) {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_wrong_type2_strictfp(x86_fp80 %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_wrong_type2_strictfp(
+; CHECK-SAME: x86_fp80 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I80:%.*]] = bitcast x86_fp80 [[A]] to i80
+; CHECK-NEXT: [[AND:%.*]] = and i80 [[I80]], 2139095040
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i80 [[AND]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i80 = bitcast x86_fp80 %a to i80
+ %and = and i80 %i80, 2139095040
+ %cmp = icmp eq i80 %and, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposzero_wrong_type2(x86_fp80 %a) {
+; CHECK-LABEL: define i1 @f32_fcposzero_wrong_type2(
+; CHECK-SAME: x86_fp80 [[A:%.*]]) {
+; CHECK-NEXT: [[I80:%.*]] = bitcast x86_fp80 [[A]] to i80
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i80 [[I80]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i80 = bitcast x86_fp80 %a to i80
+ %cmp = icmp eq i80 %i80, 0
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposzero_wrong_type2_strictfp(x86_fp80 %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposzero_wrong_type2_strictfp(
+; CHECK-SAME: x86_fp80 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I80:%.*]] = bitcast x86_fp80 [[A]] to i80
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i80 [[I80]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i80 = bitcast x86_fp80 %a to i80
+ %cmp = icmp eq i80 %i80, 0
+ ret i1 %cmp
+}
+
define i1 @f32_fcnan_fcinf_noimplicitfloat(float %a) #0 {
; CHECK-LABEL: define i1 @f32_fcnan_fcinf_noimplicitfloat(
; CHECK-SAME: float [[A:%.*]]) #[[ATTR1:[0-9]+]] {
@@ -200,4 +557,96 @@ define i1 @f32_fcnan_fcinf_noimplicitfloat(float %a) #0 {
ret i1 %cmp
}
+define i1 @f32_fcnan_fcinf_noimplicitfloat_strictfp(float %a) strictfp #0 {
+; CHECK-LABEL: define i1 @f32_fcnan_fcinf_noimplicitfloat_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[I32]], 2139095040
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %and = and i32 %i32, 2139095040
+ %cmp = icmp eq i32 %and, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_noimplicitfloat(float %a) #0 {
+; CHECK-LABEL: define i1 @f32_fcposinf_noimplicitfloat(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_noimplicitfloat_strictfp(float %a) strictfp #0 {
+; CHECK-LABEL: define i1 @f32_fcposinf_noimplicitfloat_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposnan(float %a) {
+; CHECK-LABEL: define i1 @f32_fcposnan(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095041
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2139095041
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposnan_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposnan_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095041
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ %cmp = icmp eq i32 %i32, 2139095041
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_multiuse(float %a) {
+; CHECK-LABEL: define i1 @f32_fcposinf_multiuse(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: call void @usei32(i32 [[I32]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ call void @usei32(i32 %i32)
+ %cmp = icmp eq i32 %i32, 2139095040
+ ret i1 %cmp
+}
+
+define i1 @f32_fcposinf_multiuse_strictfp(float %a) strictfp {
+; CHECK-LABEL: define i1 @f32_fcposinf_multiuse_strictfp(
+; CHECK-SAME: float [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[I32:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: call void @usei32(i32 [[I32]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I32]], 2139095040
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i32 = bitcast float %a to i32
+ call void @usei32(i32 %i32)
+ %cmp = icmp eq i32 %i32, 2139095040
+ ret i1 %cmp
+}
+
+declare void @usei32(i32)
+
attributes #0 = { noimplicitfloat }
diff --git a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll
new file mode 100644
index 000000000000..5d4840159dc9
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define i1 @test1(float %x) {
+; CHECK-LABEL: define i1 @test1(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = fcmp ueq float [[X]], 0.000000e+00
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: [[RET:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 780)
+; CHECK-NEXT: ret i1 [[RET]]
+;
+entry:
+ %cond = fcmp ueq float %x, 0.000000e+00
+ br i1 %cond, label %if.then, label %if.else
+
+if.then:
+ ret i1 false
+
+if.else:
+ %ret = call i1 @llvm.is.fpclass.f32(float %x, i32 783)
+ ret i1 %ret
+}
+
+define i1 @test2(double %x) {
+; CHECK-LABEL: define i1 @test2(
+; CHECK-SAME: double [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[X]], 0x3EB0C6F7A0000000
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.end:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = fcmp olt double %x, 0x3EB0C6F7A0000000
+ br i1 %cmp, label %if.then, label %if.end
+if.then:
+ ret i1 false
+if.end:
+ %cmp.i = fcmp oeq double %x, 0.000000e+00
+ ret i1 %cmp.i
+}
+
+define i1 @test3(float %x) {
+; CHECK-LABEL: define i1 @test3(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X]], 3.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[RET:%.*]] = fcmp oeq float [[X]], 0x7FF0000000000000
+; CHECK-NEXT: ret i1 [[RET]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = fcmp ogt float %x, 3.000000e+00
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %abs = call float @llvm.fabs.f32(float %x)
+ %ret = fcmp oeq float %abs, 0x7FF0000000000000
+ ret i1 %ret
+if.else:
+ ret i1 false
+}
+
+define float @test4(float %x) {
+; CHECK-LABEL: define float @test4(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X]], 0x3EB0C6F7A0000000
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret float 1.000000e+00
+; CHECK: if.end:
+; CHECK-NEXT: [[DIV:%.*]] = fdiv float 1.000000e+00, [[X]]
+; CHECK-NEXT: ret float [[DIV]]
+;
+entry:
+ %cmp = fcmp olt float %x, 0x3EB0C6F7A0000000
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ ret float 1.0
+
+if.end:
+ %cmp.i = fcmp oeq float %x, 0.000000e+00
+ %div = fdiv float 1.000000e+00, %x
+ %ret = select i1 %cmp.i, float 1.000000e+00, float %div
+ ret float %ret
+}
+
+define i1 @test5(double %x, i1 %cond) {
+; CHECK-LABEL: define i1 @test5(
+; CHECK-SAME: double [[X:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[CMP:%.*]] = fcmp uno double [[X]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.end:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[Y:%.*]] = phi double [ -1.000000e+00, [[ENTRY:%.*]] ], [ [[X]], [[IF_END]] ]
+; CHECK-NEXT: [[RET:%.*]] = tail call i1 @llvm.is.fpclass.f64(double [[Y]], i32 408)
+; CHECK-NEXT: ret i1 [[RET]]
+;
+entry:
+ br i1 %cond, label %if, label %exit
+if:
+ %cmp = fcmp uno double %x, 0.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+if.then:
+ ret i1 false
+if.end:
+ br label %exit
+exit:
+ %y = phi double [ -1.000000e+00, %entry ], [ %x, %if.end ]
+ %ret = tail call i1 @llvm.is.fpclass.f64(double %y, i32 411)
+ ret i1 %ret
+}
+
+define i1 @test6(double %x) {
+; CHECK-LABEL: define i1 @test6(
+; CHECK-SAME: double [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[X]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]]
+; CHECK: land.rhs:
+; CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT: br label [[LAND_END]]
+; CHECK: land.end:
+; CHECK-NEXT: [[RET:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP_I]], [[LAND_RHS]] ]
+; CHECK-NEXT: ret i1 [[RET]]
+;
+entry:
+ %cmp = fcmp ogt double %x, 0.000000e+00
+ br i1 %cmp, label %land.rhs, label %land.end
+
+land.rhs:
+ %abs = tail call double @llvm.fabs.f64(double %x)
+ %and.i = bitcast double %abs to i64
+ %cmp.i = icmp eq i64 %and.i, 9218868437227405312
+ br label %land.end
+
+land.end:
+ %ret = phi i1 [ false, %entry ], [ %cmp.i, %land.rhs ]
+ ret i1 %ret
+}
+
+define i1 @test7(float %x) {
+; CHECK-LABEL: define i1 @test7(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: [[COND:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 345)
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[RET1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 328)
+; CHECK-NEXT: ret i1 [[RET1]]
+; CHECK: if.else:
+; CHECK-NEXT: [[RET2:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 128)
+; CHECK-NEXT: ret i1 [[RET2]]
+;
+ %cond = call i1 @llvm.is.fpclass.f32(float %x, i32 345)
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ %ret1 = call i1 @llvm.is.fpclass.f32(float %x, i32 456)
+ ret i1 %ret1
+if.else:
+ %ret2 = call i1 @llvm.is.fpclass.f32(float %x, i32 456)
+ ret i1 %ret2
+}
+
+; TODO: These two is.fpclass can be simplified.
+define i1 @test8(float %x) {
+; CHECK-LABEL: define i1 @test8(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[COND:%.*]] = fcmp oeq float [[ABS]], 0x7FF0000000000000
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[RET1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 575)
+; CHECK-NEXT: ret i1 [[RET1]]
+; CHECK: if.else:
+; CHECK-NEXT: [[RET2:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 575)
+; CHECK-NEXT: ret i1 [[RET2]]
+;
+ %abs = call float @llvm.fabs.f32(float %x)
+ %cond = fcmp oeq float %abs, 0x7FF0000000000000
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ %ret1 = call i1 @llvm.is.fpclass.f32(float %x, i32 575)
+ ret i1 %ret1
+if.else:
+ %ret2 = call i1 @llvm.is.fpclass.f32(float %x, i32 575)
+ ret i1 %ret2
+}
+
+define i1 @test9(float %x) {
+; CHECK-LABEL: define i1 @test9(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: [[COND:%.*]] = fcmp olt float [[X]], -1.000000e+00
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cond = fcmp olt float %x, -1.0
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ %ret1 = fcmp oeq float %x, 0x7FF0000000000000
+ ret i1 %ret1
+if.else:
+ ret i1 false
+}
+
+define i1 @test10(float %x) {
+; CHECK-LABEL: define i1 @test10(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: [[COND:%.*]] = fcmp olt float [[X]], -1.000000e+00
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cond = fcmp olt float %x, -1.0
+ %neg = fneg float %x
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ %ret1 = fcmp oeq float %neg, 0xFFF0000000000000
+ ret i1 %ret1
+if.else:
+ ret i1 false
+}
+
+; TODO: handle and/or conditions
+define i1 @test11_and(float %x, i1 %cond2) {
+; CHECK-LABEL: define i1 @test11_and(
+; CHECK-SAME: float [[X:%.*]], i1 [[COND2:%.*]]) {
+; CHECK-NEXT: [[COND:%.*]] = fcmp olt float [[X]], -1.000000e+00
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND]], [[COND2]]
+; CHECK-NEXT: br i1 [[AND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[RET1:%.*]] = fcmp oeq float [[X]], 0x7FF0000000000000
+; CHECK-NEXT: ret i1 [[RET1]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cond = fcmp olt float %x, -1.0
+ %neg = fneg float %x
+ %and = and i1 %cond, %cond2
+ br i1 %and, label %if.then, label %if.else
+if.then:
+ %ret1 = fcmp oeq float %neg, 0xFFF0000000000000
+ ret i1 %ret1
+if.else:
+ ret i1 false
+}
+
+; TODO: handle and/or conditions
+define i1 @test12_or(float %x, i1 %cond2) {
+; CHECK-LABEL: define i1 @test12_or(
+; CHECK-SAME: float [[X:%.*]], i1 [[COND2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = fcmp ueq float [[X]], 0.000000e+00
+; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2]]
+; CHECK-NEXT: br i1 [[OR]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: [[RET:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 783)
+; CHECK-NEXT: ret i1 [[RET]]
+;
+entry:
+ %cond = fcmp ueq float %x, 0.000000e+00
+ %or = or i1 %cond, %cond2
+ br i1 %or, label %if.then, label %if.else
+
+if.then:
+ ret i1 false
+
+if.else:
+ %ret = call i1 @llvm.is.fpclass.f32(float %x, i32 783)
+ ret i1 %ret
+}
+
+define i1 @test1_no_dominating(float %x, i1 %c) {
+; CHECK-LABEL: define i1 @test1_no_dominating(
+; CHECK-SAME: float [[X:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: entry0:
+; CHECK-NEXT: br i1 [[C]], label [[ENTRY:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: entry:
+; CHECK-NEXT: [[COND:%.*]] = fcmp ueq float [[X]], 0.000000e+00
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: [[RET:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 783)
+; CHECK-NEXT: ret i1 [[RET]]
+;
+entry0:
+ br i1 %c, label %entry, label %if.else
+
+entry:
+ %cond = fcmp ueq float %x, 0.000000e+00
+ br i1 %cond, label %if.then, label %if.else
+
+if.then:
+ ret i1 false
+
+if.else:
+ %ret = call i1 @llvm.is.fpclass.f32(float %x, i32 783)
+ ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll
index 491f21467167..a595ddb07db5 100644
--- a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll
@@ -40,8 +40,8 @@ define i1 @test59_as1(ptr addrspace(1) %foo) {
define i1 @test60(ptr %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test60(
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i32
; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[GEP1_IDX]], [[TMP2]]
; CHECK-NEXT: ret i1 [[CMP]]
;
@@ -54,8 +54,8 @@ define i1 @test60(ptr %foo, i64 %i, i64 %j) {
define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test60_as1(
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16
; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[TMP1]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]]
; CHECK-NEXT: ret i1 [[CMP]]
;
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index d912f962354f..a0e03a5428e6 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -313,8 +313,8 @@ define i1 @test_gep_eq_no_inbounds(ptr %foo, i64 %i, i64 %j) {
define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test60_as1(
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[I:%.*]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16
; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i16 [[TMP1]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[J:%.*]] to i16
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]]
; CHECK-NEXT: ret i1 [[CMP]]
;
@@ -400,11 +400,13 @@ define i1 @test61_as1(ptr addrspace(1) %foo, i16 %i, i16 %j) {
define i1 @test60_extra_use(ptr %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test60_extra_use(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[I:%.*]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[FOO]], i64 [[J:%.*]]
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_IDX]]
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = shl nsw i64 [[J:%.*]], 1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[GEP2_IDX]]
; CHECK-NEXT: call void @use(ptr [[GEP1]])
; CHECK-NEXT: call void @use(ptr [[GEP2]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[GEP2_IDX]]
; CHECK-NEXT: ret i1 [[CMP]]
;
%gep1 = getelementptr inbounds i32, ptr %foo, i64 %i
@@ -446,13 +448,14 @@ define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j)
define void @test60_extra_use_fold(ptr %foo, i64 %start.idx, i64 %end.offset) {
; CHECK-LABEL: @test60_extra_use_fold(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[START_IDX:%.*]]
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[START_IDX:%.*]], 2
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_IDX]]
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[END_OFFSET:%.*]]
; CHECK-NEXT: call void @use(ptr [[GEP1]])
; CHECK-NEXT: call void @use(ptr [[GEP2]])
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[GEP1]], [[GEP2]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[GEP1_IDX]], [[END_OFFSET]]
; CHECK-NEXT: call void @use.i1(i1 [[CMP1]])
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[GEP1_IDX]], [[END_OFFSET]]
; CHECK-NEXT: call void @use.i1(i1 [[CMP2]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index e346330aa5b1..246579cc4cd0 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -105,8 +105,7 @@ define i8 @test_cond_and(i8 %x, i1 %c) {
; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP]], [[C:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]]
; CHECK: if:
-; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4
-; CHECK-NEXT: ret i8 [[OR1]]
+; CHECK-NEXT: ret i8 -4
; CHECK: exit:
; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4
; CHECK-NEXT: ret i8 [[OR2]]
@@ -133,8 +132,7 @@ define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) {
; CHECK-NEXT: [[COND:%.*]] = and i1 [[C3]], [[CMP]]
; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]]
; CHECK: if:
-; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4
-; CHECK-NEXT: ret i8 [[OR1]]
+; CHECK-NEXT: ret i8 -4
; CHECK: exit:
; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4
; CHECK-NEXT: ret i8 [[OR2]]
@@ -161,8 +159,7 @@ define i8 @test_cond_logical_and(i8 %x, i1 %c) {
; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i1 [[C:%.*]], i1 false
; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]]
; CHECK: if:
-; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4
-; CHECK-NEXT: ret i8 [[OR1]]
+; CHECK-NEXT: ret i8 -4
; CHECK: exit:
; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4
; CHECK-NEXT: ret i8 [[OR2]]
@@ -218,8 +215,7 @@ define i8 @test_cond_inv_or(i8 %x, i1 %c) {
; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4
; CHECK-NEXT: ret i8 [[OR1]]
; CHECK: exit:
-; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4
-; CHECK-NEXT: ret i8 [[OR2]]
+; CHECK-NEXT: ret i8 -4
;
%and = and i8 %x, 3
%cmp = icmp ne i8 %and, 0
@@ -242,8 +238,7 @@ define i8 @test_cond_inv_logical_or(i8 %x, i1 %c) {
; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP_NOT]], i1 [[C:%.*]], i1 false
; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]]
; CHECK: if:
-; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4
-; CHECK-NEXT: ret i8 [[OR1]]
+; CHECK-NEXT: ret i8 -4
; CHECK: exit:
; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4
; CHECK-NEXT: ret i8 [[OR2]]
diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll
index 51863af37c13..1b1a6ffbf0f2 100644
--- a/llvm/test/Transforms/InstCombine/or.ll
+++ b/llvm/test/Transforms/InstCombine/or.ll
@@ -1938,8 +1938,7 @@ define i32 @test_or_and_and_multiuse(i32 %a, i32 %b, i32 %c) {
; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C:%.*]]
; CHECK-NEXT: call void @use(i32 [[AND1]])
; CHECK-NEXT: call void @use(i32 [[AND2]])
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND2]], [[A]]
-; CHECK-NEXT: ret i32 [[OR]]
+; CHECK-NEXT: ret i32 [[A]]
;
%and1 = and i32 %a, %b
%and2 = and i32 %and1, %c
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll
index 6b7061ff9bd0..48873852cfc7 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll
@@ -18,7 +18,7 @@ define i32 @t0_basic(i64 %x, i32 %nbits) {
; CHECK-LABEL: @t0_basic(
; CHECK-NEXT: [[T0:%.*]] = zext i32 [[NBITS:%.*]] to i64
; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact i64 [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr i64 -1, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -33
; CHECK-NEXT: call void @use64(i64 [[T0]])
; CHECK-NEXT: call void @use64(i64 [[T1]])
@@ -54,7 +54,7 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t1_vec_splat(
; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64>
; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact <8 x i64> [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33>
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]])
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]])
@@ -85,7 +85,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t2_vec_splat_undef(
; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64>
; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 undef, i64 -1>, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact <8 x i64> [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 undef, i32 -33>
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]])
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]])
@@ -116,7 +116,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t3_vec_nonsplat(
; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64>
; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 undef, i64 -1>, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact <8 x i64> [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -64, i32 -63, i32 -33, i32 -32, i32 63, i32 64, i32 undef, i32 65>
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]])
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]])
@@ -149,7 +149,7 @@ define i32 @n4_extrause0(i64 %x, i32 %nbits) {
; CHECK-LABEL: @n4_extrause0(
; CHECK-NEXT: [[T0:%.*]] = zext i32 [[NBITS:%.*]] to i64
; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact i64 [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr i64 -1, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -33
; CHECK-NEXT: call void @use64(i64 [[T0]])
; CHECK-NEXT: call void @use64(i64 [[T1]])
@@ -182,7 +182,7 @@ define i32 @n5_extrause1(i64 %x, i32 %nbits) {
; CHECK-LABEL: @n5_extrause1(
; CHECK-NEXT: [[T0:%.*]] = zext i32 [[NBITS:%.*]] to i64
; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact i64 [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr i64 -1, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -33
; CHECK-NEXT: call void @use64(i64 [[T0]])
; CHECK-NEXT: call void @use64(i64 [[T1]])
@@ -215,7 +215,7 @@ define i32 @n6_extrause2(i64 %x, i32 %nbits) {
; CHECK-LABEL: @n6_extrause2(
; CHECK-NEXT: [[T0:%.*]] = zext i32 [[NBITS:%.*]] to i64
; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact i64 [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr i64 -1, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -33
; CHECK-NEXT: call void @use64(i64 [[T0]])
; CHECK-NEXT: call void @use64(i64 [[T1]])
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll
index 92805c620683..9c096d1418a5 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll
@@ -16,7 +16,7 @@ declare void @use32(i32)
define i32 @t0_basic(i32 %x, i32 %nbits) {
; CHECK-LABEL: @t0_basic(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -1
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
@@ -43,7 +43,7 @@ declare void @use8xi32(<8 x i32>)
define <8 x i32> @t2_vec_splat(<8 x i32> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t2_vec_splat(
; CHECK-NEXT: [[T0:%.*]] = shl nsw <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <8 x i32> [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, [[NBITS]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]])
; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]])
@@ -66,7 +66,7 @@ define <8 x i32> @t2_vec_splat(<8 x i32> %x, <8 x i32> %nbits) {
define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t2_vec_splat_undef(
; CHECK-NEXT: [[T0:%.*]] = shl <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <8 x i32> [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, [[NBITS]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>
; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]])
; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]])
@@ -89,7 +89,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
define <8 x i32> @t2_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t2_vec_nonsplat(
; CHECK-NEXT: [[T0:%.*]] = shl nsw <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <8 x i32> [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, [[NBITS]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -32, i32 -31, i32 -1, i32 0, i32 1, i32 31, i32 32, i32 33>
; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]])
; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]])
@@ -114,7 +114,7 @@ define <8 x i32> @t2_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
define i32 @n3_extrause(i32 %x, i32 %nbits) {
; CHECK-LABEL: @n3_extrause(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -1
; CHECK-NEXT: call void @use32(i32 [[T0]])
diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll
index bdc7bebf00ec..1a977f67a6a5 100644
--- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll
+++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll
@@ -18,7 +18,7 @@ define i32 @t0_basic(i64 %x, i32 %nbits) {
; CHECK-LABEL: @t0_basic(
; CHECK-NEXT: [[T0:%.*]] = zext i32 [[NBITS:%.*]] to i64
; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact i64 [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr i64 -1, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -32
; CHECK-NEXT: [[T4:%.*]] = and i64 [[T2]], [[X:%.*]]
; CHECK-NEXT: call void @use64(i64 [[T0]])
@@ -56,7 +56,7 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t1_vec_splat(
; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64>
; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact <8 x i64> [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -32, i32 -32, i32 -32, i32 -32, i32 -32, i32 -32, i32 -32, i32 -32>
; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]]
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]])
@@ -89,7 +89,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t2_vec_splat_undef(
; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64>
; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 undef, i64 -1>, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact <8 x i64> [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -32, i32 -32, i32 -32, i32 -32, i32 -32, i32 -32, i32 undef, i32 -32>
; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]]
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]])
@@ -122,7 +122,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
; CHECK-LABEL: @t3_vec_nonsplat(
; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64>
; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 undef, i64 -1>, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact <8 x i64> [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], <i32 -32, i32 -1, i32 0, i32 1, i32 31, i32 32, i32 undef, i32 64>
; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]]
; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]])
@@ -157,7 +157,7 @@ define i32 @n4_extrause(i64 %x, i32 %nbits) {
; CHECK-LABEL: @n4_extrause(
; CHECK-NEXT: [[T0:%.*]] = zext i32 [[NBITS:%.*]] to i64
; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]]
-; CHECK-NEXT: [[T2:%.*]] = lshr exact i64 [[T1]], [[T0]]
+; CHECK-NEXT: [[T2:%.*]] = lshr i64 -1, [[T0]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], -32
; CHECK-NEXT: [[T4:%.*]] = and i64 [[T2]], [[X:%.*]]
; CHECK-NEXT: call void @use64(i64 [[T0]])
diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll
index c91e5a0cc577..549729fe8b59 100644
--- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll
+++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll
@@ -18,7 +18,7 @@ declare void @use32(i32)
define i32 @t0_basic(i32 %x, i32 %nbits) {
; CHECK-LABEL: @t0_basic(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
@@ -39,7 +39,7 @@ define i32 @t0_basic(i32 %x, i32 %nbits) {
define i32 @t1_bigger_shift(i32 %x, i32 %nbits) {
; CHECK-LABEL: @t1_bigger_shift(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: [[T3:%.*]] = add i32 [[NBITS]], 1
; CHECK-NEXT: call void @use32(i32 [[T0]])
@@ -68,7 +68,7 @@ declare void @use3xi32(<3 x i32>)
define <3 x i32> @t2_vec_splat(<3 x i32> %x, <3 x i32> %nbits) {
; CHECK-LABEL: @t2_vec_splat(
; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i32> <i32 -1, i32 -1, i32 -1>, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <3 x i32> [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i32> <i32 -1, i32 -1, i32 -1>, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and <3 x i32> [[T1]], [[X:%.*]]
; CHECK-NEXT: [[T3:%.*]] = add <3 x i32> [[NBITS]], <i32 1, i32 1, i32 1>
; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]])
@@ -93,7 +93,7 @@ define <3 x i32> @t2_vec_splat(<3 x i32> %x, <3 x i32> %nbits) {
define <3 x i32> @t3_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) {
; CHECK-LABEL: @t3_vec_nonsplat(
; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i32> <i32 -1, i32 -1, i32 -1>, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <3 x i32> [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i32> <i32 -1, i32 -1, i32 -1>, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and <3 x i32> [[T1]], [[X:%.*]]
; CHECK-NEXT: [[T3:%.*]] = add <3 x i32> [[NBITS]], <i32 1, i32 0, i32 2>
; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]])
@@ -118,7 +118,7 @@ define <3 x i32> @t3_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) {
define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) {
; CHECK-LABEL: @t4_vec_undef(
; CHECK-NEXT: [[T0:%.*]] = shl <3 x i32> <i32 -1, i32 undef, i32 -1>, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact <3 x i32> [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i32> <i32 -1, i32 -1, i32 -1>, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and <3 x i32> [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]])
; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]])
@@ -147,7 +147,7 @@ define i32 @t5_commutativity0(i32 %nbits) {
; CHECK-LABEL: @t5_commutativity0(
; CHECK-NEXT: [[X:%.*]] = call i32 @gen32()
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], [[T1]]
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
@@ -169,7 +169,7 @@ define i32 @t5_commutativity0(i32 %nbits) {
define i32 @t6_commutativity1(i32 %nbits0, i32 %nbits1) {
; CHECK-LABEL: @t6_commutativity1(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS0:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS0]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS0]]
; CHECK-NEXT: [[T2:%.*]] = shl nsw i32 -1, [[NBITS1:%.*]]
; CHECK-NEXT: [[T3:%.*]] = lshr i32 [[T0]], [[NBITS1]]
; CHECK-NEXT: [[T4:%.*]] = and i32 [[T3]], [[T1]]
@@ -197,7 +197,7 @@ define i32 @t6_commutativity1(i32 %nbits0, i32 %nbits1) {
define i32 @t7_commutativity2(i32 %nbits0, i32 %nbits1) {
; CHECK-LABEL: @t7_commutativity2(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS0:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS0]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS0]]
; CHECK-NEXT: [[T2:%.*]] = shl nsw i32 -1, [[NBITS1:%.*]]
; CHECK-NEXT: [[T3:%.*]] = lshr i32 [[T0]], [[NBITS1]]
; CHECK-NEXT: [[T4:%.*]] = and i32 [[T3]], [[T1]]
@@ -228,7 +228,7 @@ define i32 @t7_commutativity2(i32 %nbits0, i32 %nbits1) {
define i32 @t8_nuw(i32 %x, i32 %nbits) {
; CHECK-LABEL: @t8_nuw(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
@@ -249,7 +249,7 @@ define i32 @t8_nuw(i32 %x, i32 %nbits) {
define i32 @t9_nsw(i32 %x, i32 %nbits) {
; CHECK-LABEL: @t9_nsw(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
@@ -270,7 +270,7 @@ define i32 @t9_nsw(i32 %x, i32 %nbits) {
define i32 @t10_nuw_nsw(i32 %x, i32 %nbits) {
; CHECK-LABEL: @t10_nuw_nsw(
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[NBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[NBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
@@ -298,7 +298,7 @@ define i32 @t11_assume_uge(i32 %x, i32 %masknbits, i32 %shiftnbits) {
; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[SHIFTNBITS:%.*]], [[MASKNBITS:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[MASKNBITS]]
-; CHECK-NEXT: [[T1:%.*]] = lshr exact i32 [[T0]], [[MASKNBITS]]
+; CHECK-NEXT: [[T1:%.*]] = lshr i32 -1, [[MASKNBITS]]
; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]]
; CHECK-NEXT: call void @use32(i32 [[T0]])
; CHECK-NEXT: call void @use32(i32 [[T1]])
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
index 9817b6e13ca8..5dfeb0734fbb 100644
--- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
@@ -42,7 +42,7 @@ define nofpclass(inf) float @ret_nofpclass_inf_undef() {
define nofpclass(all) float @ret_nofpclass_all_var(float %arg) {
; CHECK-LABEL: define nofpclass(all) float @ret_nofpclass_all_var
; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT: ret float [[ARG]]
+; CHECK-NEXT: ret float poison
;
ret float %arg
}
@@ -51,7 +51,7 @@ define nofpclass(all) float @ret_nofpclass_all_var(float %arg) {
define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector(<2 x float> %arg) {
; CHECK-LABEL: define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector
; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
-; CHECK-NEXT: ret <2 x float> [[ARG]]
+; CHECK-NEXT: ret <2 x float> poison
;
ret <2 x float> %arg
}
@@ -65,14 +65,14 @@ define nofpclass(inf) float @ret_nofpclass_inf__0() {
define nofpclass(inf) float @ret_nofpclass_inf__pinf() {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__pinf() {
-; CHECK-NEXT: ret float 0x7FF0000000000000
+; CHECK-NEXT: ret float poison
;
ret float 0x7FF0000000000000
}
define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() {
; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() {
-; CHECK-NEXT: ret float 0x7FF0000000000000
+; CHECK-NEXT: ret float poison
;
ret float 0x7FF0000000000000
}
@@ -86,7 +86,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__ninf() {
define nofpclass(inf) float @ret_nofpclass_inf__ninf() {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__ninf() {
-; CHECK-NEXT: ret float 0xFFF0000000000000
+; CHECK-NEXT: ret float poison
;
ret float 0xFFF0000000000000
}
@@ -106,8 +106,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_inf_lhs(i1 %con
define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs(i1 %cond, float nofpclass(nan norm zero sub) %x, float %y) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs
; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan zero sub norm) [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[Y]]
;
%select = select i1 %cond, float %x, float %y
ret float %select
@@ -117,8 +116,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lh
define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs(i1 %cond, float %x, float nofpclass(nan norm zero sub) %y) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float nofpclass(nan zero sub norm) [[Y:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[X]]
;
%select = select i1 %cond, float %x, float %y
ret float %select
@@ -128,8 +126,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rh
define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x float]] nofpclass(nan norm zero sub) %x, [3 x [2 x float]] %y) {
; CHECK-LABEL: define nofpclass(inf) [3 x [2 x float]] @ret_float_array
; CHECK-SAME: (i1 [[COND:%.*]], [3 x [2 x float]] nofpclass(nan zero sub norm) [[X:%.*]], [3 x [2 x float]] [[Y:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], [3 x [2 x float]] [[X]], [3 x [2 x float]] [[Y]]
-; CHECK-NEXT: ret [3 x [2 x float]] [[SELECT]]
+; CHECK-NEXT: ret [3 x [2 x float]] [[Y]]
;
%select = select i1 %cond, [3 x [2 x float]] %x, [3 x [2 x float]] %y
ret [3 x [2 x float ]] %select
@@ -139,8 +136,7 @@ define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x flo
define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[X]]
;
%select = select i1 %cond, float 0x7FF0000000000000, float %x
ret float %select
@@ -150,8 +146,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float
define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[X]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
ret float %select
@@ -161,8 +156,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float
define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float 0xFFF0000000000000
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float poison
;
%select = select i1 %cond, float 0x7FF0000000000000, float 0xFFF0000000000000
ret float %select
@@ -172,8 +166,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, fl
define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float poison
;
%select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
ret float %select
@@ -183,8 +176,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, fl
define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float 0x7FF0000000000000
;
%select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
ret float %select
@@ -194,8 +186,7 @@ define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond,
define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float 0x7FF0000000000000
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float 0xFFF0000000000000
;
%select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
ret float %select
@@ -205,8 +196,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond,
define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float poison
;
%select = select i1 %cond, float 0.0, float -0.0
ret float %select
@@ -216,8 +206,7 @@ define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond
define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float 0.000000e+00
;
%select = select i1 %cond, float 0.0, float -0.0
ret float %select
@@ -227,8 +216,7 @@ define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %co
define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float -0.000000e+00
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float -0.000000e+00
;
%select = select i1 %cond, float 0.0, float -0.0
ret float %select
@@ -238,8 +226,7 @@ define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %co
define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector
; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> [[X]]
-; CHECK-NEXT: ret <2 x float> [[SELECT]]
+; CHECK-NEXT: ret <2 x float> [[X]]
;
%select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> %x
ret <2 x float> %select
@@ -249,8 +236,7 @@ define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2
define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef(<2 x i1> %cond, <2 x float> %x) {
; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef
; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float poison>, <2 x float> [[X]]
-; CHECK-NEXT: ret <2 x float> [[SELECT]]
+; CHECK-NEXT: ret <2 x float> [[X]]
;
%select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float poison>, <2 x float> %x
ret <2 x float> %select
@@ -260,8 +246,7 @@ define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_und
define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector
; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[COND]], <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> [[X]]
-; CHECK-NEXT: ret <2 x float> [[SELECT]]
+; CHECK-NEXT: ret <2 x float> [[X]]
;
%select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> %x
ret <2 x float> %select
@@ -327,8 +312,7 @@ define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_rhs(i1 %cond, float
define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT: ret float [[SELECT1]]
+; CHECK-NEXT: ret float [[X]]
;
%select0 = select i1 %cond, float 0x7FF8000000000000, float %x
%select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -338,8 +322,7 @@ define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i
define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float 0x7FF8000000000000
-; CHECK-NEXT: ret float [[SELECT1]]
+; CHECK-NEXT: ret float poison
;
%select0 = select i1 %cond, float %x, float 0x7FF8000000000000
%select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -360,8 +343,7 @@ define nofpclass(nan) float @ret_nofpclass_nan__select_chain_inf_nan(i1 %cond, f
define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
-; CHECK-NEXT: ret float [[SELECT1]]
+; CHECK-NEXT: ret float [[X]]
;
%select0 = select i1 %cond, float 0x7FF8000000000000, float %x
%select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
@@ -371,8 +353,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond,
define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF8000000000000, float 0x7FF0000000000000
-; CHECK-NEXT: ret float [[SELECT1]]
+; CHECK-NEXT: ret float 0x7FF8000000000000
;
%select0 = select i1 %cond, float 0x7FF8000000000000, float %x
%select1 = select i1 %cond, float %select0, float 0x7FF0000000000000
@@ -383,8 +364,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond,
define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0xFFF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[FABS]]
;
%select = select i1 %cond, float %x, float 0xFFF0000000000000
@@ -396,8 +376,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, f
define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[FABS]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -421,8 +400,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__fabs_
define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[FABS]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -446,9 +424,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_na
define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
-; CHECK-NEXT: ret float [[FABS]]
+; CHECK-NEXT: ret float poison
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
%fabs = call float @llvm.fabs.f32(float %select)
@@ -459,8 +435,7 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_na
define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0xFFF0000000000000
-; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]]
+; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]]
; CHECK-NEXT: ret float [[FNEG]]
;
%select = select i1 %cond, float %x, float 0xFFF0000000000000
@@ -472,8 +447,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, f
define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]]
+; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]]
; CHECK-NEXT: ret float [[FNEG]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -485,8 +459,7 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___
define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float 0xFFF0000000000000, float [[X]]
-; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[SELECT]]
+; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[X]]
; CHECK-NEXT: ret float [[FNEG]]
;
%select = select i1 %cond, float 0xFFF0000000000000, float %x
@@ -510,8 +483,7 @@ define nofpclass(pzero psub pnorm pinf) float @ret_nofpclass_nopositives___fneg_
define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]]
; CHECK-NEXT: ret float [[FNEG]]
;
@@ -525,8 +497,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %co
define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]]
; CHECK-NEXT: ret float [[FNEG]]
;
@@ -541,10 +512,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_f
define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
-; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[FABS]]
-; CHECK-NEXT: ret float [[FNEG]]
+; CHECK-NEXT: ret float poison
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
%fabs = call float @llvm.fabs.f32(float %select)
@@ -556,8 +524,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non
define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -568,8 +535,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs
define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -580,8 +546,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rh
define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
@@ -594,7 +559,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rh
define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign(float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign
; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
@@ -605,7 +571,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysig
define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign_nnan_flag(float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign_nnan_flag
; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg nnan float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign)
@@ -616,7 +583,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysig
define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_nonan_copysign(float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_nopositives_nonan_copysign
; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
@@ -627,7 +595,7 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_non
define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign(float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign
; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
@@ -638,7 +606,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysig
define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign_nnan_flag(float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign_nnan_flag
; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign)
@@ -649,7 +617,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysig
define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan_copysign(float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan_copysign
; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
@@ -659,8 +627,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non
define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs(i1 %cond, float %x, float %sign) {
; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[SIGN:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -673,8 +640,7 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysi
define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -686,8 +652,8 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__
define nofpclass(inf pnorm psub pzero) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -700,7 +666,7 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__copys
; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -713,7 +679,8 @@ define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__copys
; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -726,7 +693,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_no
; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives_nonan__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -739,7 +706,8 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_no
; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[SELECT]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
; CHECK-NEXT: ret float [[COPYSIGN]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -790,9 +758,7 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nan_negatives__
define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero(float %x) {
; CHECK-LABEL: define nofpclass(nan ninf zero nsub nnorm) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero
; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT: [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[X]]
;
%is.gt.zero = fcmp ogt float %x, 0.0
%select = select i1 %is.gt.zero, float 0.0, float %x
@@ -803,9 +769,7 @@ define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_ze
define nofpclass(ninf nnorm nsub zero) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero(float %x) {
; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero
; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT: [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[X]]
;
%is.gt.zero = fcmp ogt float %x, 0.0
%select = select i1 %is.gt.zero, float 0.0, float %x
@@ -819,8 +783,7 @@ define nofpclass(inf) float @ret_nofpclass_noinfs__assumed_isinf__select_pinf_lh
; CHECK-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[X_IS_INF:%.*]] = fcmp oeq float [[FABS_X]], 0x7FF0000000000000
; CHECK-NEXT: call void @llvm.assume(i1 [[X_IS_INF]])
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[Y]]
;
%fabs.x = call float @llvm.fabs.f32(float %x)
%x.is.inf = fcmp oeq float %fabs.x, 0x7FF0000000000000
@@ -838,18 +801,13 @@ define nofpclass(nan inf nzero nsub nnorm) float @powr_issue64870(float nofpclas
; CHECK-NEXT: [[I1:%.*]] = tail call float @llvm.log2.f32(float [[I]])
; CHECK-NEXT: [[I2:%.*]] = fmul float [[I1]], [[Y]]
; CHECK-NEXT: [[I3:%.*]] = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[I2]])
-; CHECK-NEXT: [[I4:%.*]] = fcmp olt float [[Y]], 0.000000e+00
-; CHECK-NEXT: [[I5:%.*]] = select i1 [[I4]], float 0x7FF0000000000000, float 0.000000e+00
; CHECK-NEXT: [[I6:%.*]] = fcmp oeq float [[X]], 0.000000e+00
-; CHECK-NEXT: [[I7:%.*]] = select i1 [[I6]], float [[I5]], float [[I3]]
+; CHECK-NEXT: [[I7:%.*]] = select i1 [[I6]], float 0.000000e+00, float [[I3]]
; CHECK-NEXT: [[I8:%.*]] = fcmp oeq float [[Y]], 0.000000e+00
-; CHECK-NEXT: [[I9:%.*]] = select i1 [[I6]], float 0x7FF8000000000000, float 1.000000e+00
-; CHECK-NEXT: [[I10:%.*]] = select i1 [[I8]], float [[I9]], float [[I7]]
; CHECK-NEXT: [[I11:%.*]] = fcmp oeq float [[X]], 1.000000e+00
-; CHECK-NEXT: [[I12:%.*]] = select i1 [[I11]], float 1.000000e+00, float [[I10]]
-; CHECK-NEXT: [[I13:%.*]] = fcmp olt float [[X]], 0.000000e+00
-; CHECK-NEXT: [[I14:%.*]] = select i1 [[I13]], float 0x7FF8000000000000, float [[I12]]
-; CHECK-NEXT: ret float [[I14]]
+; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[I11]], i1 true, i1 [[I8]]
+; CHECK-NEXT: [[I12:%.*]] = select i1 [[TMP0]], float 1.000000e+00, float [[I7]]
+; CHECK-NEXT: ret float [[I12]]
;
entry:
%i = tail call float @llvm.fabs.f32(float %x)
@@ -881,12 +839,8 @@ define nofpclass(nan inf nzero nsub nnorm) float @test_powr_issue64870_2(float n
; CHECK-NEXT: [[I4:%.*]] = select i1 [[I]], float 0x7FF8000000000000, float [[ARG1]]
; CHECK-NEXT: [[I5:%.*]] = fmul float [[I4]], [[I3]]
; CHECK-NEXT: [[I6:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float noundef [[I5]])
-; CHECK-NEXT: [[I7:%.*]] = fcmp olt float [[I4]], 0.000000e+00
-; CHECK-NEXT: [[I8:%.*]] = select i1 [[I7]], float 0x7FF0000000000000, float 0.000000e+00
-; CHECK-NEXT: [[I9:%.*]] = fcmp ueq float [[I4]], 0.000000e+00
; CHECK-NEXT: [[I10:%.*]] = fcmp oeq float [[I2]], 0.000000e+00
-; CHECK-NEXT: [[I11:%.*]] = select i1 [[I9]], float 0x7FF8000000000000, float [[I8]]
-; CHECK-NEXT: [[I12:%.*]] = select i1 [[I10]], float [[I11]], float [[I6]]
+; CHECK-NEXT: [[I12:%.*]] = select i1 [[I10]], float 0.000000e+00, float [[I6]]
; CHECK-NEXT: ret float [[I12]]
;
bb:
@@ -923,16 +877,10 @@ define nofpclass(nan inf) float @pow_f32(float nofpclass(nan inf) %arg, float no
; CHECK-NEXT: [[I11:%.*]] = and i1 [[I7]], [[I10]]
; CHECK-NEXT: [[I12:%.*]] = select i1 [[I11]], float [[ARG]], float 1.000000e+00
; CHECK-NEXT: [[I13:%.*]] = tail call noundef float @llvm.copysign.f32(float noundef [[I4]], float noundef [[I12]])
-; CHECK-NEXT: [[I14:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT: [[I15:%.*]] = select i1 [[I7]], float [[I13]], float 0x7FF8000000000000
-; CHECK-NEXT: [[I16:%.*]] = select i1 [[I14]], float [[I15]], float [[I13]]
; CHECK-NEXT: [[I17:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT: [[I18:%.*]] = fcmp olt float [[ARG1]], 0.000000e+00
-; CHECK-NEXT: [[I19:%.*]] = xor i1 [[I17]], [[I18]]
-; CHECK-NEXT: [[I20:%.*]] = select i1 [[I19]], float 0.000000e+00, float 0x7FF0000000000000
; CHECK-NEXT: [[I21:%.*]] = select i1 [[I11]], float [[ARG]], float 0.000000e+00
-; CHECK-NEXT: [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef [[I20]], float noundef [[I21]])
-; CHECK-NEXT: [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I16]]
+; CHECK-NEXT: [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef 0.000000e+00, float noundef [[I21]])
+; CHECK-NEXT: [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I13]]
; CHECK-NEXT: [[I24:%.*]] = fcmp oeq float [[ARG]], 1.000000e+00
; CHECK-NEXT: [[I25:%.*]] = fcmp oeq float [[ARG1]], 0.000000e+00
; CHECK-NEXT: [[I26:%.*]] = or i1 [[I24]], [[I25]]
@@ -977,8 +925,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf(i
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf
; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern()
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[MUST_BE_INF]], float [[Y]]
-; CHECK-NEXT: ret float [[SELECT]]
+; CHECK-NEXT: ret float [[Y]]
;
%must.be.inf = call nofpclass(nan norm zero sub) float @extern()
%select = select i1 %cond, float %must.be.inf, float %y
@@ -989,7 +936,7 @@ define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf(i1 %co
; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf
; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern()
-; CHECK-NEXT: ret float [[MUST_BE_INF]]
+; CHECK-NEXT: ret float 0xFFF0000000000000
;
%must.be.inf = call nofpclass(nan norm zero sub) float @extern()
ret float %must.be.inf
@@ -999,7 +946,7 @@ define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf(i1 %co
; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf
; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
; CHECK-NEXT: [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern()
-; CHECK-NEXT: ret float [[MUST_BE_INF]]
+; CHECK-NEXT: ret float 0x7FF0000000000000
;
%must.be.inf = call nofpclass(nan norm zero sub) float @extern()
ret float %must.be.inf
@@ -1009,7 +956,7 @@ define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero(i1
; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero
; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
; CHECK-NEXT: [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern()
-; CHECK-NEXT: ret float [[MUST_BE_ZERO]]
+; CHECK-NEXT: ret float 0.000000e+00
;
%must.be.zero = call nofpclass(nan sub norm inf) float @extern()
ret float %must.be.zero
@@ -1019,7 +966,7 @@ define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero(i1
; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero
; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
; CHECK-NEXT: [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern()
-; CHECK-NEXT: ret float [[MUST_BE_ZERO]]
+; CHECK-NEXT: ret float -0.000000e+00
;
%must.be.zero = call nofpclass(nan sub norm inf) float @extern()
ret float %must.be.zero
@@ -1133,8 +1080,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_0(i1 %cond0, float
; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @loop.cond()
; CHECK-NEXT: br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]]
; CHECK: ret:
-; CHECK-NEXT: [[PHI_RET:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[LOOP]] ]
-; CHECK-NEXT: ret float [[PHI_RET]]
+; CHECK-NEXT: ret float 0.000000e+00
;
entry:
br i1 %cond0, label %loop, label %ret
@@ -1159,7 +1105,7 @@ define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_1(i1 %cond0, float
; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @loop.cond()
; CHECK-NEXT: br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]]
; CHECK: ret:
-; CHECK-NEXT: ret float 0x7FF0000000000000
+; CHECK-NEXT: ret float poison
;
entry:
br i1 %cond0, label %loop, label %ret
@@ -1180,8 +1126,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__phi_switch_repeated_predecessor(
; CHECK-SAME: (i32 [[SWITCH:%.*]], float [[UNKNOWN:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: switch i32 [[SWITCH]], label [[RET:%.*]] [
-; CHECK-NEXT: i32 0, label [[LOOP:%.*]]
-; CHECK-NEXT: i32 1, label [[LOOP]]
+; CHECK-NEXT: i32 0, label [[LOOP:%.*]]
+; CHECK-NEXT: i32 1, label [[LOOP]]
; CHECK-NEXT: ]
; CHECK: loop:
; CHECK-NEXT: [[PHI_LOOP:%.*]] = phi float [ 0x7FF0000000000000, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[ENTRY]] ], [ [[UNKNOWN]], [[LOOP]] ]
@@ -1211,8 +1157,7 @@ ret:
define nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs(i1 %cond, float %x) {
; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs
; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT: [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[SELECT]])
+; CHECK-NEXT: [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[X]])
; CHECK-NEXT: ret float [[FENCE]]
;
%select = select i1 %cond, float %x, float 0x7FF0000000000000
diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
index 5c2ea3975f94..1042c413fbb7 100644
--- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -35,17 +35,55 @@ define ptr @test2() {
ret ptr %tmp
}
+define void @test3_multiple_fields(ptr nocapture %a, ptr nocapture %b) {
+; CHECK-LABEL: @test3_multiple_fields(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i64 [[TMP0]], ptr [[A:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 8, i1 false), !tbaa.struct !6
+ ret void
+}
+
+define void @test4_multiple_copy_first_field(ptr nocapture %a, ptr nocapture %b) {
+; CHECK-LABEL: @test4_multiple_copy_first_field(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 4, i1 false), !tbaa.struct !6
+ ret void
+}
+
+define void @test5_multiple_copy_more_than_first_field(ptr nocapture %a, ptr nocapture %b) {
+; CHECK-LABEL: @test5_multiple_copy_more_than_first_field(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[A:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %b, i64 4, i1 false), !tbaa.struct !7
+ ret void
+}
+
!0 = !{!"Simple C/C++ TBAA"}
!1 = !{!"omnipotent char", !0}
!2 = !{!5, !5, i64 0}
!3 = !{i64 0, i64 4, !2}
!4 = !{i64 0, i64 8, null}
!5 = !{!"float", !0}
+!6 = !{i64 0, i64 4, !2, i64 4, i64 4, !2}
+!7 = !{i64 0, i64 2, !2, i64 4, i64 6, !2}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
;.
-; CHECK: [[TBAA0]] = !{!1, !1, i64 0}
-; CHECK: [[META1:![0-9]+]] = !{!"float", !2}
-; CHECK: [[META2:![0-9]+]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"}
;.
diff --git a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll
index d898df0b32f2..4b2ff1b3d050 100644
--- a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll
+++ b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll
@@ -259,3 +259,170 @@ define <2 x i1> @uno8(<2 x double> %x, <2 x double> %y) {
%r = or <2 x i1> %cmp1, %cmp2
ret <2 x i1> %r
}
+
+define i1 @olt_implies_ord(float %x, float %y) {
+; CHECK-LABEL: @olt_implies_ord(
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[OLT]]
+;
+ %ord = fcmp ord float %x, 0.000000e+00
+ %olt = fcmp olt float %x, %y
+ %ret = and i1 %olt, %ord
+ ret i1 %ret
+}
+
+define i1 @olt_implies_ord_commuted1(float %x, float %y) {
+; CHECK-LABEL: @olt_implies_ord_commuted1(
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: ret i1 [[OLT]]
+;
+ %ord = fcmp ord float %x, 0.000000e+00
+ %olt = fcmp olt float %y, %x
+ %ret = and i1 %olt, %ord
+ ret i1 %ret
+}
+
+define i1 @olt_implies_ord_commuted2(float %x, float %y) {
+; CHECK-LABEL: @olt_implies_ord_commuted2(
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[OLT]]
+;
+ %ord = fcmp ord float %x, 0.000000e+00
+ %olt = fcmp olt float %x, %y
+ %ret = and i1 %ord, %olt
+ ret i1 %ret
+}
+
+define i1 @olt_implies_ord_commuted3(float %x, float %y) {
+; CHECK-LABEL: @olt_implies_ord_commuted3(
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: ret i1 [[OLT]]
+;
+ %ord = fcmp ord float %x, 0.000000e+00
+ %olt = fcmp olt float %y, %x
+ %ret = and i1 %ord, %olt
+ ret i1 %ret
+}
+
+define <2 x i1> @olt_implies_ord_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @olt_implies_ord_vec(
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret <2 x i1> [[OLT]]
+;
+ %ord = fcmp ord <2 x float> %x, zeroinitializer
+ %olt = fcmp olt <2 x float> %x, %y
+ %ret = and <2 x i1> %ord, %olt
+ ret <2 x i1> %ret
+}
+
+define i1 @ord_implies_ord(float %x, float %y) {
+; CHECK-LABEL: @ord_implies_ord(
+; CHECK-NEXT: [[ORD2:%.*]] = fcmp ord float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[ORD2]]
+;
+ %ord = fcmp ord float %x, 0.000000e+00
+ %ord2 = fcmp ord float %x, %y
+ %ret = and i1 %ord, %ord2
+ ret i1 %ret
+}
+
+define i1 @olt_implies_uno(float %x, float %y) {
+; CHECK-LABEL: @olt_implies_uno(
+; CHECK-NEXT: ret i1 false
+;
+ %uno = fcmp uno float %x, 0.000000e+00
+ %olt = fcmp olt float %x, %y
+ %ret = and i1 %olt, %uno
+ ret i1 %ret
+}
+
+define i1 @ult_implies_uno(float %x, float %y) {
+; CHECK-LABEL: @ult_implies_uno(
+; CHECK-NEXT: [[ULT:%.*]] = fcmp ult float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[ULT]]
+;
+ %uno = fcmp uno float %x, 0.000000e+00
+ %ult = fcmp ult float %x, %y
+ %ret = or i1 %ult, %uno
+ ret i1 %ret
+}
+
+define i1 @uno_implies_uno(float %x, float %y) {
+; CHECK-LABEL: @uno_implies_uno(
+; CHECK-NEXT: [[UNO2:%.*]] = fcmp uno float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[UNO2]]
+;
+ %uno = fcmp uno float %x, 0.000000e+00
+ %uno2 = fcmp uno float %x, %y
+ %ret = or i1 %uno, %uno2
+ ret i1 %ret
+}
+
+define i1 @ult_implies_ord(float %x, float %y) {
+; CHECK-LABEL: @ult_implies_ord(
+; CHECK-NEXT: ret i1 true
+;
+ %ord = fcmp ord float %x, 0.000000e+00
+ %ult = fcmp ult float %x, %y
+ %ret = or i1 %ult, %ord
+ ret i1 %ret
+}
+
+; TODO: %cmp1 is false implies %cmp3 is true
+define float @test_ord_implies_uno(float %x) {
+; CHECK-LABEL: @test_ord_implies_uno(
+; CHECK-NEXT: [[CMP1:%.*]] = fcmp ord float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], 0.000000e+00
+; CHECK-NEXT: [[CMP3:%.*]] = fcmp uno float [[X]], 0.000000e+00
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 [[CMP3]]
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[SEL]], float 0.000000e+00, float [[X]]
+; CHECK-NEXT: ret float [[RET]]
+;
+ %cmp1 = fcmp ord float %x, 0.000000e+00
+ %cmp2 = fcmp olt float %x, 0.000000e+00
+ %cmp3 = fcmp uno float %x, 0.000000e+00
+ %sel = select i1 %cmp1, i1 %cmp2, i1 %cmp3
+ %ret = select i1 %sel, float 0.000000e+00, float %x
+ ret float %ret
+}
+
+; Negative tests
+
+define i1 @olt_implies_ord_fail(float %x, float %y, float %z) {
+; CHECK-LABEL: @olt_implies_ord_fail(
+; CHECK-NEXT: [[ORD:%.*]] = fcmp ord float [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X]], [[Y:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = and i1 [[OLT]], [[ORD]]
+; CHECK-NEXT: ret i1 [[RET]]
+;
+ %ord = fcmp ord float %x, %z
+ %olt = fcmp olt float %x, %y
+ %ret = and i1 %olt, %ord
+ ret i1 %ret
+}
+
+define i1 @ult_implies_uno_and(float %x, float %y) {
+; CHECK-LABEL: @ult_implies_uno_and(
+; CHECK-NEXT: [[UNO:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: [[ULT:%.*]] = fcmp ult float [[X]], [[Y:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = and i1 [[ULT]], [[UNO]]
+; CHECK-NEXT: ret i1 [[RET]]
+;
+ %uno = fcmp uno float %x, 0.000000e+00
+ %ult = fcmp ult float %x, %y
+ %ret = and i1 %ult, %uno
+ ret i1 %ret
+}
+
+define i1 @olt_implies_olt_fail(float %x, float %y) {
+; CHECK-LABEL: @olt_implies_olt_fail(
+; CHECK-NEXT: [[OLT:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: [[OLT2:%.*]] = fcmp olt float [[X]], [[Y:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = and i1 [[OLT]], [[OLT2]]
+; CHECK-NEXT: ret i1 [[RET]]
+;
+ %olt = fcmp olt float %x, 0.000000e+00
+ %olt2 = fcmp olt float %x, %y
+ %ret = and i1 %olt, %olt2
+ ret i1 %ret
+}
diff --git a/llvm/test/Transforms/JumpTableToSwitch/basic.ll b/llvm/test/Transforms/JumpTableToSwitch/basic.ll
new file mode 100644
index 000000000000..321f837077ab
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/basic.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s
+; RUN: opt < %s -passes=jump-table-to-switch -jump-table-to-switch-size-threshold=0 -verify-dom-info -S | FileCheck %s --check-prefix=THRESHOLD-0
+
+@func_array = constant [2 x ptr] [ptr @func0, ptr @func1]
+
+define i32 @func0() {
+ ret i32 1
+}
+
+define i32 @func1() {
+ ret i32 2
+}
+
+define i32 @function_with_jump_table(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+; THRESHOLD-0-LABEL: define i32 @function_with_jump_table(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) {
+; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; THRESHOLD-0-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]()
+; THRESHOLD-0-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep
+ %result = call i32 %func_ptr()
+ ret i32 %result
+}
+
+define i32 @basic_block_splitted_twice(i32 %index) {
+; CHECK-LABEL: define i32 @basic_block_splitted_twice(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR1:%.*]] = load ptr, ptr [[GEP1]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR2:%.*]] = load ptr, ptr [[GEP2]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE1:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_02:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_13:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable1:
+; CHECK-NEXT: unreachable
+; CHECK: call.02:
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL_TAIL:%.*]]
+; CHECK: call.13:
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @func1()
+; CHECK-NEXT: br label [[DOTTAIL_TAIL]]
+; CHECK: .tail.tail:
+; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[TMP4]], [[CALL_02]] ], [ [[TMP5]], [[CALL_13]] ]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP3]], [[TMP6]]
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+; THRESHOLD-0-LABEL: define i32 @basic_block_splitted_twice(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) {
+; THRESHOLD-0-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR1:%.*]] = load ptr, ptr [[GEP1]], align 8
+; THRESHOLD-0-NEXT: [[RESULT1:%.*]] = call i32 [[FUNC_PTR1]]()
+; THRESHOLD-0-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR2:%.*]] = load ptr, ptr [[GEP2]], align 8
+; THRESHOLD-0-NEXT: [[RESULT2:%.*]] = call i32 [[FUNC_PTR2]]()
+; THRESHOLD-0-NEXT: [[RESULT:%.*]] = add i32 [[RESULT1]], [[RESULT2]]
+; THRESHOLD-0-NEXT: ret i32 [[RESULT]]
+;
+ %gep1 = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
+ %func_ptr1 = load ptr, ptr %gep1
+ %result1 = call i32 %func_ptr1()
+ %gep2 = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
+ %func_ptr2 = load ptr, ptr %gep2
+ %result2 = call i32 %func_ptr2()
+ %result = add i32 %result1, %result2
+ ret i32 %result
+}
+
+define void @void_func0() {
+ ret void
+}
+
+define void @void_func1() {
+ ret void
+}
+
+@void_func_array = constant [2 x ptr] [ptr @void_func0, ptr @void_func1]
+
+define void @void_function_with_jump_table(i32 %index) {
+; CHECK-LABEL: define void @void_function_with_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: call void @void_func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: call void @void_func1()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: ret void
+;
+; THRESHOLD-0-LABEL: define void @void_function_with_jump_table(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) {
+; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; THRESHOLD-0-NEXT: call void [[FUNC_PTR]]()
+; THRESHOLD-0-NEXT: ret void
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep
+ call void %func_ptr()
+ ret void
+}
+
+define void @void_function_with_jump_table_and_call_site_attr(i32 %index) {
+; CHECK-LABEL: define void @void_function_with_jump_table_and_call_site_attr(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: call void @void_func0() #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: call void @void_func1() #[[ATTR0]]
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: ret void
+;
+; THRESHOLD-0-LABEL: define void @void_function_with_jump_table_and_call_site_attr(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) {
+; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; THRESHOLD-0-NEXT: call void [[FUNC_PTR]]() #[[ATTR0:[0-9]+]]
+; THRESHOLD-0-NEXT: ret void
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @void_func_array, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep
+ call void %func_ptr() nounwind
+ ret void
+}
+
+
+define i32 @func0_addrspace_42() addrspace(42) {
+ ret i32 1
+}
+
+define i32 @func1_addrspace_42() addrspace(42) {
+ ret i32 2
+}
+
+@func_array_addrspace_42 = addrspace(42) constant [2 x ptr addrspace(42)] [ptr addrspace(42) @func0_addrspace_42, ptr addrspace(42) @func1_addrspace_42]
+
+define i32 @function_with_jump_table_addrspace_42(i32 %index) addrspace(42) {
+; CHECK-LABEL: define i32 @function_with_jump_table_addrspace_42(
+; CHECK-SAME: i32 [[INDEX:%.*]]) addrspace(42) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr addrspace(42), ptr addrspace(42) [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call addrspace(42) i32 @func0_addrspace_42()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: [[TMP2:%.*]] = call addrspace(42) i32 @func1_addrspace_42()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+; THRESHOLD-0-LABEL: define i32 @function_with_jump_table_addrspace_42(
+; THRESHOLD-0-SAME: i32 [[INDEX:%.*]]) addrspace(42) {
+; THRESHOLD-0-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 [[INDEX]]
+; THRESHOLD-0-NEXT: [[FUNC_PTR:%.*]] = load ptr addrspace(42), ptr addrspace(42) [[GEP]], align 8
+; THRESHOLD-0-NEXT: [[RESULT:%.*]] = call addrspace(42) i32 [[FUNC_PTR]]()
+; THRESHOLD-0-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [2 x ptr addrspace(42)], ptr addrspace(42) @func_array_addrspace_42, i32 0, i32 %index
+ %func_ptr = load ptr addrspace(42), ptr addrspace(42) %gep, align 8
+ %result = call addrspace(42) i32 %func_ptr()
+ ret i32 %result
+}
+
diff --git a/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll b/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll
new file mode 100644
index 000000000000..f4e991131bc4
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/max_function_size.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -jump-table-to-switch-function-size-threshold=1 -verify-dom-info -S | FileCheck %s
+
+@func_array0 = constant [2 x ptr] [ptr @func0, ptr @large_func]
+
+define i32 @func0() {
+ ret i32 1
+}
+
+define i32 @large_func() {
+ %x = add i32 1, 2
+ ret i32 %x
+}
+
+define i32 @function_with_jump_table_with_large_func(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table_with_large_func(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]()
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep, align 8
+ %result = call i32 %func_ptr()
+ ret i32 %result
+}
+
diff --git a/llvm/test/Transforms/JumpTableToSwitch/remarks.ll b/llvm/test/Transforms/JumpTableToSwitch/remarks.ll
new file mode 100644
index 000000000000..84d4c1985029
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/remarks.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -passes=jump-table-to-switch -pass-remarks=jump-table-to-switch -S -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: remark: /tmp/tmp.cc:2:20: expanded indirect call into switch
+
+@func_array = constant [2 x ptr] [ptr @func0, ptr @func1]
+
+define i32 @func0() {
+ ret i32 1
+}
+
+define i32 @func1() {
+ ret i32 2
+}
+
+define i32 @function_with_jump_table(i32 %index) {
+ %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep
+ %result = call i32 %func_ptr(), !dbg !8
+ ret i32 %result
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 18.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/tmp.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 18.0.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
+!9 = !DILocation(line: 2, column: 21, scope: !6)
+!10 = !DILocation(line: 2, column: 22, scope: !6)
diff --git a/llvm/test/Transforms/JumpTableToSwitch/skip.ll b/llvm/test/Transforms/JumpTableToSwitch/skip.ll
new file mode 100644
index 000000000000..4504423a0152
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/skip.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s
+
+@func_array0 = constant [2 x ptr] [ptr @func0, ptr @declared_only_func1]
+
+define i32 @func0() {
+ ret i32 1
+}
+
+declare i32 @declared_only_func1()
+
+define i32 @function_with_jump_table_with_a_declared_only_func(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table_with_a_declared_only_func(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]]()
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep, align 8
+ %result = call i32 %func_ptr()
+ ret i32 %result
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+define i32 @function_with_jump_table_invoke(i32 %index) personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define i32 @function_with_jump_table_invoke(
+; CHECK-SAME: i32 [[INDEX:%.*]]) personality ptr @__gxx_personality_v0 {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = invoke i32 [[FUNC_PTR]]()
+; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTIONAL:%.*]]
+; CHECK: normal:
+; CHECK-NEXT: ret i32 [[RESULT]]
+; CHECK: exceptional:
+; CHECK-NEXT: [[LANDING_PAD:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: catch ptr null
+; CHECK-NEXT: resume { ptr, i32 } [[LANDING_PAD]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @func_array0, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep, align 8
+ %result = invoke i32 %func_ptr() to label %normal unwind label %exceptional
+normal:
+ ret i32 %result
+exceptional:
+ %landing_pad = landingpad { ptr, i32 } catch ptr null
+ resume { ptr, i32 } %landing_pad
+}
+
+@func_array1 = constant [1 x ptr] [ptr @func2]
+
+define i32 @func2(i32 %arg) {
+ ret i32 %arg
+}
+
+define i32 @function_with_jump_table_musttail_call(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table_musttail_call(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = musttail call i32 [[FUNC_PTR]](i32 [[INDEX]])
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep, align 8
+ %result = musttail call i32 %func_ptr(i32 %index)
+ ret i32 %result
+}
+
+define i32 @function_with_jump_table_and_volatile_load(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table_and_volatile_load(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load volatile ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]])
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index
+ %func_ptr = load volatile ptr, ptr %gep, align 8
+ %result = call i32 %func_ptr(i32 %index)
+ ret i32 %result
+}
+
+define i32 @function_with_jump_table_and_atomic_load(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table_and_atomic_load(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load atomic ptr, ptr [[GEP]] monotonic, align 8
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]])
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [1 x ptr], ptr @func_array1, i32 0, i32 %index
+ %func_ptr = load atomic ptr, ptr %gep monotonic, align 8
+ %result = call i32 %func_ptr(i32 %index)
+ ret i32 %result
+}
+
+@func_array2 = global [1 x ptr] [ptr @func2]
+
+define i32 @function_with_nonconstant_jump_table(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_nonconstant_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array2, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]])
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [1 x ptr], ptr @func_array2, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep, align 8
+ %result = call i32 %func_ptr(i32 %index)
+ ret i32 %result
+}
+
+@func_array3 = weak constant [1 x ptr] [ptr @func2]
+
+define i32 @function_with_constant_weak_jump_table(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_constant_weak_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x ptr], ptr @func_array3, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 [[FUNC_PTR]](i32 [[INDEX]])
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %gep = getelementptr inbounds [1 x ptr], ptr @func_array3, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep, align 8
+ %result = call i32 %func_ptr(i32 %index)
+ ret i32 %result
+}
+
diff --git a/llvm/test/Transforms/JumpTableToSwitch/stride.ll b/llvm/test/Transforms/JumpTableToSwitch/stride.ll
new file mode 100644
index 000000000000..ef86e9d1b9b8
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/stride.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s
+
+@func_array = constant [2 x ptr] [ptr @func0, ptr @func1]
+
+define i32 @func0() {
+ ret i32 1
+}
+
+define i32 @func1() {
+ ret i32 2
+}
+
+define i32 @check_stride(i32 %index) {
+; CHECK-LABEL: define i32 @check_stride(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x { ptr, ptr }], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ]
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %gep = getelementptr inbounds [2 x { ptr, ptr }], ptr @func_array, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep
+ %result = call i32 %func_ptr()
+ ret i32 %result
+}
+
diff --git a/llvm/test/Transforms/JumpTableToSwitch/struct.ll b/llvm/test/Transforms/JumpTableToSwitch/struct.ll
new file mode 100644
index 000000000000..7aa709c402fe
--- /dev/null
+++ b/llvm/test/Transforms/JumpTableToSwitch/struct.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=jump-table-to-switch -verify-dom-info -S | FileCheck %s
+
+%"struct_ty" = type { [2 x ptr] }
+
+@func_array = constant %"struct_ty" { [2 x ptr] [ptr @func0, ptr @func1] }
+
+define i32 @func0() {
+ ret i32 1
+}
+
+define i32 @func1() {
+ ret i32 2
+}
+
+define i32 @function_with_jump_table(i32 %index) {
+; CHECK-LABEL: define i32 @function_with_jump_table(
+; CHECK-SAME: i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 [[INDEX]]
+; CHECK-NEXT: [[FUNC_PTR:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: switch i32 [[INDEX]], label [[DEFAULT_SWITCH_CASE_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 0, label [[CALL_0:%.*]]
+; CHECK-NEXT: i32 1, label [[CALL_1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: default.switch.case.unreachable:
+; CHECK-NEXT: unreachable
+; CHECK: call.0:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @func0()
+; CHECK-NEXT: br label [[DOTTAIL:%.*]]
+; CHECK: call.1:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @func1()
+; CHECK-NEXT: br label [[DOTTAIL]]
+; CHECK: .tail:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[CALL_0]] ], [ [[TMP2]], [[CALL_1]] ]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @func_array, i32 0, i32 %index
+ %func_ptr = load ptr, ptr %gep
+ %result = call i32 %func_ptr()
+ ret i32 %result
+}
+
diff --git a/llvm/test/Transforms/LICM/expr-reassociate-int.ll b/llvm/test/Transforms/LICM/expr-reassociate-int.ll
new file mode 100644
index 000000000000..63548974fb31
--- /dev/null
+++ b/llvm/test/Transforms/LICM/expr-reassociate-int.ll
@@ -0,0 +1,364 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes='loop-mssa(licm)' -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CONSTRAINED
+; RUN: opt -passes='loop-mssa(licm)' -licm-max-num-int-reassociations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CONSTRAINED
+
+;
+; A simple loop:
+;
+; int j;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = d1 * cells[j + 1] * delta;
+;
+; ...should be transformed by the LICM pass into this:
+;
+; int j;
+; const uint64_t d1d = d1 * delta;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = d1d * cells[j + 1];
+;
+
+define void @innermost_loop_1d_shouldhoist(i32 %i, i64 %d1, i64 %delta, ptr %cells) {
+; CHECK-LABEL: define void @innermost_loop_1d_shouldhoist
+; CHECK-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[DELTA]], [[D1]]
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp sgt i32 [[J]], [[I]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.body:
+; CHECK-NEXT: [[ADD_J_1]] = add nuw nsw i32 [[J]], 1
+; CHECK-NEXT: [[IDXPROM_J_1:%.*]] = zext i32 [[ADD_J_1]] to i64
+; CHECK-NEXT: [[ARRAYIDX_J_1:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_1]]
+; CHECK-NEXT: [[CELL_1:%.*]] = load i64, ptr [[ARRAYIDX_J_1]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[MUL_1]], [[CELL_1]]
+; CHECK-NEXT: [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
+; CHECK-NEXT: [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[ARRAYIDX_J]], align 8
+; CHECK-NEXT: br label [[FOR_COND]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond:
+ %j = phi i32 [ 0, %entry ], [ %add.j.1, %for.body ]
+ %cmp.not = icmp sgt i32 %j, %i
+ br i1 %cmp.not, label %for.end, label %for.body
+
+for.body:
+ %add.j.1 = add nuw nsw i32 %j, 1
+ %idxprom.j.1 = zext i32 %add.j.1 to i64
+ %arrayidx.j.1 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.1
+ %cell.1 = load i64, ptr %arrayidx.j.1, align 8
+ %mul.1 = mul i64 %delta, %d1
+ %mul.2 = mul i64 %mul.1, %cell.1
+ %idxprom.j = zext i32 %j to i64
+ %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
+ store i64 %mul.2, ptr %arrayidx.j, align 8
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+;
+; The following loop will be modified by the 'Reassociate expressions' pass,
+;
+; int j;
+; const uint64_t d1d = d1 * delta;
+; const uint64_t d2d = d2 * delta;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = d1d * cells[j + 1] + d2d * cells[j];
+;
+; ...into this:
+;
+; int j;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = (d1 * cells[j + 1] + d2 * cells[j]) * delta;
+;
+; We expect the LICM pass to undo this transformation.
+;
+
+define void @innermost_loop_2d(i32 %i, i64 %d1, i64 %d2, i64 %delta, ptr %cells) {
+; NOT_CONSTRAINED-LABEL: define void @innermost_loop_2d
+; NOT_CONSTRAINED-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[D2:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
+; NOT_CONSTRAINED-NEXT: entry:
+; NOT_CONSTRAINED-NEXT: [[FACTOR_OP_MUL:%.*]] = mul i64 [[D1]], [[DELTA]]
+; NOT_CONSTRAINED-NEXT: [[FACTOR_OP_MUL1:%.*]] = mul i64 [[D2]], [[DELTA]]
+; NOT_CONSTRAINED-NEXT: br label [[FOR_COND:%.*]]
+; NOT_CONSTRAINED: for.cond:
+; NOT_CONSTRAINED-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
+; NOT_CONSTRAINED-NEXT: [[CMP_NOT:%.*]] = icmp sgt i32 [[J]], [[I]]
+; NOT_CONSTRAINED-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NOT_CONSTRAINED: for.body:
+; NOT_CONSTRAINED-NEXT: [[ADD_J_1]] = add nuw nsw i32 [[J]], 1
+; NOT_CONSTRAINED-NEXT: [[IDXPROM_J_1:%.*]] = zext i32 [[ADD_J_1]] to i64
+; NOT_CONSTRAINED-NEXT: [[ARRAYIDX_J_1:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_1]]
+; NOT_CONSTRAINED-NEXT: [[CELL_1:%.*]] = load i64, ptr [[ARRAYIDX_J_1]], align 8
+; NOT_CONSTRAINED-NEXT: [[MUL_1:%.*]] = mul i64 [[CELL_1]], [[FACTOR_OP_MUL]]
+; NOT_CONSTRAINED-NEXT: [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
+; NOT_CONSTRAINED-NEXT: [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
+; NOT_CONSTRAINED-NEXT: [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
+; NOT_CONSTRAINED-NEXT: [[MUL_2:%.*]] = mul i64 [[CELL_2]], [[FACTOR_OP_MUL1]]
+; NOT_CONSTRAINED-NEXT: [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
+; NOT_CONSTRAINED-NEXT: store i64 [[REASS_ADD]], ptr [[ARRAYIDX_J]], align 8
+; NOT_CONSTRAINED-NEXT: br label [[FOR_COND]]
+; NOT_CONSTRAINED: for.end:
+; NOT_CONSTRAINED-NEXT: ret void
+;
+; CONSTRAINED-LABEL: define void @innermost_loop_2d
+; CONSTRAINED-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[D2:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
+; CONSTRAINED-NEXT: entry:
+; CONSTRAINED-NEXT: br label [[FOR_COND:%.*]]
+; CONSTRAINED: for.cond:
+; CONSTRAINED-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
+; CONSTRAINED-NEXT: [[CMP_NOT:%.*]] = icmp sgt i32 [[J]], [[I]]
+; CONSTRAINED-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CONSTRAINED: for.body:
+; CONSTRAINED-NEXT: [[ADD_J_1]] = add nuw nsw i32 [[J]], 1
+; CONSTRAINED-NEXT: [[IDXPROM_J_1:%.*]] = zext i32 [[ADD_J_1]] to i64
+; CONSTRAINED-NEXT: [[ARRAYIDX_J_1:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_1]]
+; CONSTRAINED-NEXT: [[CELL_1:%.*]] = load i64, ptr [[ARRAYIDX_J_1]], align 8
+; CONSTRAINED-NEXT: [[MUL_1:%.*]] = mul i64 [[CELL_1]], [[D1]]
+; CONSTRAINED-NEXT: [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
+; CONSTRAINED-NEXT: [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
+; CONSTRAINED-NEXT: [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
+; CONSTRAINED-NEXT: [[MUL_2:%.*]] = mul i64 [[CELL_2]], [[D2]]
+; CONSTRAINED-NEXT: [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
+; CONSTRAINED-NEXT: [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD]], [[DELTA]]
+; CONSTRAINED-NEXT: store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J]], align 8
+; CONSTRAINED-NEXT: br label [[FOR_COND]]
+; CONSTRAINED: for.end:
+; CONSTRAINED-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond:
+ %j = phi i32 [ 0, %entry ], [ %add.j.1, %for.body ]
+ %cmp.not = icmp sgt i32 %j, %i
+ br i1 %cmp.not, label %for.end, label %for.body
+
+for.body:
+ %add.j.1 = add nuw nsw i32 %j, 1
+ %idxprom.j.1 = zext i32 %add.j.1 to i64
+ %arrayidx.j.1 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.1
+ %cell.1 = load i64, ptr %arrayidx.j.1, align 8
+ %mul.1 = mul i64 %cell.1, %d1
+ %idxprom.j = zext i32 %j to i64
+ %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
+ %cell.2 = load i64, ptr %arrayidx.j, align 8
+ %mul.2 = mul i64 %cell.2, %d2
+ %reass.add = add i64 %mul.2, %mul.1
+ %reass.mul = mul i64 %reass.add, %delta
+ store i64 %reass.mul, ptr %arrayidx.j, align 8
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+;
+; The following loop will be modified by the 'Reassociate expressions' pass,
+;
+; int j;
+; const uint64_t d1d = d1 * delta;
+; const uint64_t d2d = d2 * delta;
+; const uint64_t d3d = d3 * delta;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = d1d * cells[j + 1] + d2d * cells[j] + d3d * cells[j + 2];
+;
+; ...into this:
+;
+; int j;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = (d1 * cells[j + 1] + d2 * cells[j] + d3 * cells[j + 2]) * delta;
+;
+; We expect the LICM pass to undo this transformation.
+;
+
+
+define void @innermost_loop_3d(i32 %i, i64 %d1, i64 %d2, i64 %d3, i64 %delta, ptr %cells) {
+; NOT_CONSTRAINED-LABEL: define void @innermost_loop_3d
+; NOT_CONSTRAINED-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[D2:%.*]], i64 [[D3:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
+; NOT_CONSTRAINED-NEXT: entry:
+; NOT_CONSTRAINED-NEXT: [[FACTOR_OP_MUL:%.*]] = mul i64 [[D3]], [[DELTA]]
+; NOT_CONSTRAINED-NEXT: [[FACTOR_OP_MUL1:%.*]] = mul i64 [[D1]], [[DELTA]]
+; NOT_CONSTRAINED-NEXT: [[FACTOR_OP_MUL2:%.*]] = mul i64 [[D2]], [[DELTA]]
+; NOT_CONSTRAINED-NEXT: br label [[FOR_COND:%.*]]
+; NOT_CONSTRAINED: for.cond:
+; NOT_CONSTRAINED-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
+; NOT_CONSTRAINED-NEXT: [[CMP_NOT:%.*]] = icmp sgt i32 [[J]], [[I]]
+; NOT_CONSTRAINED-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NOT_CONSTRAINED: for.body:
+; NOT_CONSTRAINED-NEXT: [[ADD_J_1]] = add nuw nsw i32 [[J]], 1
+; NOT_CONSTRAINED-NEXT: [[IDXPROM_J_1:%.*]] = zext i32 [[ADD_J_1]] to i64
+; NOT_CONSTRAINED-NEXT: [[ARRAYIDX_J_1:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_1]]
+; NOT_CONSTRAINED-NEXT: [[CELL_1:%.*]] = load i64, ptr [[ARRAYIDX_J_1]], align 8
+; NOT_CONSTRAINED-NEXT: [[MUL_1:%.*]] = mul i64 [[CELL_1]], [[FACTOR_OP_MUL1]]
+; NOT_CONSTRAINED-NEXT: [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
+; NOT_CONSTRAINED-NEXT: [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
+; NOT_CONSTRAINED-NEXT: [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
+; NOT_CONSTRAINED-NEXT: [[MUL_2:%.*]] = mul i64 [[CELL_2]], [[FACTOR_OP_MUL2]]
+; NOT_CONSTRAINED-NEXT: [[ADD_J_2:%.*]] = add nuw nsw i32 [[J]], 2
+; NOT_CONSTRAINED-NEXT: [[IDXPROM_J_2:%.*]] = zext i32 [[ADD_J_2]] to i64
+; NOT_CONSTRAINED-NEXT: [[ARRAYIDX_J_2:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_2]]
+; NOT_CONSTRAINED-NEXT: [[CELL_3:%.*]] = load i64, ptr [[ARRAYIDX_J_2]], align 8
+; NOT_CONSTRAINED-NEXT: [[MUL_3:%.*]] = mul i64 [[CELL_3]], [[FACTOR_OP_MUL]]
+; NOT_CONSTRAINED-NEXT: [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
+; NOT_CONSTRAINED-NEXT: [[REASS_ADD1:%.*]] = add i64 [[REASS_ADD]], [[MUL_3]]
+; NOT_CONSTRAINED-NEXT: store i64 [[REASS_ADD1]], ptr [[ARRAYIDX_J_2]], align 8
+; NOT_CONSTRAINED-NEXT: br label [[FOR_COND]]
+; NOT_CONSTRAINED: for.end:
+; NOT_CONSTRAINED-NEXT: ret void
+;
+; CONSTRAINED-LABEL: define void @innermost_loop_3d
+; CONSTRAINED-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[D2:%.*]], i64 [[D3:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
+; CONSTRAINED-NEXT: entry:
+; CONSTRAINED-NEXT: br label [[FOR_COND:%.*]]
+; CONSTRAINED: for.cond:
+; CONSTRAINED-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
+; CONSTRAINED-NEXT: [[CMP_NOT:%.*]] = icmp sgt i32 [[J]], [[I]]
+; CONSTRAINED-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CONSTRAINED: for.body:
+; CONSTRAINED-NEXT: [[ADD_J_1]] = add nuw nsw i32 [[J]], 1
+; CONSTRAINED-NEXT: [[IDXPROM_J_1:%.*]] = zext i32 [[ADD_J_1]] to i64
+; CONSTRAINED-NEXT: [[ARRAYIDX_J_1:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_1]]
+; CONSTRAINED-NEXT: [[CELL_1:%.*]] = load i64, ptr [[ARRAYIDX_J_1]], align 8
+; CONSTRAINED-NEXT: [[MUL_1:%.*]] = mul i64 [[CELL_1]], [[D1]]
+; CONSTRAINED-NEXT: [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
+; CONSTRAINED-NEXT: [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
+; CONSTRAINED-NEXT: [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
+; CONSTRAINED-NEXT: [[MUL_2:%.*]] = mul i64 [[CELL_2]], [[D2]]
+; CONSTRAINED-NEXT: [[ADD_J_2:%.*]] = add nuw nsw i32 [[J]], 2
+; CONSTRAINED-NEXT: [[IDXPROM_J_2:%.*]] = zext i32 [[ADD_J_2]] to i64
+; CONSTRAINED-NEXT: [[ARRAYIDX_J_2:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_2]]
+; CONSTRAINED-NEXT: [[CELL_3:%.*]] = load i64, ptr [[ARRAYIDX_J_2]], align 8
+; CONSTRAINED-NEXT: [[MUL_3:%.*]] = mul i64 [[CELL_3]], [[D3]]
+; CONSTRAINED-NEXT: [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
+; CONSTRAINED-NEXT: [[REASS_ADD1:%.*]] = add i64 [[REASS_ADD]], [[MUL_3]]
+; CONSTRAINED-NEXT: [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD1]], [[DELTA]]
+; CONSTRAINED-NEXT: store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J_2]], align 8
+; CONSTRAINED-NEXT: br label [[FOR_COND]]
+; CONSTRAINED: for.end:
+; CONSTRAINED-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond:
+ %j = phi i32 [ 0, %entry ], [ %add.j.1, %for.body ]
+ %cmp.not = icmp sgt i32 %j, %i
+ br i1 %cmp.not, label %for.end, label %for.body
+
+for.body:
+ %add.j.1 = add nuw nsw i32 %j, 1
+ %idxprom.j.1 = zext i32 %add.j.1 to i64
+ %arrayidx.j.1 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.1
+ %cell.1 = load i64, ptr %arrayidx.j.1, align 8
+ %mul.1 = mul i64 %cell.1, %d1
+ %idxprom.j = zext i32 %j to i64
+ %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
+ %cell.2 = load i64, ptr %arrayidx.j, align 8
+ %mul.2 = mul i64 %cell.2, %d2
+ %add.j.2 = add nuw nsw i32 %j, 2
+ %idxprom.j.2 = zext i32 %add.j.2 to i64
+ %arrayidx.j.2 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.2
+ %cell.3 = load i64, ptr %arrayidx.j.2, align 8
+ %mul.3 = mul i64 %cell.3, %d3
+ %reass.add = add i64 %mul.2, %mul.1
+ %reass.add1 = add i64 %reass.add, %mul.3
+ %reass.mul = mul i64 %reass.add1, %delta
+ store i64 %reass.mul, ptr %arrayidx.j.2, align 8
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+;
+; The following loop will not be modified by the LICM pass:
+;
+; int j;
+;
+; for (j = 0; j <= i; j++)
+; cells[j] = (d1 * cells[j + 1] + d2 * cells[j] +
+; cells[j] * cells[j + 1]) * delta;
+;
+; This case differs as one of the multiplications involves no invariants.
+;
+
+define void @innermost_loop_3d_reassociated_different(i32 %i, i64 %d1, i64 %d2, i64 %delta, ptr %cells) {
+; CHECK-LABEL: define void @innermost_loop_3d_reassociated_different
+; CHECK-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[D2:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp sgt i32 [[J]], [[I]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.body:
+; CHECK-NEXT: [[ADD_J_1]] = add nuw nsw i32 [[J]], 1
+; CHECK-NEXT: [[IDXPROM_J_1:%.*]] = zext i32 [[ADD_J_1]] to i64
+; CHECK-NEXT: [[ARRAYIDX_J_1:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_1]]
+; CHECK-NEXT: [[CELL_1:%.*]] = load i64, ptr [[ARRAYIDX_J_1]], align 8
+; CHECK-NEXT: [[IDXPROM_J_2:%.*]] = zext i32 [[ADD_J_1]] to i64
+; CHECK-NEXT: [[ARRAYIDX_J_2:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_2]]
+; CHECK-NEXT: [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J_2]], align 8
+; CHECK-NEXT: [[CELL_3:%.*]] = load i64, ptr [[ARRAYIDX_J_2]], align 8
+; CHECK-NEXT: [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
+; CHECK-NEXT: [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
+; CHECK-NEXT: [[CELL_4:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[CELL_1]], [[D1]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[CELL_4]], [[D2]]
+; CHECK-NEXT: [[EXTRA_MUL:%.*]] = mul i64 [[CELL_3]], [[CELL_2]]
+; CHECK-NEXT: [[REASS_ADD:%.*]] = add i64 [[EXTRA_MUL]], [[MUL_1]]
+; CHECK-NEXT: [[EXTRA_ADD:%.*]] = add i64 [[REASS_ADD]], [[MUL_2]]
+; CHECK-NEXT: [[REASS_MUL:%.*]] = mul i64 [[EXTRA_ADD]], [[DELTA]]
+; CHECK-NEXT: store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J]], align 8
+; CHECK-NEXT: br label [[FOR_COND]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond:
+ %j = phi i32 [ 0, %entry ], [ %add.j.1, %for.body ]
+ %cmp.not = icmp sgt i32 %j, %i
+ br i1 %cmp.not, label %for.end, label %for.body
+
+for.body:
+ %add.j.1 = add nuw nsw i32 %j, 1
+ %idxprom.j.1 = zext i32 %add.j.1 to i64
+ %arrayidx.j.1 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.1
+ %cell.1 = load i64, ptr %arrayidx.j.1, align 8
+ %idxprom.j.2 = zext i32 %add.j.1 to i64
+ %arrayidx.j.2 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.2
+ %cell.2 = load i64, ptr %arrayidx.j.2, align 8
+ %idxprom.j.3 = zext i32 %add.j.1 to i64
+ %cell.3 = load i64, ptr %arrayidx.j.2, align 8
+ %idxprom.j = zext i32 %j to i64
+ %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
+ %cell.4 = load i64, ptr %arrayidx.j, align 8
+ %mul.1 = mul i64 %cell.1, %d1
+ %mul.2 = mul i64 %cell.4, %d2
+ %extra.mul = mul i64 %cell.3, %cell.2
+ %reass.add = add i64 %extra.mul, %mul.1
+ %extra.add = add i64 %reass.add, %mul.2
+ %reass.mul = mul i64 %extra.add, %delta
+ store i64 %reass.mul, ptr %arrayidx.j, align 8
+ br label %for.cond
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index bf2cc77e6f47..38577a5a1256 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -3,7 +3,7 @@
; RUN: opt < %s -passes=licm -S | FileCheck %s
-target triple = "aarch64--linux-gnueabi"
+target triple = "aarch64"
define ptr @test1(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-LABEL: @test1(
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
index 2feca405eea7..ac42acb932fc 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
@@ -36,7 +36,7 @@ define void @foo(ptr %A, i32 %N, i32 %M) {
; CHECK-NEXT: tail call void @f(ptr [[ARRAYIDX_US]])
; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1
; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.cond.cleanup.loopexit:
@@ -143,7 +143,7 @@ define void @foo2_sext(ptr nocapture readonly %A, i32 %N, i32 %M) {
; CHECK-NEXT: tail call void @g(i32 [[TMP2]])
; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1
; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.cond1.preheader:
@@ -1005,7 +1005,7 @@ define void @foo_M_sext(ptr %A, i32 %N, i16 %M) {
; CHECK-NEXT: tail call void @f(ptr [[ARRAYIDX_US]])
; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1
; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.cond.cleanup.loopexit:
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
index 946b98420249..7b1caa70387f 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
@@ -45,12 +45,12 @@ define dso_local i32 @fn1() local_unnamed_addr #0 {
; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP7]]
; CHECK-NEXT: store i32 32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i64 [[INDVAR]], 1
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[J_014_US]], 1
; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP1]]
; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY3_US]], label [[FOR_COND1_FOR_INC4_CRIT_EDGE_US]]
; CHECK: for.cond1.for.inc4_crit_edge.us:
-; CHECK-NEXT: [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i64 [[INDVAR2]], 1
; CHECK-NEXT: [[INC5_US]] = add nuw nsw i32 [[I_016_US]], 1
; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[TMP3]]
; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END6_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
index df8ee6ff0750..6e6c045661c2 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
@@ -25,7 +25,7 @@ define i16 @foo() {
; CHECK-NEXT: ret i16 [[ADD5_LCSSA_LCSSA]]
; CHECK: for.cond.cleanup3:
; CHECK-NEXT: [[ADD5_LCSSA]] = phi i16 [ [[ADD5:%.*]], [[FOR_BODY4]] ]
-; CHECK-NEXT: [[INDVAR_NEXT3]] = add i32 [[INDVAR2]], 1
+; CHECK-NEXT: [[INDVAR_NEXT3]] = add nuw nsw i32 [[INDVAR2]], 1
; CHECK-NEXT: [[INC7]] = add nuw nsw i16 [[I_013]], 1
; CHECK-NEXT: [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], 4
; CHECK-NEXT: br i1 [[EXITCOND14_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -39,7 +39,7 @@ define i16 @foo() {
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], ptr @v, i16 0, i16 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
+; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i32 [[INDVAR]], 1
; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[J_011]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT]], 16
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]]
diff --git a/llvm/test/Transforms/LoopReroll/basic.ll b/llvm/test/Transforms/LoopReroll/basic.ll
deleted file mode 100644
index 92d345650575..000000000000
--- a/llvm/test/Transforms/LoopReroll/basic.ll
+++ /dev/null
@@ -1,976 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; int foo(int a);
-; void bar(int *x) {
-; for (int i = 0; i < 500; i += 3) {
-; foo(i);
-; foo(i+1);
-; foo(i+2);
-; }
-; }
-
-define void @bar(ptr nocapture readnone %x) #0 {
-; CHECK-LABEL: define void @bar
-; CHECK-SAME: (ptr nocapture readnone [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[INDVAR]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[INDVAR]], 500
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
- %call = tail call i32 @foo(i32 %i.08) #1
- %add = add nsw i32 %i.08, 1
- %call1 = tail call i32 @foo(i32 %add) #1
- %add2 = add nsw i32 %i.08, 2
- %call3 = tail call i32 @foo(i32 %add2) #1
- %add3 = add nsw i32 %i.08, 3
- %exitcond = icmp sge i32 %add3, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-declare i32 @foo(i32)
-
-; void hi1(int *x) {
-; for (int i = 0; i < 1500; i += 3) {
-; x[i] = foo(0);
-; x[i+1] = foo(0);
-; x[i+2] = foo(0);
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @hi1(ptr nocapture %x) #0 {
-; CHECK-LABEL: define void @hi1
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVAR]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 1499
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %call = tail call i32 @foo(i32 0) #1
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv
- store i32 %call, ptr %arrayidx, align 4
- %call1 = tail call i32 @foo(i32 0) #1
- %0 = add nsw i64 %indvars.iv, 1
- %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %0
- store i32 %call1, ptr %arrayidx3, align 4
- %call4 = tail call i32 @foo(i32 0) #1
- %1 = add nsw i64 %indvars.iv, 2
- %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %call4, ptr %arrayidx7, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
- %2 = trunc i64 %indvars.iv.next to i32
- %cmp = icmp slt i32 %2, 1500
- br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; void hi2(int *x) {
-; for (int i = 0; i < 500; ++i) {
-; x[3*i] = foo(0);
-; x[3*i+1] = foo(0);
-; x[3*i+2] = foo(0);
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @hi2(ptr nocapture %x) #0 {
-; CHECK-LABEL: define void @hi2
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %call = tail call i32 @foo(i32 0) #1
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0
- store i32 %call, ptr %arrayidx, align 4
- %call1 = tail call i32 @foo(i32 0) #1
- %1 = add nsw i64 %0, 1
- %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %call1, ptr %arrayidx4, align 4
- %call5 = tail call i32 @foo(i32 0) #1
- %2 = add nsw i64 %0, 2
- %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2
- store i32 %call5, ptr %arrayidx9, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; void goo(float alpha, float *a, float *b) {
-; for (int i = 0; i < 3200; i += 5) {
-; a[i] += alpha * b[i];
-; a[i + 1] += alpha * b[i + 1];
-; a[i + 2] += alpha * b[i + 2];
-; a[i + 3] += alpha * b[i + 3];
-; a[i + 4] += alpha * b[i + 4];
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @goo(float %alpha, ptr nocapture %a, ptr nocapture readonly %b) #0 {
-; CHECK-LABEL: define void @goo
-; CHECK-SAME: (float [[ALPHA:%.*]], ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVAR]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[ALPHA]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVAR]]
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL]]
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 3199
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
- %0 = load float, ptr %arrayidx, align 4
- %mul = fmul float %0, %alpha
- %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
- %1 = load float, ptr %arrayidx2, align 4
- %add = fadd float %1, %mul
- store float %add, ptr %arrayidx2, align 4
- %2 = add nsw i64 %indvars.iv, 1
- %arrayidx5 = getelementptr inbounds float, ptr %b, i64 %2
- %3 = load float, ptr %arrayidx5, align 4
- %mul6 = fmul float %3, %alpha
- %arrayidx9 = getelementptr inbounds float, ptr %a, i64 %2
- %4 = load float, ptr %arrayidx9, align 4
- %add10 = fadd float %4, %mul6
- store float %add10, ptr %arrayidx9, align 4
- %5 = add nsw i64 %indvars.iv, 2
- %arrayidx13 = getelementptr inbounds float, ptr %b, i64 %5
- %6 = load float, ptr %arrayidx13, align 4
- %mul14 = fmul float %6, %alpha
- %arrayidx17 = getelementptr inbounds float, ptr %a, i64 %5
- %7 = load float, ptr %arrayidx17, align 4
- %add18 = fadd float %7, %mul14
- store float %add18, ptr %arrayidx17, align 4
- %8 = add nsw i64 %indvars.iv, 3
- %arrayidx21 = getelementptr inbounds float, ptr %b, i64 %8
- %9 = load float, ptr %arrayidx21, align 4
- %mul22 = fmul float %9, %alpha
- %arrayidx25 = getelementptr inbounds float, ptr %a, i64 %8
- %10 = load float, ptr %arrayidx25, align 4
- %add26 = fadd float %10, %mul22
- store float %add26, ptr %arrayidx25, align 4
- %11 = add nsw i64 %indvars.iv, 4
- %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %11
- %12 = load float, ptr %arrayidx29, align 4
- %mul30 = fmul float %12, %alpha
- %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11
- %13 = load float, ptr %arrayidx33, align 4
- %add34 = fadd float %13, %mul30
- store float %add34, ptr %arrayidx33, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
- %14 = trunc i64 %indvars.iv.next to i32
- %cmp = icmp slt i32 %14, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; void hoo(float alpha, float *a, float *b, int *ip) {
-; for (int i = 0; i < 3200; i += 5) {
-; a[i] += alpha * b[ip[i]];
-; a[i + 1] += alpha * b[ip[i + 1]];
-; a[i + 2] += alpha * b[ip[i + 2]];
-; a[i + 3] += alpha * b[ip[i + 3]];
-; a[i + 4] += alpha * b[ip[i + 4]];
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @hoo(float %alpha, ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %ip) #0 {
-; CHECK-LABEL: define void @hoo
-; CHECK-SAME: (float [[ALPHA:%.*]], ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[IP:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IP]], i64 [[INDVAR]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP1]] to i64
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP2]], [[ALPHA]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVAR]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[MUL]]
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 3199
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %ip, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %idxprom1 = sext i32 %0 to i64
- %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %idxprom1
- %1 = load float, ptr %arrayidx2, align 4
- %mul = fmul float %1, %alpha
- %arrayidx4 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
- %2 = load float, ptr %arrayidx4, align 4
- %add = fadd float %2, %mul
- store float %add, ptr %arrayidx4, align 4
- %3 = add nsw i64 %indvars.iv, 1
- %arrayidx7 = getelementptr inbounds i32, ptr %ip, i64 %3
- %4 = load i32, ptr %arrayidx7, align 4
- %idxprom8 = sext i32 %4 to i64
- %arrayidx9 = getelementptr inbounds float, ptr %b, i64 %idxprom8
- %5 = load float, ptr %arrayidx9, align 4
- %mul10 = fmul float %5, %alpha
- %arrayidx13 = getelementptr inbounds float, ptr %a, i64 %3
- %6 = load float, ptr %arrayidx13, align 4
- %add14 = fadd float %6, %mul10
- store float %add14, ptr %arrayidx13, align 4
- %7 = add nsw i64 %indvars.iv, 2
- %arrayidx17 = getelementptr inbounds i32, ptr %ip, i64 %7
- %8 = load i32, ptr %arrayidx17, align 4
- %idxprom18 = sext i32 %8 to i64
- %arrayidx19 = getelementptr inbounds float, ptr %b, i64 %idxprom18
- %9 = load float, ptr %arrayidx19, align 4
- %mul20 = fmul float %9, %alpha
- %arrayidx23 = getelementptr inbounds float, ptr %a, i64 %7
- %10 = load float, ptr %arrayidx23, align 4
- %add24 = fadd float %10, %mul20
- store float %add24, ptr %arrayidx23, align 4
- %11 = add nsw i64 %indvars.iv, 3
- %arrayidx27 = getelementptr inbounds i32, ptr %ip, i64 %11
- %12 = load i32, ptr %arrayidx27, align 4
- %idxprom28 = sext i32 %12 to i64
- %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %idxprom28
- %13 = load float, ptr %arrayidx29, align 4
- %mul30 = fmul float %13, %alpha
- %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11
- %14 = load float, ptr %arrayidx33, align 4
- %add34 = fadd float %14, %mul30
- store float %add34, ptr %arrayidx33, align 4
- %15 = add nsw i64 %indvars.iv, 4
- %arrayidx37 = getelementptr inbounds i32, ptr %ip, i64 %15
- %16 = load i32, ptr %arrayidx37, align 4
- %idxprom38 = sext i32 %16 to i64
- %arrayidx39 = getelementptr inbounds float, ptr %b, i64 %idxprom38
- %17 = load float, ptr %arrayidx39, align 4
- %mul40 = fmul float %17, %alpha
- %arrayidx43 = getelementptr inbounds float, ptr %a, i64 %15
- %18 = load float, ptr %arrayidx43, align 4
- %add44 = fadd float %18, %mul40
- store float %add44, ptr %arrayidx43, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
- %19 = trunc i64 %indvars.iv.next to i32
- %cmp = icmp slt i32 %19, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-
-
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; void multi1(int *x) {
-; y = foo(0)
-; for (int i = 0; i < 500; ++i) {
-; x[3*i] = y;
-; x[3*i+1] = y;
-; x[3*i+2] = y;
-; x[3*i+6] = y;
-; x[3*i+7] = y;
-; x[3*i+8] = y;
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @multi1(ptr nocapture %x) #0 {
-; CHECK-LABEL: define void @multi1
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 6
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %call = tail call i32 @foo(i32 0) #1
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0
- store i32 %call, ptr %arrayidx, align 4
- %1 = add nsw i64 %0, 1
- %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %call, ptr %arrayidx4, align 4
- %2 = add nsw i64 %0, 2
- %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2
- store i32 %call, ptr %arrayidx9, align 4
- %3 = add nsw i64 %0, 6
- %arrayidx6 = getelementptr inbounds i32, ptr %x, i64 %3
- store i32 %call, ptr %arrayidx6, align 4
- %4 = add nsw i64 %0, 7
- %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %4
- store i32 %call, ptr %arrayidx7, align 4
- %5 = add nsw i64 %0, 8
- %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %5
- store i32 %call, ptr %arrayidx8, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; void multi2(int *x) {
-; y = foo(0)
-; for (int i = 0; i < 500; ++i) {
-; x[3*i] = y;
-; x[3*i+1] = y;
-; x[3*i+2] = y;
-; x[3*(i+1)] = y;
-; x[3*(i+1)+1] = y;
-; x[3*(i+1)+2] = y;
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @multi2(ptr nocapture %x) #0 {
-; CHECK-LABEL: define void @multi2
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %call = tail call i32 @foo(i32 0) #1
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %add = add nsw i64 %indvars.iv, 1
- %newmul = mul nsw i64 %add, 3
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0
- store i32 %call, ptr %arrayidx, align 4
- %1 = add nsw i64 %0, 1
- %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %call, ptr %arrayidx4, align 4
- %2 = add nsw i64 %0, 2
- %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2
- store i32 %call, ptr %arrayidx9, align 4
- %arrayidx6 = getelementptr inbounds i32, ptr %x, i64 %newmul
- store i32 %call, ptr %arrayidx6, align 4
- %3 = add nsw i64 %newmul, 1
- %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3
- store i32 %call, ptr %arrayidx7, align 4
- %4 = add nsw i64 %newmul, 2
- %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %4
- store i32 %call, ptr %arrayidx8, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; void multi3(int *x) {
-; y = foo(0)
-; for (int i = 0; i < 500; ++i) {
-; // Note: No zero index
-; x[3*i+3] = y;
-; x[3*i+4] = y;
-; x[3*i+5] = y;
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @multi3(ptr nocapture %x) #0 {
-; CHECK-LABEL: define void @multi3
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[TMP0]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %call = tail call i32 @foo(i32 0) #1
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %x0 = add nsw i64 %0, 3
- %add = add nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %x0
- store i32 %call, ptr %arrayidx, align 4
- %1 = add nsw i64 %0, 4
- %arrayidx4 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %call, ptr %arrayidx4, align 4
- %2 = add nsw i64 %0, 5
- %arrayidx9 = getelementptr inbounds i32, ptr %x, i64 %2
- store i32 %call, ptr %arrayidx9, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-
-for.end: ; preds = %for.body
- ret void
-}
-
-; int foo(int a);
-; void bar2(int *x, int y, int z) {
-; for (int i = 0; i < 500; i += 3) {
-; foo(i+y+i*z); // Slightly reordered instruction order
-; foo(i+1+y+(i+1)*z);
-; foo(i+2+y+(i+2)*z);
-; }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @bar2(ptr nocapture readnone %x, i32 %y, i32 %z) #0 {
-; CHECK-LABEL: define void @bar2
-; CHECK-SAME: (ptr nocapture readnone [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDVAR]], [[Y]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[INDVAR]], [[Z]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[TMP3]]) #[[ATTR1]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[INDVAR]], 500
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
-
- %tmp1 = add i32 %i.08, %y
- %tmp2 = mul i32 %i.08, %z
- %tmp3 = add i32 %tmp2, %tmp1
- %call = tail call i32 @foo(i32 %tmp3) #1
-
- %add = add nsw i32 %i.08, 1
- %tmp2a = mul i32 %add, %z
- %tmp1a = add i32 %add, %y
- %tmp3a = add i32 %tmp2a, %tmp1a
- %calla = tail call i32 @foo(i32 %tmp3a) #1
-
- %add2 = add nsw i32 %i.08, 2
- %tmp2b = mul i32 %add2, %z
- %tmp1b = add i32 %add2, %y
- %tmp3b = add i32 %tmp2b, %tmp1b
- %callb = tail call i32 @foo(i32 %tmp3b) #1
-
- %add3 = add nsw i32 %i.08, 3
-
- %exitcond = icmp sge i32 %add3, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-%struct.s = type { i32, i32 }
-
-; Function Attrs: nounwind uwtable
-define void @gep1(ptr nocapture %x) #0 {
-; CHECK-LABEL: define void @gep1
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = mul nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[X]], i64 [[TMP0]], i32 0
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[X]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[X]], i64 [[TMP2]], i32 0
-; CHECK-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 500
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %call = tail call i32 @foo(i32 0) #1
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds %struct.s, ptr %x, i64 %0, i32 0
- store i32 %call, ptr %arrayidx, align 4
- %1 = add nsw i64 %0, 1
- %arrayidx4 = getelementptr inbounds %struct.s, ptr %x, i64 %1, i32 0
- store i32 %call, ptr %arrayidx4, align 4
- %2 = add nsw i64 %0, 2
- %arrayidx9 = getelementptr inbounds %struct.s, ptr %x, i64 %2, i32 0
- store i32 %call, ptr %arrayidx9, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-; This test is a crash test only.
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @gep-indexing(ptr nocapture %x) {
-; CHECK-LABEL: define void @gep-indexing
-; CHECK-SAME: (ptr nocapture [[X:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 0) #[[ATTR1]]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[X]], i64 [[TMP0]]
-; CHECK-NEXT: store i32 [[CALL]], ptr [[SCEVGEP]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1499
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %call = tail call i32 @foo(i32 0) #1
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %0
- store i32 %call, ptr %arrayidx, align 4
- %arrayidx4 = getelementptr inbounds i32, ptr %arrayidx, i64 1
- store i32 %call, ptr %arrayidx4, align 4
- %arrayidx9 = getelementptr inbounds i32, ptr %arrayidx, i64 2
- store i32 %call, ptr %arrayidx9, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-
-define void @unordered_atomic_ops(ptr noalias %buf_0, ptr noalias %buf_1) {
-; CHECK-LABEL: define void @unordered_atomic_ops
-; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVAR]]
-; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVAR]]
-; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4
-; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] unordered, align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], 3199
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
- %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %indvars.iv.next = add i32 %indvars.iv, 2
- %indvars.mid = add i32 %indvars.iv, 1
- %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv
- %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid
- %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv
- %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid
- %va = load atomic i32, ptr %buf0_a unordered, align 4
- %vb = load atomic i32, ptr %buf0_b unordered, align 4
- store atomic i32 %va, ptr %buf1_a unordered, align 4
- store atomic i32 %vb, ptr %buf1_b unordered, align 4
- %cmp = icmp slt i32 %indvars.iv.next, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-for.end:
- ret void
-}
-
-define void @unordered_atomic_ops_nomatch(ptr noalias %buf_0, ptr noalias %buf_1) {
-; Negative test
-; CHECK-LABEL: define void @unordered_atomic_ops_nomatch
-; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]]
-; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]]
-; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4
-; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] unordered, align 4
-; CHECK-NEXT: store i32 [[VA]], ptr [[BUF1_A]], align 4
-; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] unordered, align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
-
- %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %indvars.iv.next = add i32 %indvars.iv, 2
- %indvars.mid = add i32 %indvars.iv, 1
- %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv
- %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid
- %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv
- %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid
- %va = load atomic i32, ptr %buf0_a unordered, align 4
- %vb = load atomic i32, ptr %buf0_b unordered, align 4
- store i32 %va, ptr %buf1_a, align 4 ;; Not atomic
- store atomic i32 %vb, ptr %buf1_b unordered, align 4
- %cmp = icmp slt i32 %indvars.iv.next, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-for.end:
- ret void
-}
-
-define void @ordered_atomic_ops(ptr noalias %buf_0, ptr noalias %buf_1) {
-; Negative test
-; CHECK-LABEL: define void @ordered_atomic_ops
-; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]]
-; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]]
-; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] acquire, align 4
-; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] acquire, align 4
-; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] release, align 4
-; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] release, align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
-
- %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %indvars.iv.next = add i32 %indvars.iv, 2
- %indvars.mid = add i32 %indvars.iv, 1
- %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv
- %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid
- %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv
- %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid
- %va = load atomic i32, ptr %buf0_a acquire, align 4
- %vb = load atomic i32, ptr %buf0_b acquire, align 4
- store atomic i32 %va, ptr %buf1_a release, align 4
- store atomic i32 %vb, ptr %buf1_b release, align 4
- %cmp = icmp slt i32 %indvars.iv.next, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-for.end:
- ret void
-}
-
-define void @unordered_atomic_ops_with_fence(ptr noalias %buf_0, ptr noalias %buf_1) {
-; CHECK-LABEL: define void @unordered_atomic_ops_with_fence
-; CHECK-SAME: (ptr noalias [[BUF_0:%.*]], ptr noalias [[BUF_1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[INDVARS_MID:%.*]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[BUF0_A:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[BUF0_B:%.*]] = getelementptr i32, ptr [[BUF_0]], i32 [[INDVARS_MID]]
-; CHECK-NEXT: [[BUF1_A:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[BUF1_B:%.*]] = getelementptr i32, ptr [[BUF_1]], i32 [[INDVARS_MID]]
-; CHECK-NEXT: [[VA:%.*]] = load atomic i32, ptr [[BUF0_A]] unordered, align 4
-; CHECK-NEXT: [[VB:%.*]] = load atomic i32, ptr [[BUF0_B]] unordered, align 4
-; CHECK-NEXT: fence seq_cst
-; CHECK-NEXT: store atomic i32 [[VA]], ptr [[BUF1_A]] unordered, align 4
-; CHECK-NEXT: store atomic i32 [[VB]], ptr [[BUF1_B]] unordered, align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVARS_IV_NEXT]], 3200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body:
-
- %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %indvars.iv.next = add i32 %indvars.iv, 2
- %indvars.mid = add i32 %indvars.iv, 1
- %buf0_a = getelementptr i32, ptr %buf_0, i32 %indvars.iv
- %buf0_b = getelementptr i32, ptr %buf_0, i32 %indvars.mid
- %buf1_a = getelementptr i32, ptr %buf_1, i32 %indvars.iv
- %buf1_b = getelementptr i32, ptr %buf_1, i32 %indvars.mid
- %va = load atomic i32, ptr %buf0_a unordered, align 4
- %vb = load atomic i32, ptr %buf0_b unordered, align 4
- fence seq_cst
- store atomic i32 %va, ptr %buf1_a unordered, align 4
- store atomic i32 %vb, ptr %buf1_b unordered, align 4
- %cmp = icmp slt i32 %indvars.iv.next, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-for.end:
- ret void
-}
-
-define void @pointer_bitcast_baseinst(ptr %arg, ptr %arg1, i64 %arg2) {
-; CHECK-LABEL: define void @pointer_bitcast_baseinst
-; CHECK-SAME: (ptr [[ARG:%.*]], ptr [[ARG1:%.*]], i64 [[ARG2:%.*]]) {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG2]], -17
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: br label [[BB3:%.*]]
-; CHECK: bb3:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[BB3]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[INDVAR]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1
-; CHECK-NEXT: [[INST5:%.*]] = shl nuw i64 [[TMP5]], 1
-; CHECK-NEXT: [[INST6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INST5]]
-; CHECK-NEXT: [[INST8:%.*]] = load <8 x i16>, ptr [[INST6]], align 2
-; CHECK-NEXT: [[INST13:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[TMP5]]
-; CHECK-NEXT: store <8 x i16> [[INST8]], ptr [[INST13]], align 2
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP3]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19:%.*]], label [[BB3]]
-; CHECK: bb19:
-; CHECK-NEXT: ret void
-;
-bb:
- br label %bb3
-
-bb3: ; preds = %bb3, %bb
- %inst = phi i64 [ 1, %bb ], [ %inst17, %bb3 ]
- %inst4 = add nuw i64 %inst, 8
- %inst5 = shl nuw i64 %inst, 1
- %inst6 = getelementptr i8, ptr %arg1, i64 %inst5
- %inst8 = load <8 x i16>, ptr %inst6, align 2
- %inst9 = shl i64 %inst4, 1
- %inst10 = getelementptr i8, ptr %arg1, i64 %inst9
- %inst12 = load <8 x i16>, ptr %inst10, align 2
- %inst13 = getelementptr i16, ptr %arg, i64 %inst
- store <8 x i16> %inst8, ptr %inst13, align 2
- %inst15 = getelementptr i16, ptr %arg, i64 %inst4
- store <8 x i16> %inst12, ptr %inst15, align 2
- %inst17 = add nuw nsw i64 %inst, 16
- %inst18 = icmp eq i64 %inst17, %arg2
- br i1 %inst18, label %bb19, label %bb3
-
-bb19: ; preds = %bb3
- ret void
-}
-
-define void @bad_step(ptr nocapture readnone %x) #0 {
-; CHECK-LABEL: define void @bad_step
-; CHECK-SAME: (ptr nocapture readnone [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo(i32 [[I_08]]) #[[ATTR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I_08]], 2
-; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 @foo(i32 [[ADD]]) #[[ATTR1]]
-; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[I_08]], 3
-; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 @foo(i32 [[ADD2]]) #[[ATTR1]]
-; CHECK-NEXT: [[ADD3]] = add nsw i32 [[I_08]], 6
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sge i32 [[ADD3]], 500
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
- %call = tail call i32 @foo(i32 %i.08) #1
- %add = add nsw i32 %i.08, 2
- %call1 = tail call i32 @foo(i32 %add) #1
- %add2 = add nsw i32 %i.08, 3
- %call3 = tail call i32 @foo(i32 %add2) #1
- %add3 = add nsw i32 %i.08, 6
- %exitcond = icmp sge i32 %add3, 500
- br i1 %exitcond, label %for.end, label %for.body
-
-
-for.end: ; preds = %for.body
- ret void
-}
-
-@a = external global [2 x [512 x i64]], align 16
-@b = external global [512 x [4 x i64]], align 16
-
-define void @ptr_step_crash() {
-; CHECK-LABEL: define void @ptr_step_crash() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY42_3:%.*]]
-; CHECK: for.body42.3:
-; CHECK-NEXT: [[K_2207_3:%.*]] = phi i32 [ -512, [[ENTRY:%.*]] ], [ [[INC63_3:%.*]], [[FOR_BODY42_3]] ]
-; CHECK-NEXT: [[SUB46_3:%.*]] = add nsw i32 [[K_2207_3]], 512
-; CHECK-NEXT: [[IDXPROM47_3:%.*]] = zext i32 [[SUB46_3]] to i64
-; CHECK-NEXT: [[ARRAYIDX48_3:%.*]] = getelementptr inbounds [2 x [512 x i64]], ptr @a, i64 0, i64 0, i64 [[IDXPROM47_3]]
-; CHECK-NEXT: [[ARRAYIDX55_3:%.*]] = getelementptr inbounds [512 x [4 x i64]], ptr @b, i64 0, i64 [[IDXPROM47_3]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX55_3]], align 8
-; CHECK-NEXT: [[INC63_3]] = add nsw i32 [[K_2207_3]], 1
-; CHECK-NEXT: br i1 true, label [[FOR_INC65_3:%.*]], label [[FOR_BODY42_3]]
-; CHECK: for.inc65.3:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body42.3
-
-for.body42.3: ; preds = %for.body42.3, %entry
- %k.2207.3 = phi i32 [ -512, %entry ], [ %inc63.3, %for.body42.3 ]
- %sub46.3 = add nsw i32 %k.2207.3, 512
- %idxprom47.3 = zext i32 %sub46.3 to i64
- %arrayidx48.3 = getelementptr inbounds [2 x [512 x i64]], ptr @a, i64 0, i64 0, i64 %idxprom47.3
- %arrayidx55.3 = getelementptr inbounds [512 x [4 x i64]], ptr @b, i64 0, i64 %idxprom47.3, i64 3
- %0 = load i64, ptr %arrayidx55.3, align 8
- %inc63.3 = add nsw i32 %k.2207.3, 1
- br i1 undef, label %for.inc65.3, label %for.body42.3
-
-for.inc65.3: ; preds = %for.body42.3
- ret void
-}
-
-attributes #0 = { nounwind uwtable }
-attributes #1 = { nounwind }
diff --git a/llvm/test/Transforms/LoopReroll/basic32iters.ll b/llvm/test/Transforms/LoopReroll/basic32iters.ll
deleted file mode 100644
index edf38cb3eb18..000000000000
--- a/llvm/test/Transforms/LoopReroll/basic32iters.ll
+++ /dev/null
@@ -1,328 +0,0 @@
-; RUN: opt < %s -passes=loop-reroll -verify-scev -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; void goo32(float alpha, float *a, float *b) {
-; for (int i = 0; i < 3200; i += 32) {
-; a[i] += alpha * b[i];
-; a[i + 1] += alpha * b[i + 1];
-; a[i + 2] += alpha * b[i + 2];
-; a[i + 3] += alpha * b[i + 3];
-; a[i + 4] += alpha * b[i + 4];
-; a[i + 5] += alpha * b[i + 5];
-; a[i + 6] += alpha * b[i + 6];
-; a[i + 7] += alpha * b[i + 7];
-; a[i + 8] += alpha * b[i + 8];
-; a[i + 9] += alpha * b[i + 9];
-; a[i + 10] += alpha * b[i + 10];
-; a[i + 11] += alpha * b[i + 11];
-; a[i + 12] += alpha * b[i + 12];
-; a[i + 13] += alpha * b[i + 13];
-; a[i + 14] += alpha * b[i + 14];
-; a[i + 15] += alpha * b[i + 15];
-; a[i + 16] += alpha * b[i + 16];
-; a[i + 17] += alpha * b[i + 17];
-; a[i + 18] += alpha * b[i + 18];
-; a[i + 19] += alpha * b[i + 19];
-; a[i + 20] += alpha * b[i + 20];
-; a[i + 21] += alpha * b[i + 21];
-; a[i + 22] += alpha * b[i + 22];
-; a[i + 23] += alpha * b[i + 23];
-; a[i + 24] += alpha * b[i + 24];
-; a[i + 25] += alpha * b[i + 25];
-; a[i + 26] += alpha * b[i + 26];
-; a[i + 27] += alpha * b[i + 27];
-; a[i + 28] += alpha * b[i + 28];
-; a[i + 29] += alpha * b[i + 29];
-; a[i + 30] += alpha * b[i + 30];
-; a[i + 31] += alpha * b[i + 31];
-; }
-; }
-
-; Function Attrs: norecurse nounwind uwtable
-define void @goo32(float %alpha, ptr %a, ptr readonly %b) #0 {
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
- %0 = load float, ptr %arrayidx, align 4
- %mul = fmul float %0, %alpha
- %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
- %1 = load float, ptr %arrayidx2, align 4
- %add = fadd float %1, %mul
- store float %add, ptr %arrayidx2, align 4
- %2 = or disjoint i64 %indvars.iv, 1
- %arrayidx5 = getelementptr inbounds float, ptr %b, i64 %2
- %3 = load float, ptr %arrayidx5, align 4
- %mul6 = fmul float %3, %alpha
- %arrayidx9 = getelementptr inbounds float, ptr %a, i64 %2
- %4 = load float, ptr %arrayidx9, align 4
- %add10 = fadd float %4, %mul6
- store float %add10, ptr %arrayidx9, align 4
- %5 = or disjoint i64 %indvars.iv, 2
- %arrayidx13 = getelementptr inbounds float, ptr %b, i64 %5
- %6 = load float, ptr %arrayidx13, align 4
- %mul14 = fmul float %6, %alpha
- %arrayidx17 = getelementptr inbounds float, ptr %a, i64 %5
- %7 = load float, ptr %arrayidx17, align 4
- %add18 = fadd float %7, %mul14
- store float %add18, ptr %arrayidx17, align 4
- %8 = or disjoint i64 %indvars.iv, 3
- %arrayidx21 = getelementptr inbounds float, ptr %b, i64 %8
- %9 = load float, ptr %arrayidx21, align 4
- %mul22 = fmul float %9, %alpha
- %arrayidx25 = getelementptr inbounds float, ptr %a, i64 %8
- %10 = load float, ptr %arrayidx25, align 4
- %add26 = fadd float %10, %mul22
- store float %add26, ptr %arrayidx25, align 4
- %11 = or disjoint i64 %indvars.iv, 4
- %arrayidx29 = getelementptr inbounds float, ptr %b, i64 %11
- %12 = load float, ptr %arrayidx29, align 4
- %mul30 = fmul float %12, %alpha
- %arrayidx33 = getelementptr inbounds float, ptr %a, i64 %11
- %13 = load float, ptr %arrayidx33, align 4
- %add34 = fadd float %13, %mul30
- store float %add34, ptr %arrayidx33, align 4
- %14 = or disjoint i64 %indvars.iv, 5
- %arrayidx37 = getelementptr inbounds float, ptr %b, i64 %14
- %15 = load float, ptr %arrayidx37, align 4
- %mul38 = fmul float %15, %alpha
- %arrayidx41 = getelementptr inbounds float, ptr %a, i64 %14
- %16 = load float, ptr %arrayidx41, align 4
- %add42 = fadd float %16, %mul38
- store float %add42, ptr %arrayidx41, align 4
- %17 = or disjoint i64 %indvars.iv, 6
- %arrayidx45 = getelementptr inbounds float, ptr %b, i64 %17
- %18 = load float, ptr %arrayidx45, align 4
- %mul46 = fmul float %18, %alpha
- %arrayidx49 = getelementptr inbounds float, ptr %a, i64 %17
- %19 = load float, ptr %arrayidx49, align 4
- %add50 = fadd float %19, %mul46
- store float %add50, ptr %arrayidx49, align 4
- %20 = or disjoint i64 %indvars.iv, 7
- %arrayidx53 = getelementptr inbounds float, ptr %b, i64 %20
- %21 = load float, ptr %arrayidx53, align 4
- %mul54 = fmul float %21, %alpha
- %arrayidx57 = getelementptr inbounds float, ptr %a, i64 %20
- %22 = load float, ptr %arrayidx57, align 4
- %add58 = fadd float %22, %mul54
- store float %add58, ptr %arrayidx57, align 4
- %23 = or disjoint i64 %indvars.iv, 8
- %arrayidx61 = getelementptr inbounds float, ptr %b, i64 %23
- %24 = load float, ptr %arrayidx61, align 4
- %mul62 = fmul float %24, %alpha
- %arrayidx65 = getelementptr inbounds float, ptr %a, i64 %23
- %25 = load float, ptr %arrayidx65, align 4
- %add66 = fadd float %25, %mul62
- store float %add66, ptr %arrayidx65, align 4
- %26 = or disjoint i64 %indvars.iv, 9
- %arrayidx69 = getelementptr inbounds float, ptr %b, i64 %26
- %27 = load float, ptr %arrayidx69, align 4
- %mul70 = fmul float %27, %alpha
- %arrayidx73 = getelementptr inbounds float, ptr %a, i64 %26
- %28 = load float, ptr %arrayidx73, align 4
- %add74 = fadd float %28, %mul70
- store float %add74, ptr %arrayidx73, align 4
- %29 = or disjoint i64 %indvars.iv, 10
- %arrayidx77 = getelementptr inbounds float, ptr %b, i64 %29
- %30 = load float, ptr %arrayidx77, align 4
- %mul78 = fmul float %30, %alpha
- %arrayidx81 = getelementptr inbounds float, ptr %a, i64 %29
- %31 = load float, ptr %arrayidx81, align 4
- %add82 = fadd float %31, %mul78
- store float %add82, ptr %arrayidx81, align 4
- %32 = or disjoint i64 %indvars.iv, 11
- %arrayidx85 = getelementptr inbounds float, ptr %b, i64 %32
- %33 = load float, ptr %arrayidx85, align 4
- %mul86 = fmul float %33, %alpha
- %arrayidx89 = getelementptr inbounds float, ptr %a, i64 %32
- %34 = load float, ptr %arrayidx89, align 4
- %add90 = fadd float %34, %mul86
- store float %add90, ptr %arrayidx89, align 4
- %35 = or disjoint i64 %indvars.iv, 12
- %arrayidx93 = getelementptr inbounds float, ptr %b, i64 %35
- %36 = load float, ptr %arrayidx93, align 4
- %mul94 = fmul float %36, %alpha
- %arrayidx97 = getelementptr inbounds float, ptr %a, i64 %35
- %37 = load float, ptr %arrayidx97, align 4
- %add98 = fadd float %37, %mul94
- store float %add98, ptr %arrayidx97, align 4
- %38 = or disjoint i64 %indvars.iv, 13
- %arrayidx101 = getelementptr inbounds float, ptr %b, i64 %38
- %39 = load float, ptr %arrayidx101, align 4
- %mul102 = fmul float %39, %alpha
- %arrayidx105 = getelementptr inbounds float, ptr %a, i64 %38
- %40 = load float, ptr %arrayidx105, align 4
- %add106 = fadd float %40, %mul102
- store float %add106, ptr %arrayidx105, align 4
- %41 = or disjoint i64 %indvars.iv, 14
- %arrayidx109 = getelementptr inbounds float, ptr %b, i64 %41
- %42 = load float, ptr %arrayidx109, align 4
- %mul110 = fmul float %42, %alpha
- %arrayidx113 = getelementptr inbounds float, ptr %a, i64 %41
- %43 = load float, ptr %arrayidx113, align 4
- %add114 = fadd float %43, %mul110
- store float %add114, ptr %arrayidx113, align 4
- %44 = or disjoint i64 %indvars.iv, 15
- %arrayidx117 = getelementptr inbounds float, ptr %b, i64 %44
- %45 = load float, ptr %arrayidx117, align 4
- %mul118 = fmul float %45, %alpha
- %arrayidx121 = getelementptr inbounds float, ptr %a, i64 %44
- %46 = load float, ptr %arrayidx121, align 4
- %add122 = fadd float %46, %mul118
- store float %add122, ptr %arrayidx121, align 4
- %47 = or disjoint i64 %indvars.iv, 16
- %arrayidx125 = getelementptr inbounds float, ptr %b, i64 %47
- %48 = load float, ptr %arrayidx125, align 4
- %mul126 = fmul float %48, %alpha
- %arrayidx129 = getelementptr inbounds float, ptr %a, i64 %47
- %49 = load float, ptr %arrayidx129, align 4
- %add130 = fadd float %49, %mul126
- store float %add130, ptr %arrayidx129, align 4
- %50 = or disjoint i64 %indvars.iv, 17
- %arrayidx133 = getelementptr inbounds float, ptr %b, i64 %50
- %51 = load float, ptr %arrayidx133, align 4
- %mul134 = fmul float %51, %alpha
- %arrayidx137 = getelementptr inbounds float, ptr %a, i64 %50
- %52 = load float, ptr %arrayidx137, align 4
- %add138 = fadd float %52, %mul134
- store float %add138, ptr %arrayidx137, align 4
- %53 = or disjoint i64 %indvars.iv, 18
- %arrayidx141 = getelementptr inbounds float, ptr %b, i64 %53
- %54 = load float, ptr %arrayidx141, align 4
- %mul142 = fmul float %54, %alpha
- %arrayidx145 = getelementptr inbounds float, ptr %a, i64 %53
- %55 = load float, ptr %arrayidx145, align 4
- %add146 = fadd float %55, %mul142
- store float %add146, ptr %arrayidx145, align 4
- %56 = or disjoint i64 %indvars.iv, 19
- %arrayidx149 = getelementptr inbounds float, ptr %b, i64 %56
- %57 = load float, ptr %arrayidx149, align 4
- %mul150 = fmul float %57, %alpha
- %arrayidx153 = getelementptr inbounds float, ptr %a, i64 %56
- %58 = load float, ptr %arrayidx153, align 4
- %add154 = fadd float %58, %mul150
- store float %add154, ptr %arrayidx153, align 4
- %59 = or disjoint i64 %indvars.iv, 20
- %arrayidx157 = getelementptr inbounds float, ptr %b, i64 %59
- %60 = load float, ptr %arrayidx157, align 4
- %mul158 = fmul float %60, %alpha
- %arrayidx161 = getelementptr inbounds float, ptr %a, i64 %59
- %61 = load float, ptr %arrayidx161, align 4
- %add162 = fadd float %61, %mul158
- store float %add162, ptr %arrayidx161, align 4
- %62 = or disjoint i64 %indvars.iv, 21
- %arrayidx165 = getelementptr inbounds float, ptr %b, i64 %62
- %63 = load float, ptr %arrayidx165, align 4
- %mul166 = fmul float %63, %alpha
- %arrayidx169 = getelementptr inbounds float, ptr %a, i64 %62
- %64 = load float, ptr %arrayidx169, align 4
- %add170 = fadd float %64, %mul166
- store float %add170, ptr %arrayidx169, align 4
- %65 = or disjoint i64 %indvars.iv, 22
- %arrayidx173 = getelementptr inbounds float, ptr %b, i64 %65
- %66 = load float, ptr %arrayidx173, align 4
- %mul174 = fmul float %66, %alpha
- %arrayidx177 = getelementptr inbounds float, ptr %a, i64 %65
- %67 = load float, ptr %arrayidx177, align 4
- %add178 = fadd float %67, %mul174
- store float %add178, ptr %arrayidx177, align 4
- %68 = or disjoint i64 %indvars.iv, 23
- %arrayidx181 = getelementptr inbounds float, ptr %b, i64 %68
- %69 = load float, ptr %arrayidx181, align 4
- %mul182 = fmul float %69, %alpha
- %arrayidx185 = getelementptr inbounds float, ptr %a, i64 %68
- %70 = load float, ptr %arrayidx185, align 4
- %add186 = fadd float %70, %mul182
- store float %add186, ptr %arrayidx185, align 4
- %71 = or disjoint i64 %indvars.iv, 24
- %arrayidx189 = getelementptr inbounds float, ptr %b, i64 %71
- %72 = load float, ptr %arrayidx189, align 4
- %mul190 = fmul float %72, %alpha
- %arrayidx193 = getelementptr inbounds float, ptr %a, i64 %71
- %73 = load float, ptr %arrayidx193, align 4
- %add194 = fadd float %73, %mul190
- store float %add194, ptr %arrayidx193, align 4
- %74 = or disjoint i64 %indvars.iv, 25
- %arrayidx197 = getelementptr inbounds float, ptr %b, i64 %74
- %75 = load float, ptr %arrayidx197, align 4
- %mul198 = fmul float %75, %alpha
- %arrayidx201 = getelementptr inbounds float, ptr %a, i64 %74
- %76 = load float, ptr %arrayidx201, align 4
- %add202 = fadd float %76, %mul198
- store float %add202, ptr %arrayidx201, align 4
- %77 = or disjoint i64 %indvars.iv, 26
- %arrayidx205 = getelementptr inbounds float, ptr %b, i64 %77
- %78 = load float, ptr %arrayidx205, align 4
- %mul206 = fmul float %78, %alpha
- %arrayidx209 = getelementptr inbounds float, ptr %a, i64 %77
- %79 = load float, ptr %arrayidx209, align 4
- %add210 = fadd float %79, %mul206
- store float %add210, ptr %arrayidx209, align 4
- %80 = or disjoint i64 %indvars.iv, 27
- %arrayidx213 = getelementptr inbounds float, ptr %b, i64 %80
- %81 = load float, ptr %arrayidx213, align 4
- %mul214 = fmul float %81, %alpha
- %arrayidx217 = getelementptr inbounds float, ptr %a, i64 %80
- %82 = load float, ptr %arrayidx217, align 4
- %add218 = fadd float %82, %mul214
- store float %add218, ptr %arrayidx217, align 4
- %83 = or disjoint i64 %indvars.iv, 28
- %arrayidx221 = getelementptr inbounds float, ptr %b, i64 %83
- %84 = load float, ptr %arrayidx221, align 4
- %mul222 = fmul float %84, %alpha
- %arrayidx225 = getelementptr inbounds float, ptr %a, i64 %83
- %85 = load float, ptr %arrayidx225, align 4
- %add226 = fadd float %85, %mul222
- store float %add226, ptr %arrayidx225, align 4
- %86 = or disjoint i64 %indvars.iv, 29
- %arrayidx229 = getelementptr inbounds float, ptr %b, i64 %86
- %87 = load float, ptr %arrayidx229, align 4
- %mul230 = fmul float %87, %alpha
- %arrayidx233 = getelementptr inbounds float, ptr %a, i64 %86
- %88 = load float, ptr %arrayidx233, align 4
- %add234 = fadd float %88, %mul230
- store float %add234, ptr %arrayidx233, align 4
- %89 = or disjoint i64 %indvars.iv, 30
- %arrayidx237 = getelementptr inbounds float, ptr %b, i64 %89
- %90 = load float, ptr %arrayidx237, align 4
- %mul238 = fmul float %90, %alpha
- %arrayidx241 = getelementptr inbounds float, ptr %a, i64 %89
- %91 = load float, ptr %arrayidx241, align 4
- %add242 = fadd float %91, %mul238
- store float %add242, ptr %arrayidx241, align 4
- %92 = or disjoint i64 %indvars.iv, 31
- %arrayidx245 = getelementptr inbounds float, ptr %b, i64 %92
- %93 = load float, ptr %arrayidx245, align 4
- %mul246 = fmul float %93, %alpha
- %arrayidx249 = getelementptr inbounds float, ptr %a, i64 %92
- %94 = load float, ptr %arrayidx249, align 4
- %add250 = fadd float %94, %mul246
- store float %add250, ptr %arrayidx249, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
- %cmp = icmp slt i64 %indvars.iv.next, 3200
- br i1 %cmp, label %for.body, label %for.end
-
-; CHECK-LABEL: @goo32
-
-; CHECK: for.body:
-; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
-; CHECK: %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvar
-; CHECK: %0 = load float, ptr %arrayidx, align 4
-; CHECK: %mul = fmul float %0, %alpha
-; CHECK: %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvar
-; CHECK: %1 = load float, ptr %arrayidx2, align 4
-; CHECK: %add = fadd float %1, %mul
-; CHECK: store float %add, ptr %arrayidx2, align 4
-; CHECK: %indvar.next = add i64 %indvar, 1
-; CHECK: %exitcond = icmp eq i64 %indvar, 3199
-; CHECK: br i1 %exitcond, label %for.end, label %for.body
-; CHECK: ret
-
-for.end: ; preds = %for.body
- ret void
-}
-
-attributes #0 = { nounwind uwtable }
diff --git a/llvm/test/Transforms/LoopReroll/complex_reroll.ll b/llvm/test/Transforms/LoopReroll/complex_reroll.ll
deleted file mode 100644
index 27139eeecf8c..000000000000
--- a/llvm/test/Transforms/LoopReroll/complex_reroll.ll
+++ /dev/null
@@ -1,237 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=loop-reroll %s | FileCheck %s
-declare i32 @goo(i32, i32)
-
-@buf = external global ptr
-@aaa = global [16 x i8] c"\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10", align 1
-
-define i32 @test1(i32 %len) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM44_020:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[WHILE_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr @aaa, i64 [[INDVAR]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SCEVGEP]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP1]] to i64
-; CHECK-NEXT: [[ADD]] = add i64 [[CONV]], [[SUM44_020]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
-; CHECK: while.end:
-; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ]
-; CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD9_LCSSA]] to i32
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @goo(i32 0, i32 [[CONV11]])
-; CHECK-NEXT: unreachable
-;
-entry:
- br label %while.body
-
-while.body:
-
- %dec22 = phi i32 [ 4, %entry ], [ %dec, %while.body ]
- %buf.021 = phi ptr [ @aaa, %entry ], [ %add.ptr, %while.body ]
- %sum44.020 = phi i64 [ 0, %entry ], [ %add9, %while.body ]
- %0 = load i8, ptr %buf.021, align 1
- %conv = zext i8 %0 to i64
- %add = add i64 %conv, %sum44.020
- %arrayidx1 = getelementptr inbounds i8, ptr %buf.021, i64 1
- %1 = load i8, ptr %arrayidx1, align 1
- %conv2 = zext i8 %1 to i64
- %add3 = add i64 %add, %conv2
- %arrayidx4 = getelementptr inbounds i8, ptr %buf.021, i64 2
- %2 = load i8, ptr %arrayidx4, align 1
- %conv5 = zext i8 %2 to i64
- %add6 = add i64 %add3, %conv5
- %arrayidx7 = getelementptr inbounds i8, ptr %buf.021, i64 3
- %3 = load i8, ptr %arrayidx7, align 1
- %conv8 = zext i8 %3 to i64
- %add9 = add i64 %add6, %conv8
- %add.ptr = getelementptr inbounds i8, ptr %buf.021, i64 4
- %dec = add nsw i32 %dec22, -1
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body
- %conv11 = trunc i64 %add9 to i32
- %call = tail call i32 @goo(i32 0, i32 %conv11)
- unreachable
-}
-
-define i32 @test2(i32 %N, ptr nocapture readonly %a, i32 %S) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP_9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.for.cond.cleanup_crit_edge:
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[S_ADDR_0_LCSSA:%.*]] = phi i32 [ [[ADD2_LCSSA]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]] ], [ [[S:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i32 [[S_ADDR_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT: [[S_ADDR_011:%.*]] = phi i32 [ [[S]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[INDVAR]], 2
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_ADDR_011]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP4]], [[TMP3]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label [[FOR_BODY]]
-;
-entry:
- %cmp.9 = icmp sgt i32 %N, 0
- br i1 %cmp.9, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:
- br label %for.body
-
-for.cond.for.cond.cleanup_crit_edge:
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- %S.addr.0.lcssa = phi i32 [ %add2, %for.cond.for.cond.cleanup_crit_edge ], [ %S, %entry ]
- ret i32 %S.addr.0.lcssa
-
-for.body:
-
- %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ]
- %S.addr.011 = phi i32 [ %S, %for.body.lr.ph ], [ %add2, %for.body ]
- %a.addr.010 = phi ptr [ %a, %for.body.lr.ph ], [ %incdec.ptr1, %for.body ]
- %incdec.ptr = getelementptr inbounds i32, ptr %a.addr.010, i64 1
- %0 = load i32, ptr %a.addr.010, align 4
- %add = add nsw i32 %0, %S.addr.011
- %incdec.ptr1 = getelementptr inbounds i32, ptr %a.addr.010, i64 2
- %1 = load i32, ptr %incdec.ptr, align 4
- %add2 = add nsw i32 %add, %1
- %add3 = add nsw i32 %i.012, 2
- %cmp = icmp slt i32 %add3, %N
- br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
-}
-
-define i32 @test3(ptr nocapture readonly %buf, i32 %len) #0 {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[LEN:%.*]], 1
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -2
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[S_012:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[INDVAR]], -4
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_012]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP4]], [[TMP3]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
-;
-entry:
- %cmp10 = icmp sgt i32 %len, 1
- br i1 %cmp10, label %while.body.preheader, label %while.end
-
-while.body.preheader: ; preds = %entry
- br label %while.body
-
-while.body: ; preds = %while.body.preheader, %while.body
-
- %i.013 = phi i32 [ %sub, %while.body ], [ %len, %while.body.preheader ]
- %S.012 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ]
- %buf.addr.011 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ]
- %0 = load i32, ptr %buf.addr.011, align 4
- %add = add nsw i32 %0, %S.012
- %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.011, i64 -1
- %1 = load i32, ptr %arrayidx1, align 4
- %add2 = add nsw i32 %add, %1
- %add.ptr = getelementptr inbounds i32, ptr %buf.addr.011, i64 -2
- %sub = add nsw i32 %i.013, -2
- %cmp = icmp sgt i32 %sub, 1
- br i1 %cmp, label %while.body, label %while.end.loopexit
-
-while.end.loopexit: ; preds = %while.body
- br label %while.end
-
-while.end: ; preds = %while.end.loopexit, %entry
- %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2, %while.end.loopexit ]
- ret i32 %S.0.lcssa
-}
-
-define i32 @test4(i32 %len) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM44_020:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[WHILE_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVAR]] to i32
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr @aaa, i64 [[INDVAR]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SCEVGEP]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP1]] to i64
-; CHECK-NEXT: [[ADD]] = add i64 [[CONV]], [[SUM44_020]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP0]], 23
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
-; CHECK: while.end:
-; CHECK-NEXT: [[ADD9_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ]
-; CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD9_LCSSA]] to i32
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @goo(i32 0, i32 [[CONV11]])
-; CHECK-NEXT: unreachable
-;
-entry:
- br label %while.body
-
-while.body:
- %a = phi i32 [ 4, %entry ], [ %a.next, %while.body ]
- %b = phi i32 [ 6, %entry ], [ %b.next, %while.body ]
- %buf.021 = phi ptr [ @aaa, %entry ], [ %add.ptr, %while.body ]
- %sum44.020 = phi i64 [ 0, %entry ], [ %add9, %while.body ]
- %0 = load i8, ptr %buf.021, align 1
- %conv = zext i8 %0 to i64
- %add = add i64 %conv, %sum44.020
- %arrayidx1 = getelementptr inbounds i8, ptr %buf.021, i64 1
- %1 = load i8, ptr %arrayidx1, align 1
- %conv2 = zext i8 %1 to i64
- %add3 = add i64 %add, %conv2
- %arrayidx4 = getelementptr inbounds i8, ptr %buf.021, i64 2
- %2 = load i8, ptr %arrayidx4, align 1
- %conv5 = zext i8 %2 to i64
- %add6 = add i64 %add3, %conv5
- %arrayidx7 = getelementptr inbounds i8, ptr %buf.021, i64 3
- %3 = load i8, ptr %arrayidx7, align 1
- %conv8 = zext i8 %3 to i64
- %add9 = add i64 %add6, %conv8
- %add.ptr = getelementptr inbounds i8, ptr %buf.021, i64 4
- %a.next = add nsw i32 %a, -1
- %b.next = add nsw i32 %b, -1
- %cond = add nsw i32 %a, %b
- %tobool = icmp eq i32 %cond, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body
- %conv11 = trunc i64 %add9 to i32
- %call = tail call i32 @goo(i32 0, i32 %conv11)
- unreachable
-}
-
diff --git a/llvm/test/Transforms/LoopReroll/external_use.ll b/llvm/test/Transforms/LoopReroll/external_use.ll
deleted file mode 100644
index 2124f3b15cd2..000000000000
--- a/llvm/test/Transforms/LoopReroll/external_use.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s
-
-; Check whether rerolling is rejected if values of the base and root
-; instruction are used outside the loop block.
-
-; Only the base/root instructions except a loop increment instruction
-define void @test1() {
-entry:
- br label %loop1
-
-loop1:
-;CHECK-LABEL: loop1:
-;CHECK-NEXT: %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop1 ]
-;CHECK-NEXT: %indvar.1 = add nsw i64 %indvar, 1
-
- %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop1 ]
- %indvar.1 = add nsw i64 %indvar, 1
- %indvar.next = add nsw i64 %indvar, 2
- %cmp = icmp slt i64 %indvar.next, 200
- br i1 %cmp, label %loop1, label %exit
-
-exit:
- %var1 = phi i64 [ %indvar.1, %loop1 ]
- %var2 = phi i64 [ %indvar, %loop1 ]
- ret void
-}
-
-; Both the base/root instructions and reduction instructions
-define void @test2() {
-entry:
- br label %loop2
-
-loop2:
-;CHECK-LABEL: loop2:
-;CHECK-NEXT: %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop2 ]
-;CHECK-NEXT: %redvar = phi i32 [ 0, %entry ], [ %add.2, %loop2 ]
-;CHECK-NEXT: %indvar.1 = add nuw nsw i32 %indvar, 1
-;CHECK-NEXT: %indvar.2 = add nuw nsw i32 %indvar, 2
-
- %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop2 ]
- %redvar = phi i32 [ 0, %entry ], [ %add.2, %loop2 ]
- %indvar.1 = add nuw nsw i32 %indvar, 1
- %indvar.2 = add nuw nsw i32 %indvar, 2
- %mul.0 = mul nsw i32 %indvar, %indvar
- %mul.1 = mul nsw i32 %indvar.1, %indvar.1
- %mul.2 = mul nsw i32 %indvar.2, %indvar.2
- %add.0 = add nsw i32 %redvar, %mul.0
- %add.1 = add nsw i32 %add.0, %mul.1
- %add.2 = add nsw i32 %add.1, %mul.2
- %indvar.next = add nuw nsw i32 %indvar, 3
- %cmp = icmp slt i32 %indvar.next, 300
- br i1 %cmp, label %loop2, label %exit
-
-exit:
- %a = phi i32 [ %indvar, %loop2 ]
- %b = phi i32 [ %indvar.1, %loop2 ]
- %c = phi i32 [ %indvar.2, %loop2 ]
- %x = phi i32 [ %add.2, %loop2 ]
- ret void
-}
diff --git a/llvm/test/Transforms/LoopReroll/extra_instr.ll b/llvm/test/Transforms/LoopReroll/extra_instr.ll
deleted file mode 100644
index 3114463a3e04..000000000000
--- a/llvm/test/Transforms/LoopReroll/extra_instr.ll
+++ /dev/null
@@ -1,361 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -S -passes=loop-reroll %s | FileCheck %s
-target triple = "aarch64--linux-gnu"
-
-define void @rerollable1(ptr nocapture %a) {
-; CHECK-LABEL: define void @rerollable1
-; CHECK-SAME: (ptr nocapture [[A:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[IV]], 2
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 160
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[IV]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 80
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[SCEVGEP1]], align 4
-; CHECK-NEXT: store i32 [[VALUE0]], ptr [[SCEVGEP]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND2:%.*]] = icmp eq i64 [[IV]], 9
-; CHECK-NEXT: br i1 [[EXITCOND2]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %loop
-
-loop:
-
-
- ; base instruction
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-
- ; NO unrerollable instructions
-
- ; extra simple arithmetic operations, used by root instructions
- %plus20 = add nuw nsw i64 %iv, 20
- %plus10 = add nuw nsw i64 %iv, 10
-
- ; root instruction 0
- %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0
- %value0 = load i32, ptr %ldptr0, align 4
- %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0
- store i32 %value0, ptr %stptr0, align 4
-
- ; root instruction 1
- %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1
- %value1 = load i32, ptr %ldptr1, align 4
- %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1
- store i32 %value1, ptr %stptr1, align 4
-
- ; loop-increment
- %iv.next = add nuw nsw i64 %iv, 1
-
- ; latch
- %exitcond = icmp eq i64 %iv.next, 5
- br i1 %exitcond, label %exit, label %loop
-
-exit:
- ret void
-}
-
-define void @unrerollable1(ptr nocapture %a) {
-; CHECK-LABEL: define void @unrerollable1
-; CHECK-SAME: (ptr nocapture [[A:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[STPTRX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[IV]], i64 0
-; CHECK-NEXT: store i32 999, ptr [[STPTRX]], align 4
-; CHECK-NEXT: [[PLUS20:%.*]] = add nuw nsw i64 [[IV]], 20
-; CHECK-NEXT: [[PLUS10:%.*]] = add nuw nsw i64 [[IV]], 10
-; CHECK-NEXT: [[LDPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 0
-; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[LDPTR0]], align 4
-; CHECK-NEXT: [[STPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 0
-; CHECK-NEXT: store i32 [[VALUE0]], ptr [[STPTR0]], align 4
-; CHECK-NEXT: [[LDPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 1
-; CHECK-NEXT: [[VALUE1:%.*]] = load i32, ptr [[LDPTR1]], align 4
-; CHECK-NEXT: [[STPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 1
-; CHECK-NEXT: store i32 [[VALUE1]], ptr [[STPTR1]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 5
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %loop
-
-loop:
-
-
- ; base instruction
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-
- ; unrerollable instructions using %iv
- %stptrx = getelementptr inbounds [2 x i32], ptr %a, i64 %iv, i64 0
- store i32 999, ptr %stptrx, align 4
-
- ; extra simple arithmetic operations, used by root instructions
- %plus20 = add nuw nsw i64 %iv, 20
- %plus10 = add nuw nsw i64 %iv, 10
-
- ; root instruction 0
- %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0
- %value0 = load i32, ptr %ldptr0, align 4
- %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0
- store i32 %value0, ptr %stptr0, align 4
-
- ; root instruction 1
- %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1
- %value1 = load i32, ptr %ldptr1, align 4
- %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1
- store i32 %value1, ptr %stptr1, align 4
-
- ; loop-increment
- %iv.next = add nuw nsw i64 %iv, 1
-
- ; latch
- %exitcond = icmp eq i64 %iv.next, 5
- br i1 %exitcond, label %exit, label %loop
-
-exit:
- ret void
-}
-
-define void @unrerollable2(ptr nocapture %a) {
-; CHECK-LABEL: define void @unrerollable2
-; CHECK-SAME: (ptr nocapture [[A:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[STPTRX:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[IV_NEXT]], i64 0
-; CHECK-NEXT: store i32 999, ptr [[STPTRX]], align 4
-; CHECK-NEXT: [[PLUS20:%.*]] = add nuw nsw i64 [[IV]], 20
-; CHECK-NEXT: [[PLUS10:%.*]] = add nuw nsw i64 [[IV]], 10
-; CHECK-NEXT: [[LDPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 0
-; CHECK-NEXT: [[VALUE0:%.*]] = load i32, ptr [[LDPTR0]], align 4
-; CHECK-NEXT: [[STPTR0:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 0
-; CHECK-NEXT: store i32 [[VALUE0]], ptr [[STPTR0]], align 4
-; CHECK-NEXT: [[LDPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS20]], i64 1
-; CHECK-NEXT: [[VALUE1:%.*]] = load i32, ptr [[LDPTR1]], align 4
-; CHECK-NEXT: [[STPTR1:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i64 [[PLUS10]], i64 1
-; CHECK-NEXT: store i32 [[VALUE1]], ptr [[STPTR1]], align 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 5
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %loop
-
-loop:
-
-
- ; base instruction
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-
- ; loop-increment
- %iv.next = add nuw nsw i64 %iv, 1
-
- ; unrerollable instructions using %iv.next
- %stptrx = getelementptr inbounds [2 x i32], ptr %a, i64 %iv.next, i64 0
- store i32 999, ptr %stptrx, align 4
-
- ; extra simple arithmetic operations, used by root instructions
- %plus20 = add nuw nsw i64 %iv, 20
- %plus10 = add nuw nsw i64 %iv, 10
-
- ; root instruction 0
- %ldptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 0
- %value0 = load i32, ptr %ldptr0, align 4
- %stptr0 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 0
- store i32 %value0, ptr %stptr0, align 4
-
- ; root instruction 1
- %ldptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus20, i64 1
- %value1 = load i32, ptr %ldptr1, align 4
- %stptr1 = getelementptr inbounds [2 x i32], ptr %a, i64 %plus10, i64 1
- store i32 %value1, ptr %stptr1, align 4
-
- ; latch
- %exitcond = icmp eq i64 %iv.next, 5
- br i1 %exitcond, label %exit, label %loop
-
-exit:
- ret void
-}
-
-define dso_local void @rerollable2() {
-; CHECK-LABEL: define dso_local void @rerollable2() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV]], 24
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV]], 20
-; CHECK-NEXT: [[IV_SCALED_DIV5:%.*]] = udiv i32 [[TMP1]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_DIV5]])
-; CHECK-NEXT: [[IV_SCALED_ADD4_DIV5:%.*]] = udiv i32 [[TMP0]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD4_DIV5]])
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV]], 8
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %loop
-
-loop:
-
-
- ; induction variable
- %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-
- ; scale instruction
- %iv.mul3 = mul nuw nsw i32 %iv, 3
-
- ; extra simple arithmetic operations, used by root instructions
- %iv.scaled = add nuw nsw i32 %iv.mul3, 20
-
- ; NO unrerollable instructions
-
- ; root set 1
-
- ; base instruction
- %iv.scaled.div5 = udiv i32 %iv.scaled, 5
- tail call void @bar(i32 %iv.scaled.div5)
- ; root instruction 0
- %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1
- %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5
- tail call void @bar(i32 %iv.scaled.add1.div5)
- ; root instruction 2
- %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2
- %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5
- tail call void @bar(i32 %iv.scaled.add2.div5)
-
- ; root set 2
-
- ; base instruction
- %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4
- %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5
- tail call void @bar(i32 %iv.scaled.add4.div5)
- ; root instruction 0
- %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5
- %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5
- tail call void @bar(i32 %iv.scaled.add5.div5)
- ; root instruction 2
- %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6
- %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5
- tail call void @bar(i32 %iv.scaled.add6.div5)
-
- ; loop-increment
- %iv.next = add nuw nsw i32 %iv, 1
-
- ; latch
- %cmp = icmp ult i32 %iv.next, 3
- br i1 %cmp, label %loop, label %exit
-
-exit:
- ret void
-}
-
-define dso_local void @unrerollable3() {
-; CHECK-LABEL: define dso_local void @unrerollable3() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_MUL3:%.*]] = mul nuw nsw i32 [[IV]], 3
-; CHECK-NEXT: [[IV_SCALED:%.*]] = add nuw nsw i32 [[IV_MUL3]], 20
-; CHECK-NEXT: [[IV_MUL7:%.*]] = mul nuw nsw i32 [[IV]], 7
-; CHECK-NEXT: tail call void @bar(i32 [[IV_MUL7]])
-; CHECK-NEXT: [[IV_SCALED_DIV5:%.*]] = udiv i32 [[IV_SCALED]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_DIV5]])
-; CHECK-NEXT: [[IV_SCALED_ADD1:%.*]] = add nuw nsw i32 [[IV_SCALED]], 1
-; CHECK-NEXT: [[IV_SCALED_ADD1_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD1]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD1_DIV5]])
-; CHECK-NEXT: [[IV_SCALED_ADD2:%.*]] = add nuw nsw i32 [[IV_SCALED]], 2
-; CHECK-NEXT: [[IV_SCALED_ADD2_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD2]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD2_DIV5]])
-; CHECK-NEXT: [[IV_SCALED_ADD4:%.*]] = add nuw nsw i32 [[IV_SCALED]], 4
-; CHECK-NEXT: [[IV_SCALED_ADD4_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD4]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD4_DIV5]])
-; CHECK-NEXT: [[IV_SCALED_ADD5:%.*]] = add nuw nsw i32 [[IV_SCALED]], 5
-; CHECK-NEXT: [[IV_SCALED_ADD5_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD5]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD5_DIV5]])
-; CHECK-NEXT: [[IV_SCALED_ADD6:%.*]] = add nuw nsw i32 [[IV_SCALED]], 6
-; CHECK-NEXT: [[IV_SCALED_ADD6_DIV5:%.*]] = udiv i32 [[IV_SCALED_ADD6]], 5
-; CHECK-NEXT: tail call void @bar(i32 [[IV_SCALED_ADD6_DIV5]])
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], 3
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-entry:
- br label %loop
-
-loop:
-
-
- ; induction variable
- %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-
- ; scale instruction
- %iv.mul3 = mul nuw nsw i32 %iv, 3
-
- ; extra simple arithmetic operations, used by root instructions
- %iv.scaled = add nuw nsw i32 %iv.mul3, 20
-
- ; unrerollable instructions using %iv
- %iv.mul7 = mul nuw nsw i32 %iv, 7
- tail call void @bar(i32 %iv.mul7)
-
- ; root set 1
-
- ; base instruction
- %iv.scaled.div5 = udiv i32 %iv.scaled, 5
- tail call void @bar(i32 %iv.scaled.div5)
- ; root instruction 0
- %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1
- %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5
- tail call void @bar(i32 %iv.scaled.add1.div5)
- ; root instruction 2
- %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2
- %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5
- tail call void @bar(i32 %iv.scaled.add2.div5)
-
- ; root set 2
-
- ; base instruction
- %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4
- %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5
- tail call void @bar(i32 %iv.scaled.add4.div5)
- ; root instruction 0
- %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5
- %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5
- tail call void @bar(i32 %iv.scaled.add5.div5)
- ; root instruction 2
- %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6
- %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5
- tail call void @bar(i32 %iv.scaled.add6.div5)
-
- ; loop-increment
- %iv.next = add nuw nsw i32 %iv, 1
-
- ; latch
- %cmp = icmp ult i32 %iv.next, 3
- br i1 %cmp, label %loop, label %exit
-
-exit:
- ret void
-}
-
-declare dso_local void @bar(i32)
diff --git a/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll b/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll
deleted file mode 100644
index 3fcd43f1866a..000000000000
--- a/llvm/test/Transforms/LoopReroll/indvar_with_ext.ll
+++ /dev/null
@@ -1,184 +0,0 @@
-; RUN: opt -S -passes=loop-reroll %s | FileCheck %s
-target triple = "aarch64--linux-gnu"
-
-define void @test(i32 %n, ptr %arrayidx200, ptr %arrayidx164, ptr %arrayidx172) {
-entry:
- %rem.i = srem i32 %n, 4
- %t22 = load float, ptr %arrayidx172, align 4
- %cmp.9 = icmp eq i32 %n, 0
- %t7 = sext i32 %n to i64
- br i1 %cmp.9, label %while.end, label %while.body.preheader
-
-while.body.preheader:
- br label %while.body
-
-while.body:
-;CHECK-LABEL: while.body:
-;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %while.body ], [ 0, %while.body.preheader ]
-;CHECK-NEXT: %arrayidx62.i = getelementptr inbounds float, ptr %arrayidx200, i64 %indvar
-;CHECK-NEXT: %t1 = load float, ptr %arrayidx62.i, align 4
-;CHECK-NEXT: %arrayidx64.i = getelementptr inbounds float, ptr %arrayidx164, i64 %indvar
-;CHECK-NEXT: %t2 = load float, ptr %arrayidx64.i, align 4
-;CHECK-NEXT: %mul65.i = fmul fast float %t2, %t22
-;CHECK-NEXT: %add66.i = fadd fast float %mul65.i, %t1
-;CHECK-NEXT: store float %add66.i, ptr %arrayidx62.i, align 4
-;CHECK-NEXT: %indvar.next = add i64 %indvar, 1
-;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}}
-;CHECK-NEXT: br i1 %exitcond, label %while.end.loopexit, label %while.body
-
- %indvars.iv.i423 = phi i64 [ %indvars.iv.next.i424, %while.body ], [ 0, %while.body.preheader ]
- %i.22.i = phi i32 [ %add103.i, %while.body ], [ %rem.i, %while.body.preheader ]
- %arrayidx62.i = getelementptr inbounds float, ptr %arrayidx200, i64 %indvars.iv.i423
- %t1 = load float, ptr %arrayidx62.i, align 4
- %arrayidx64.i = getelementptr inbounds float, ptr %arrayidx164, i64 %indvars.iv.i423
- %t2 = load float, ptr %arrayidx64.i, align 4
- %mul65.i = fmul fast float %t2, %t22
- %add66.i = fadd fast float %mul65.i, %t1
- store float %add66.i, ptr %arrayidx62.i, align 4
- %t3 = add nsw i64 %indvars.iv.i423, 1
- %arrayidx71.i = getelementptr inbounds float, ptr %arrayidx200, i64 %t3
- %t4 = load float, ptr %arrayidx71.i, align 4
- %arrayidx74.i = getelementptr inbounds float, ptr %arrayidx164, i64 %t3
- %t5 = load float, ptr %arrayidx74.i, align 4
- %mul75.i = fmul fast float %t5, %t22
- %add76.i = fadd fast float %mul75.i, %t4
- store float %add76.i, ptr %arrayidx71.i, align 4
- %add103.i = add nsw i32 %i.22.i, 2
- %t6 = sext i32 %add103.i to i64
- %cmp58.i = icmp slt i64 %t6, %t7
- %indvars.iv.next.i424 = add i64 %indvars.iv.i423, 2
- br i1 %cmp58.i, label %while.body, label %while.end.loopexit
-
-while.end.loopexit:
- br label %while.end
-
-while.end:
- ret void
-}
-
-; Function Attrs: noinline norecurse nounwind
-define i32 @test2(i64 %n, ptr nocapture %x, ptr nocapture readonly %y) {
-entry:
- %cmp18 = icmp sgt i64 %n, 0
- br i1 %cmp18, label %for.body.preheader, label %for.end
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.body: ; preds = %for.body.preheader, %for.body
-
-;CHECK-LABEL: for.body:
-;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ]
-;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvar
-;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, ptr %arrayidx, align 4
-;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvar
-;CHECK-NEXT: store i32 [[T1]], ptr %arrayidx3, align 4
-;CHECK-NEXT: %indvar.next = add i64 %indvar, 1
-;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}}
-;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body
-
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvars.iv
- store i32 %0, ptr %arrayidx3, align 4
- %1 = or disjoint i64 %indvars.iv, 1
- %arrayidx5 = getelementptr inbounds i32, ptr %y, i64 %1
- %2 = load i32, ptr %arrayidx5, align 4
- %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %2, ptr %arrayidx8, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
- %cmp = icmp slt i64 %indvars.iv.next, %n
- br i1 %cmp, label %for.body, label %for.end.loopexit
-
-for.end.loopexit: ; preds = %for.body
- br label %for.end
-
-for.end: ; preds = %for.end.loopexit, %entry
- ret i32 0
-}
-
-; Function Attrs: noinline norecurse nounwind
-define i32 @test3(i32 %n, ptr nocapture %x, ptr nocapture readonly %y) {
-entry:
- %cmp21 = icmp sgt i32 %n, 0
- br i1 %cmp21, label %for.body.preheader, label %for.end
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.body: ; preds = %for.body.preheader, %for.body
-
-;CHECK-LABEL: for.body:
-;CHECK: %add12 = add i8 %i.022, 2
-;CHECK-NEXT: %conv = sext i8 %add12 to i32
-;CHECK-NEXT: %cmp = icmp slt i32 %conv, %n
-;CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end.loopexit
-
- %conv23 = phi i32 [ %conv, %for.body ], [ 0, %for.body.preheader ]
- %i.022 = phi i8 [ %add12, %for.body ], [ 0, %for.body.preheader ]
- %idxprom = sext i8 %i.022 to i64
- %arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %idxprom
- store i32 %0, ptr %arrayidx3, align 4
- %add = or disjoint i32 %conv23, 1
- %idxprom5 = sext i32 %add to i64
- %arrayidx6 = getelementptr inbounds i32, ptr %y, i64 %idxprom5
- %1 = load i32, ptr %arrayidx6, align 4
- %arrayidx10 = getelementptr inbounds i32, ptr %x, i64 %idxprom5
- store i32 %1, ptr %arrayidx10, align 4
- %add12 = add i8 %i.022, 2
- %conv = sext i8 %add12 to i32
- %cmp = icmp slt i32 %conv, %n
- br i1 %cmp, label %for.body, label %for.end.loopexit
-
-for.end.loopexit: ; preds = %for.body
- br label %for.end
-
-for.end: ; preds = %for.end.loopexit, %entry
- ret i32 0
-}
-
-; Function Attrs: noinline norecurse nounwind
-define i32 @test4(i64 %n, ptr nocapture %x, ptr nocapture readonly %y) {
-entry:
- %cmp18 = icmp eq i64 %n, 0
- br i1 %cmp18, label %for.end, label %for.body.preheader
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.body: ; preds = %for.body.preheader, %for.body
-
-;CHECK-LABEL: for.body:
-;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ]
-;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvar
-;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, ptr %arrayidx, align 4
-;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvar
-;CHECK-NEXT: store i32 [[T1]], ptr %arrayidx3, align 4
-;CHECK-NEXT: %indvar.next = add i64 %indvar, 1
-;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}}
-;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body
-
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, ptr %y, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %indvars.iv
- store i32 %0, ptr %arrayidx3, align 4
- %1 = or disjoint i64 %indvars.iv, 1
- %arrayidx5 = getelementptr inbounds i32, ptr %y, i64 %1
- %2 = load i32, ptr %arrayidx5, align 4
- %arrayidx8 = getelementptr inbounds i32, ptr %x, i64 %1
- store i32 %2, ptr %arrayidx8, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
- %cmp = icmp ult i64 %indvars.iv.next, %n
- br i1 %cmp, label %for.body, label %for.end.loopexit
-
-for.end.loopexit: ; preds = %for.body
- br label %for.end
-
-for.end: ; preds = %for.end.loopexit, %entry
- ret i32 0
-}
-
diff --git a/llvm/test/Transforms/LoopReroll/negative.ll b/llvm/test/Transforms/LoopReroll/negative.ll
deleted file mode 100644
index ef850c0e23cf..000000000000
--- a/llvm/test/Transforms/LoopReroll/negative.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: opt -S -passes=loop-reroll %s | FileCheck %s
-target triple = "aarch64--linux-gnu"
-@buf = global [16 x i8] c"\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A\0A", align 1
-
-define i32 @test1(i32 %len, ptr nocapture readonly %buf) #0 {
-entry:
- %cmp.13 = icmp sgt i32 %len, 1
- br i1 %cmp.13, label %while.body.lr.ph, label %while.end
-
-while.body.lr.ph: ; preds = %entry
- br label %while.body
-
-while.body:
-;CHECK-LABEL: while.body:
-;CHECK-NEXT: %indvar = phi i32 [ %indvar.next, %while.body ], [ 0, %while.body.lr.ph ]
-;CHECK-NEXT: %sum4.015 = phi i64 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
-;CHECK-NOT: %sub5 = add nsw i32 %len.addr.014, -1
-;CHECK-NOT: %sub5 = add nsw i32 %len.addr.014, -2
-;CHECK: br i1 %exitcond, label %while.cond.while.end_crit_edge, label %while.body
-
- %sum4.015 = phi i64 [ 0, %while.body.lr.ph ], [ %add4, %while.body ]
- %len.addr.014 = phi i32 [ %len, %while.body.lr.ph ], [ %sub5, %while.body ]
- %idxprom = sext i32 %len.addr.014 to i64
- %arrayidx = getelementptr inbounds i8, ptr %buf, i64 %idxprom
- %0 = load i8, ptr %arrayidx, align 1
- %conv = zext i8 %0 to i64
- %add = add i64 %conv, %sum4.015
- %sub = add nsw i32 %len.addr.014, -1
- %idxprom1 = sext i32 %sub to i64
- %arrayidx2 = getelementptr inbounds i8, ptr %buf, i64 %idxprom1
- %1 = load i8, ptr %arrayidx2, align 1
- %conv3 = zext i8 %1 to i64
- %add4 = add i64 %add, %conv3
- %sub5 = add nsw i32 %len.addr.014, -2
- %cmp = icmp sgt i32 %sub5, 1
- br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
-
-while.cond.while.end_crit_edge: ; preds = %while.body
- %add4.lcssa = phi i64 [ %add4, %while.body ]
- %phitmp = trunc i64 %add4.lcssa to i32
- br label %while.end
-
-while.end: ; preds = %while.cond.while.end_crit_edge, %entry
- %sum4.0.lcssa = phi i32 [ %phitmp, %while.cond.while.end_crit_edge ], [ 0, %entry ]
- ret i32 %sum4.0.lcssa
- unreachable
-}
-
diff --git a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll
deleted file mode 100644
index 80ea0506f774..000000000000
--- a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll
+++ /dev/null
@@ -1,168 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-none-linux"
-
-;void foo(int *A, int *B, int m, int n) {
-; for (int i = m; i < n; i+=4) {
-; A[i+0] = B[i+0] * 4;
-; A[i+1] = B[i+1] * 4;
-; A[i+2] = B[i+2] * 4;
-; A[i+3] = B[i+3] * 4;
-; }
-;}
-define void @foo(ptr nocapture %A, ptr nocapture readonly %B, i32 %m, i32 %n) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP34:%.*]] = icmp slt i32 [[M:%.*]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], 4
-; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]])
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SMAX]], -1
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[M]]
-; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[M]], [[INDVAR]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP7]], 2
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP5]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %cmp34 = icmp slt i32 %m, %n
- br i1 %cmp34, label %for.body, label %for.end
-
-for.body: ; preds = %entry, %for.body
- %i.035 = phi i32 [ %add18, %for.body ], [ %m, %entry ]
- %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.035
- %0 = load i32, ptr %arrayidx, align 4
- %mul = shl nsw i32 %0, 2
- %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.035
- store i32 %mul, ptr %arrayidx2, align 4
- %add3 = add nsw i32 %i.035, 1
- %arrayidx4 = getelementptr inbounds i32, ptr %B, i32 %add3
- %1 = load i32, ptr %arrayidx4, align 4
- %mul5 = shl nsw i32 %1, 2
- %arrayidx7 = getelementptr inbounds i32, ptr %A, i32 %add3
- store i32 %mul5, ptr %arrayidx7, align 4
- %add8 = add nsw i32 %i.035, 2
- %arrayidx9 = getelementptr inbounds i32, ptr %B, i32 %add8
- %2 = load i32, ptr %arrayidx9, align 4
- %mul10 = shl nsw i32 %2, 2
- %arrayidx12 = getelementptr inbounds i32, ptr %A, i32 %add8
- store i32 %mul10, ptr %arrayidx12, align 4
- %add13 = add nsw i32 %i.035, 3
- %arrayidx14 = getelementptr inbounds i32, ptr %B, i32 %add13
- %3 = load i32, ptr %arrayidx14, align 4
- %mul15 = shl nsw i32 %3, 2
- %arrayidx17 = getelementptr inbounds i32, ptr %A, i32 %add13
- store i32 %mul15, ptr %arrayidx17, align 4
- %add18 = add nsw i32 %i.035, 4
- %cmp = icmp slt i32 %add18, %n
- br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
-;void daxpy_ur(int n,float da,ptr dx,ptr dy)
-; {
-; int m = n % 4;
-; for (int i = m; i < n; i = i + 4)
-; {
-; dy[i] = dy[i] + da*dx[i];
-; dy[i+1] = dy[i+1] + da*dx[i+1];
-; dy[i+2] = dy[i+2] + da*dx[i+2];
-; dy[i+3] = dy[i+3] + da*dx[i+3];
-; }
-; }
-define void @daxpy_ur(i32 %n, float %da, ptr nocapture readonly %dx, ptr nocapture %dy) {
-; CHECK-LABEL: @daxpy_ur(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[REM:%.*]] = srem i32 [[N:%.*]], 4
-; CHECK-NEXT: [[CMP55:%.*]] = icmp slt i32 [[REM]], [[N]]
-; CHECK-NEXT: br i1 [[CMP55]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[REM]]
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 3
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[REM]], [[INDVAR]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DY:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[DX:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP7]], [[DA:%.*]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP6]], [[MUL]]
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP4]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %rem = srem i32 %n, 4
- %cmp55 = icmp slt i32 %rem, %n
- br i1 %cmp55, label %for.body, label %for.end
-
-for.body: ; preds = %entry, %for.body
- %i.056 = phi i32 [ %add27, %for.body ], [ %rem, %entry ]
- %arrayidx = getelementptr inbounds float, ptr %dy, i32 %i.056
- %0 = load float, ptr %arrayidx, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %dx, i32 %i.056
- %1 = load float, ptr %arrayidx1, align 4
- %mul = fmul float %1, %da
- %add = fadd float %0, %mul
- store float %add, ptr %arrayidx, align 4
- %add3 = add nsw i32 %i.056, 1
- %arrayidx4 = getelementptr inbounds float, ptr %dy, i32 %add3
- %2 = load float, ptr %arrayidx4, align 4
- %arrayidx6 = getelementptr inbounds float, ptr %dx, i32 %add3
- %3 = load float, ptr %arrayidx6, align 4
- %mul7 = fmul float %3, %da
- %add8 = fadd float %2, %mul7
- store float %add8, ptr %arrayidx4, align 4
- %add11 = add nsw i32 %i.056, 2
- %arrayidx12 = getelementptr inbounds float, ptr %dy, i32 %add11
- %4 = load float, ptr %arrayidx12, align 4
- %arrayidx14 = getelementptr inbounds float, ptr %dx, i32 %add11
- %5 = load float, ptr %arrayidx14, align 4
- %mul15 = fmul float %5, %da
- %add16 = fadd float %4, %mul15
- store float %add16, ptr %arrayidx12, align 4
- %add19 = add nsw i32 %i.056, 3
- %arrayidx20 = getelementptr inbounds float, ptr %dy, i32 %add19
- %6 = load float, ptr %arrayidx20, align 4
- %arrayidx22 = getelementptr inbounds float, ptr %dx, i32 %add19
- %7 = load float, ptr %arrayidx22, align 4
- %mul23 = fmul float %7, %da
- %add24 = fadd float %6, %mul23
- store float %add24, ptr %arrayidx20, align 4
- %add27 = add nsw i32 %i.056, 4
- %cmp = icmp slt i32 %add27, %n
- br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
diff --git a/llvm/test/Transforms/LoopReroll/ptrindvar.ll b/llvm/test/Transforms/LoopReroll/ptrindvar.ll
deleted file mode 100644
index 90f6353197b2..000000000000
--- a/llvm/test/Transforms/LoopReroll/ptrindvar.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -S -passes=loop-reroll %s | FileCheck %s
-target triple = "aarch64--linux-gnu"
-
-define i32 @test(ptr readonly %buf, ptr readnone %end) #0 {
-; CHECK-LABEL: define i32 @test
-; CHECK-SAME: (ptr readonly [[BUF:%.*]], ptr readnone [[END:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[BUF2:%.*]] = ptrtoint ptr [[BUF]] to i64
-; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEXT: [[CMP_9:%.*]] = icmp eq ptr [[BUF]], [[END]]
-; CHECK-NEXT: br i1 [[CMP_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[BUF2]]
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[S_011:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[INDVAR]], 2
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_011]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP4]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
-;
-entry:
- %cmp.9 = icmp eq ptr %buf, %end
- br i1 %cmp.9, label %while.end, label %while.body.preheader
-
-while.body.preheader:
- br label %while.body
-
-while.body:
-
- %S.011 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ]
- %buf.addr.010 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ]
- %0 = load i32, ptr %buf.addr.010, align 4
- %add = add nsw i32 %0, %S.011
- %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.010, i64 1
- %1 = load i32, ptr %arrayidx1, align 4
- %add2 = add nsw i32 %add, %1
- %add.ptr = getelementptr inbounds i32, ptr %buf.addr.010, i64 2
- %cmp = icmp eq ptr %add.ptr, %end
- br i1 %cmp, label %while.end.loopexit, label %while.body
-
-while.end.loopexit:
- %add2.lcssa = phi i32 [ %add2, %while.body ]
- br label %while.end
-
-while.end:
- %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2.lcssa, %while.end.loopexit ]
- ret i32 %S.0.lcssa
-}
-
-define i32 @test2(ptr readonly %buf, ptr readnone %end) #0 {
-; CHECK-LABEL: define i32 @test2
-; CHECK-SAME: (ptr readonly [[BUF:%.*]], ptr readnone [[END:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[END2:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEXT: [[BUF1:%.*]] = ptrtoint ptr [[BUF]] to i64
-; CHECK-NEXT: [[CMP_9:%.*]] = icmp eq ptr [[BUF]], [[END]]
-; CHECK-NEXT: br i1 [[CMP_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[BUF1]], -8
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[END2]]
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[S_011:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ undef, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[INDVAR]], -4
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BUF]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP6]], [[S_011]]
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR]], [[TMP4]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[ADD2_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
-;
-entry:
- %cmp.9 = icmp eq ptr %buf, %end
- br i1 %cmp.9, label %while.end, label %while.body.preheader
-
-while.body.preheader:
- br label %while.body
-
-while.body:
-
- %S.011 = phi i32 [ %add2, %while.body ], [ undef, %while.body.preheader ]
- %buf.addr.010 = phi ptr [ %add.ptr, %while.body ], [ %buf, %while.body.preheader ]
- %0 = load i32, ptr %buf.addr.010, align 4
- %add = add nsw i32 %0, %S.011
- %arrayidx1 = getelementptr inbounds i32, ptr %buf.addr.010, i64 -1
- %1 = load i32, ptr %arrayidx1, align 4
- %add2 = add nsw i32 %add, %1
- %add.ptr = getelementptr inbounds i32, ptr %buf.addr.010, i64 -2
- %cmp = icmp eq ptr %add.ptr, %end
- br i1 %cmp, label %while.end.loopexit, label %while.body
-
-while.end.loopexit:
- %add2.lcssa = phi i32 [ %add2, %while.body ]
- br label %while.end
-
-while.end:
- %S.0.lcssa = phi i32 [ undef, %entry ], [ %add2.lcssa, %while.end.loopexit ]
- ret i32 %S.0.lcssa
-}
diff --git a/llvm/test/Transforms/LoopReroll/reduction.ll b/llvm/test/Transforms/LoopReroll/reduction.ll
deleted file mode 100644
index 94f4d53bfbf6..000000000000
--- a/llvm/test/Transforms/LoopReroll/reduction.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-; RUN: opt < %s -passes=loop-reroll -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define i32 @foo(ptr nocapture readonly %x) #0 {
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %add = add nsw i32 %0, %r.029
- %1 = or disjoint i64 %indvars.iv, 1
- %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %1
- %2 = load i32, ptr %arrayidx3, align 4
- %add4 = add nsw i32 %add, %2
- %3 = or disjoint i64 %indvars.iv, 2
- %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3
- %4 = load i32, ptr %arrayidx7, align 4
- %add8 = add nsw i32 %add4, %4
- %5 = or disjoint i64 %indvars.iv, 3
- %arrayidx11 = getelementptr inbounds i32, ptr %x, i64 %5
- %6 = load i32, ptr %arrayidx11, align 4
- %add12 = add nsw i32 %add8, %6
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
- %7 = trunc i64 %indvars.iv.next to i32
- %cmp = icmp slt i32 %7, 400
- br i1 %cmp, label %for.body, label %for.end
-
-; CHECK-LABEL: @foo
-
-; CHECK: for.body:
-; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
-; CHECK: %r.029 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-; CHECK: %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvar
-; CHECK: %1 = load i32, ptr %arrayidx, align 4
-; CHECK: %add = add nsw i32 %1, %r.029
-; CHECK: %indvar.next = add i64 %indvar, 1
-; CHECK: %exitcond = icmp eq i32 %0, 399
-; CHECK: br i1 %exitcond, label %for.end, label %for.body
-
-; CHECK: ret
-
-for.end: ; preds = %for.body
- ret i32 %add12
-}
-
-define float @bar(ptr nocapture readonly %x) #0 {
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %r.029 = phi float [ 0.0, %entry ], [ %add12, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv
- %0 = load float, ptr %arrayidx, align 4
- %add = fadd float %0, %r.029
- %1 = or disjoint i64 %indvars.iv, 1
- %arrayidx3 = getelementptr inbounds float, ptr %x, i64 %1
- %2 = load float, ptr %arrayidx3, align 4
- %add4 = fadd float %add, %2
- %3 = or disjoint i64 %indvars.iv, 2
- %arrayidx7 = getelementptr inbounds float, ptr %x, i64 %3
- %4 = load float, ptr %arrayidx7, align 4
- %add8 = fadd float %add4, %4
- %5 = or disjoint i64 %indvars.iv, 3
- %arrayidx11 = getelementptr inbounds float, ptr %x, i64 %5
- %6 = load float, ptr %arrayidx11, align 4
- %add12 = fadd float %add8, %6
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
- %7 = trunc i64 %indvars.iv.next to i32
- %cmp = icmp slt i32 %7, 400
- br i1 %cmp, label %for.body, label %for.end
-
-; CHECK-LABEL: @bar
-
-; CHECK: for.body:
-; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
-; CHECK: %r.029 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
-; CHECK: %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvar
-; CHECK: %1 = load float, ptr %arrayidx, align 4
-; CHECK: %add = fadd float %1, %r.029
-; CHECK: %indvar.next = add i64 %indvar, 1
-; CHECK: %exitcond = icmp eq i32 %0, 399
-; CHECK: br i1 %exitcond, label %for.end, label %for.body
-
-; CHECK: ret
-
-for.end: ; preds = %for.body
- ret float %add12
-}
-
-define i32 @foo_unusedphi(ptr nocapture readonly %x) #0 {
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %x, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %add = add nsw i32 %0, %0
- %1 = or disjoint i64 %indvars.iv, 1
- %arrayidx3 = getelementptr inbounds i32, ptr %x, i64 %1
- %2 = load i32, ptr %arrayidx3, align 4
- %add4 = add nsw i32 %add, %2
- %3 = or disjoint i64 %indvars.iv, 2
- %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %3
- %4 = load i32, ptr %arrayidx7, align 4
- %add8 = add nsw i32 %add4, %4
- %5 = or disjoint i64 %indvars.iv, 3
- %arrayidx11 = getelementptr inbounds i32, ptr %x, i64 %5
- %6 = load i32, ptr %arrayidx11, align 4
- %add12 = add nsw i32 %add8, %6
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
- %7 = trunc i64 %indvars.iv.next to i32
- %cmp = icmp slt i32 %7, 400
- br i1 %cmp, label %for.body, label %for.end
-
-; CHECK-LABEL: @foo_unusedphi
-; The above is just testing for a crash - no specific output expected.
-
-; CHECK: ret
-
-for.end: ; preds = %for.body
- ret i32 %add12
-}
-
-attributes #0 = { nounwind readonly uwtable }
-
diff --git a/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll b/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll
deleted file mode 100644
index e720e761f4d6..000000000000
--- a/llvm/test/Transforms/LoopReroll/reroll_with_dbg.ll
+++ /dev/null
@@ -1,130 +0,0 @@
-;RUN: opt < %s -passes=loop-reroll -S | FileCheck %s
-;void foo(ptr restrict a, ptr restrict b, int n) {
-; for(int i = 0; i < n; i+=4) {
-; a[i] = b[i];
-; a[i+1] = b[i+1];
-; a[i+2] = b[i+2];
-; a[i+3] = b[i+3];
-; }
-;}
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv4t--linux-gnueabi"
-
-; Function Attrs: nounwind
-define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %n) #0 !dbg !4 {
-entry:
-;CHECK-LABEL: @foo
-
- tail call void @llvm.dbg.value(metadata ptr %a, metadata !12, metadata !22), !dbg !23
- tail call void @llvm.dbg.value(metadata ptr %b, metadata !13, metadata !22), !dbg !24
- tail call void @llvm.dbg.value(metadata i32 %n, metadata !14, metadata !22), !dbg !25
- tail call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !22), !dbg !26
- %cmp.30 = icmp sgt i32 %n, 0, !dbg !27
- br i1 %cmp.30, label %for.body.preheader, label %for.cond.cleanup, !dbg !29
-
-for.body.preheader: ; preds = %entry
- br label %for.body, !dbg !30
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- br label %for.cond.cleanup, !dbg !32
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret void, !dbg !32
-
-for.body: ; preds = %for.body.preheader, %for.body
-;CHECK: for.body:
-;CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, {{.*}} ]
-;CHECK: load
-;CHECK: store
-;CHECK-NOT: load
-;CHECK-NOT: store
-;CHECK: call void @llvm.dbg.value
-;CHECK: %indvar.next = add i32 %indvar, 1
-;CHECK: icmp eq i32 %indvar
- %i.031 = phi i32 [ %add13, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds float, ptr %b, i32 %i.031, !dbg !30
- %0 = load i32, ptr %arrayidx, align 4, !dbg !30, !tbaa !33
- %arrayidx1 = getelementptr inbounds float, ptr %a, i32 %i.031, !dbg !37
- store i32 %0, ptr %arrayidx1, align 4, !dbg !38, !tbaa !33
- %add = or disjoint i32 %i.031, 1, !dbg !39
- %arrayidx2 = getelementptr inbounds float, ptr %b, i32 %add, !dbg !40
- %1 = load i32, ptr %arrayidx2, align 4, !dbg !40, !tbaa !33
- %arrayidx4 = getelementptr inbounds float, ptr %a, i32 %add, !dbg !41
- store i32 %1, ptr %arrayidx4, align 4, !dbg !42, !tbaa !33
- %add5 = or disjoint i32 %i.031, 2, !dbg !43
- %arrayidx6 = getelementptr inbounds float, ptr %b, i32 %add5, !dbg !44
- %2 = load i32, ptr %arrayidx6, align 4, !dbg !44, !tbaa !33
- %arrayidx8 = getelementptr inbounds float, ptr %a, i32 %add5, !dbg !45
- store i32 %2, ptr %arrayidx8, align 4, !dbg !46, !tbaa !33
- %add9 = or disjoint i32 %i.031, 3, !dbg !47
- %arrayidx10 = getelementptr inbounds float, ptr %b, i32 %add9, !dbg !48
- %3 = load i32, ptr %arrayidx10, align 4, !dbg !48, !tbaa !33
- %arrayidx12 = getelementptr inbounds float, ptr %a, i32 %add9, !dbg !49
- store i32 %3, ptr %arrayidx12, align 4, !dbg !50, !tbaa !33
- %add13 = add nuw nsw i32 %i.031, 4, !dbg !51
- tail call void @llvm.dbg.value(metadata i32 %add13, metadata !15, metadata !22), !dbg !26
- %cmp = icmp slt i32 %add13, %n, !dbg !27
- br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !dbg !29
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!17, !18, !19, !20}
-!llvm.ident = !{!21}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "test.c", directory: "/home/weimingz/llvm-build/release/community-tip")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
-!5 = !DISubroutineType(types: !6)
-!6 = !{null, !7, !7, !10}
-!7 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !8)
-!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 32, align: 32)
-!9 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
-!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!11 = !{!12, !13, !14, !15}
-!12 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1, type: !7)
-!13 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 1, type: !7)
-!14 = !DILocalVariable(name: "n", arg: 3, scope: !4, file: !1, line: 1, type: !10)
-!15 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 2, type: !10)
-!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 2, column: 3)
-!17 = !{i32 2, !"Dwarf Version", i32 4}
-!18 = !{i32 2, !"Debug Info Version", i32 3}
-!19 = !{i32 1, !"wchar_size", i32 4}
-!20 = !{i32 1, !"min_enum_size", i32 4}
-!21 = !{!"clang version 3.8.0"}
-!22 = !DIExpression()
-!23 = !DILocation(line: 1, column: 27, scope: !4)
-!24 = !DILocation(line: 1, column: 47, scope: !4)
-!25 = !DILocation(line: 1, column: 54, scope: !4)
-!26 = !DILocation(line: 2, column: 11, scope: !16)
-!27 = !DILocation(line: 2, column: 20, scope: !28)
-!28 = distinct !DILexicalBlock(scope: !16, file: !1, line: 2, column: 3)
-!29 = !DILocation(line: 2, column: 3, scope: !16)
-!30 = !DILocation(line: 3, column: 12, scope: !31)
-!31 = distinct !DILexicalBlock(scope: !28, file: !1, line: 2, column: 31)
-!32 = !DILocation(line: 8, column: 1, scope: !4)
-!33 = !{!34, !34, i64 0}
-!34 = !{!"float", !35, i64 0}
-!35 = !{!"omnipotent char", !36, i64 0}
-!36 = !{!"Simple C/C++ TBAA"}
-!37 = !DILocation(line: 3, column: 5, scope: !31)
-!38 = !DILocation(line: 3, column: 10, scope: !31)
-!39 = !DILocation(line: 4, column: 17, scope: !31)
-!40 = !DILocation(line: 4, column: 14, scope: !31)
-!41 = !DILocation(line: 4, column: 5, scope: !31)
-!42 = !DILocation(line: 4, column: 12, scope: !31)
-!43 = !DILocation(line: 5, column: 17, scope: !31)
-!44 = !DILocation(line: 5, column: 14, scope: !31)
-!45 = !DILocation(line: 5, column: 5, scope: !31)
-!46 = !DILocation(line: 5, column: 12, scope: !31)
-!47 = !DILocation(line: 6, column: 17, scope: !31)
-!48 = !DILocation(line: 6, column: 14, scope: !31)
-!49 = !DILocation(line: 6, column: 5, scope: !31)
-!50 = !DILocation(line: 6, column: 12, scope: !31)
-!51 = !DILocation(line: 2, column: 26, scope: !28)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
index cb9ba1b088cb..22aaa563daa5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
@@ -2,7 +2,7 @@
; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnueabi"
+target triple = "aarch64"
; Test integer induction variable of step 2:
; for (int i = 0; i < 1024; i+=2) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll b/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
index c47a63020281..4f7195934a79 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=aarch64--linux-gnueabi -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s -S | FileCheck %s
+; RUN: opt -mtriple=aarch64 -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s -S | FileCheck %s
; The following tests contain loops for which SCEV cannot determine the backedge
; taken count. This is because the backedge taken condition is produced by an
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index bf9146bbad4c..78798725b5b9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -5,7 +5,7 @@
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnueabi"
+target triple = "aarch64"
%i8.2 = type {i8, i8}
define void @i8_factor_2(ptr %data, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 29d8719db9b2..50a5cc6774c5 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -3523,10 +3523,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; IND: vector.ph:
-; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2
+; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510
; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
-; IND-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; IND-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
@@ -3589,10 +3589,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; UNROLL: vector.ph:
-; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4
+; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508
; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
-; UNROLL-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; UNROLL-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
@@ -3733,10 +3733,10 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; INTERLEAVE: vector.ph:
-; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8
+; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504
; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
-; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]]
+; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
@@ -3907,11 +3907,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; IND: vector.ph:
-; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2
+; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510
; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
-; IND-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; IND-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
+; IND-NEXT: [[EXT_MUL5:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
+; IND-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL5]], 2
; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -3976,11 +3976,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; UNROLL: vector.ph:
-; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4
+; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508
; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
-; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
+; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
+; UNROLL-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2
; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4126,11 +4126,11 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; INTERLEAVE: vector.ph:
-; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8
+; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504
; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
-; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
+; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
+; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2
; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
index 4ea6d7f42839..97faa23cb516 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll
@@ -88,46 +88,10 @@ declare <1 x float> @llvm.matrix.multiply.v1f32.v7f32.v7f32(<7 x float>, <7 x fl
define <1 x double> @dotproduct_double_v6(<6 x double> %a, <6 x double> %b) {
; CHECK-LABEL: @dotproduct_double_v6(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[A:%.*]], <6 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <1 x i32> <i32 3>
-; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <1 x i32> <i32 4>
-; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <1 x i32> <i32 5>
-; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <6 x double> [[B:%.*]], <6 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x double> [[SPLIT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <6 x double> [[SPLIT6]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <1 x double> [[SPLIT1]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x double> [[SPLIT6]], i64 1
-; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT8]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK7]], <1 x double> [[SPLAT_SPLAT9]], <1 x double> [[TMP1]])
-; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <1 x double> [[SPLIT2]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x double> [[SPLIT6]], i64 2
-; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT11]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK10]], <1 x double> [[SPLAT_SPLAT12]], <1 x double> [[TMP3]])
-; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x double> [[SPLIT3]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x double> [[SPLIT6]], i64 3
-; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT14]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK13]], <1 x double> [[SPLAT_SPLAT15]], <1 x double> [[TMP5]])
-; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x double> [[SPLIT4]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x double> [[SPLIT6]], i64 4
-; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT17]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK16]], <1 x double> [[SPLAT_SPLAT18]], <1 x double> [[TMP7]])
-; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <1 x double> [[SPLIT5]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x double> [[SPLIT6]], i64 5
-; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT20]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK19]], <1 x double> [[SPLAT_SPLAT21]], <1 x double> [[TMP9]])
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x double> poison, <1 x double> [[TMP12]], <1 x i32> <i32 1>
-; CHECK-NEXT: ret <1 x double> [[TMP13]]
+; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT: ret <1 x double> [[TMP2]]
;
entry:
%c = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 1, i32 6, i32 1)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll
index 82ae93b31035..33a338dbc4ea 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll
@@ -4,31 +4,35 @@
define <8 x double> @fadd_transpose(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: @fadd_transpose(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x double> [[SPLIT]], [[SPLIT3]]
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[SPLIT2]], [[SPLIT4]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP0]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP4]], i64 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP0]], i64 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP8]], i64 1
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP0]], i64 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[TMP1]], i64 2
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP12]], i64 1
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x double> [[TMP0]], i64 3
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[TMP1]], i64 3
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP15]], double [[TMP16]], i64 1
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP18]], <4 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SPLIT]], [[SPLIT4]]
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SPLIT1]], [[SPLIT5]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SPLIT2]], [[SPLIT6]]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SPLIT3]], [[SPLIT7]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 3
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> poison, double [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 1
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i64 2
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 3
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x double> [[TMP20]]
;
entry:
@@ -42,40 +46,37 @@ define <8 x double> @load_fadd_transpose(ptr %A.Ptr, <8 x double> %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A_PTR:%.*]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A_PTR]], i64 2
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[A_PTR]], i64 4
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
-; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[A_PTR]], i64 6
-; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <2 x double>, ptr [[VEC_GEP5]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> [[COL_LOAD2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> [[COL_LOAD6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SPLIT9:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SPLIT]], [[SPLIT8]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[SPLIT7]], [[SPLIT9]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i64 1
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP3]], i64 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x double> [[TMP4]], i64 1
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP11]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x double> [[TMP3]], i64 2
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x double> [[TMP4]], i64 2
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP14]], double [[TMP15]], i64 1
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x double> [[TMP3]], i64 3
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TMP17]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x double> [[TMP4]], i64 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP18]], double [[TMP19]], i64 1
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> [[TMP21]], <4 x double> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x double> [[TMP23]]
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[A_PTR]], i64 4
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <2 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr double, ptr [[A_PTR]], i64 6
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, ptr [[VEC_GEP4]], align 8
+; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[COL_LOAD]], [[SPLIT]]
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[COL_LOAD1]], [[SPLIT6]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[COL_LOAD3]], [[SPLIT7]]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[COL_LOAD5]], [[SPLIT8]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 3
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> poison, double [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 1
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i64 2
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 3
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x double> [[TMP20]]
;
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts-lifting.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts-lifting.ll
new file mode 100644
index 000000000000..fcf83b03bc3d
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-opts-lifting.ll
@@ -0,0 +1,151 @@
+; RUN: opt -p lower-matrix-intrinsics -matrix-print-after-transpose-opt -disable-output -S %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; FIXME: Lifted transpose dimensions are incorrect.
+define <6 x double> @lift_through_add_matching_transpose_dimensions(<6 x double> %a, <6 x double> %b) {
+; CHECK-LABEL: define <6 x double> @lift_through_add_matching_transpose_dimensions(<6 x double> %a, <6 x double> %b) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.+]] = fadd <6 x double> %a, %b
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[A]], i32 3, i32 2)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 3, i32 2)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 3, i32 2)
+ %add = fadd <6 x double> %a.t, %b.t
+ ret <6 x double> %add
+}
+
+define <6 x double> @lift_through_add_matching_transpose_dimensions_ops_also_have_shape_info(ptr %a.ptr, ptr %b.ptr) {
+; CHECK-LABEL: define <6 x double> @lift_through_add_matching_transpose_dimensions_ops_also_have_shape_info(ptr %a.ptr, ptr %b.ptr)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.+]] = load <6 x double>, ptr %a.ptr
+; CHECK-NEXT: [[B:%.+]] = load <6 x double>, ptr %b.ptr
+; CHECK-NEXT: [[ADD:%.+]] = fadd <6 x double> [[A]], [[B]]
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[ADD]], i32 3, i32 2)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a = load <6 x double>, ptr %a.ptr
+ %b = load <6 x double>, ptr %b.ptr
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 3, i32 2)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 3, i32 2)
+ %add = fadd <6 x double> %a.t, %b.t
+ ret <6 x double> %add
+}
+
+define <6 x double> @lift_through_add_mismatching_dimensions_1(<6 x double> %a, <6 x double> %b) {
+; CHECK-LABEL: define <6 x double> @lift_through_add_mismatching_dimensions_1(<6 x double> %a, <6 x double> %b) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.+]] = fadd <6 x double> %a, %b
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[A]], i32 1, i32 6)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 1, i32 6)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 3, i32 2)
+ %add = fadd <6 x double> %a.t, %b.t
+ ret <6 x double> %add
+}
+
+define <6 x double> @lift_through_add_mismatching_dimensions_1_transpose_dimensions_ops_also_have_shape_info(ptr %a.ptr, ptr %b.ptr) {
+; CHECK-LABEL: define <6 x double> @lift_through_add_mismatching_dimensions_1_transpose_dimensions_ops_also_have_shape_info(ptr %a.ptr, ptr %b.ptr)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.+]] = load <6 x double>, ptr %a.ptr
+; CHECK-NEXT: [[B:%.+]] = load <6 x double>, ptr %b.ptr
+; CHECK-NEXT: [[ADD:%.+]] = fadd <6 x double> [[A]], [[B]]
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[ADD]], i32 1, i32 6)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a = load <6 x double>, ptr %a.ptr
+ %b = load <6 x double>, ptr %b.ptr
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 1, i32 6)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 3, i32 2)
+ %add = fadd <6 x double> %a.t, %b.t
+ ret <6 x double> %add
+}
+
+define <6 x double> @lift_through_add_mismatching_dimensions_2(<6 x double> %a, <6 x double> %b) {
+; CHECK-LABEL: define <6 x double> @lift_through_add_mismatching_dimensions_2(<6 x double> %a, <6 x double> %b) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.+]] = fadd <6 x double> %a, %b
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[A]], i32 3, i32 2)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+
+entry:
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 3, i32 2)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 6, i32 1)
+ %add = fadd <6 x double> %a.t, %b.t
+ ret <6 x double> %add
+}
+
+define <6 x double> @lift_through_add_mismatching_dimensions_2_transpose_dimensions_ops_also_have_shape_info(ptr %a.ptr, ptr %b.ptr) {
+; CHECK-LABEL: define <6 x double> @lift_through_add_mismatching_dimensions_2_transpose_dimensions_ops_also_have_shape_info(ptr %a.ptr, ptr %b.ptr)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.+]] = load <6 x double>, ptr %a.ptr
+; CHECK-NEXT: [[B:%.+]] = load <6 x double>, ptr %b.ptr
+; CHECK-NEXT: [[ADD:%.+]] = fadd <6 x double> [[A]], [[B]]
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[ADD]], i32 3, i32 2)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a = load <6 x double>, ptr %a.ptr
+ %b = load <6 x double>, ptr %b.ptr
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 3, i32 2)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 6, i32 1)
+ %add = fadd <6 x double> %a.t, %b.t
+ ret <6 x double> %add
+}
+
+define <9 x double> @lift_through_multiply(<6 x double> %a, <6 x double> %b) {
+; CHECK-LABEL: define <9 x double> @lift_through_multiply(<6 x double> %a, <6 x double> %b) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.+]] = call <9 x double> @llvm.matrix.multiply.v9f64.v6f64.v6f64(<6 x double> %b, <6 x double> %a, i32 3, i32 2, i32 3)
+; CHECK-NEXT: [[T:%.+]] = call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> [[MUL]], i32 3, i32 3)
+; CHECK-NEXT: ret <9 x double> [[T]]
+;
+entry:
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 3, i32 2)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 2, i32 3)
+ %mul = call <9 x double> @llvm.matrix.multiply.v9f64.v6f64(<6 x double> %a.t, <6 x double> %b.t, i32 3, i32 2 , i32 3)
+ ret <9 x double> %mul
+}
+
+define <6 x double> @lift_through_multiply_2(<6 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: define <6 x double> @lift_through_multiply_2(<6 x double> %a, <4 x double> %b) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.+]] = call <6 x double> @llvm.matrix.multiply.v6f64.v4f64.v6f64(<4 x double> %b, <6 x double> %a, i32 2, i32 2, i32 3)
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[MUL]], i32 2, i32 3)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %a, i32 3, i32 2)
+ %b.t = call <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> %b, i32 2, i32 2)
+ %mul = call <6 x double> @llvm.matrix.multiply.v6f64.v6f64.v4f64(<6 x double> %a.t, <4 x double> %b.t, i32 3, i32 2 , i32 2)
+ ret <6 x double> %mul
+}
+
+define <6 x double> @lift_through_multiply_3(<4 x double> %a, <6 x double> %b) {
+; CHECK-LABEL: define <6 x double> @lift_through_multiply_3(<4 x double> %a, <6 x double> %b) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.+]] = call <6 x double> @llvm.matrix.multiply.v6f64.v6f64.v4f64(<6 x double> %b, <4 x double> %a, i32 3, i32 2, i32 2)
+; CHECK-NEXT: [[T:%.+]] = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> [[MUL]], i32 3, i32 2)
+; CHECK-NEXT: ret <6 x double> [[T]]
+;
+entry:
+ %a.t = call <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> %a, i32 2, i32 2)
+ %b.t = call <6 x double> @llvm.matrix.transpose.v6f64(<6 x double> %b, i32 2, i32 3)
+ %mul = call <6 x double> @llvm.matrix.multiply.v6f64.v4f64.v6f64(<4 x double> %a.t, <6 x double> %b.t, i32 2, i32 2 , i32 3)
+ ret <6 x double> %mul
+}
+
+declare <6 x double> @llvm.matrix.transpose.v6f64.v6f64(<6 x double>, i32, i32)
+declare <4 x double> @llvm.matrix.transpose.v4f64.v4f64(<4 x double>, i32, i32)
+declare <9 x double> @llvm.matrix.multiply.v9f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
+declare <6 x double> @llvm.matrix.multiply.v6f64.v6f64.v4f64(<6 x double>, <4 x double>, i32, i32, i32)
+declare <6 x double> @llvm.matrix.multiply.v6f64.v4f64.v6f64(<4 x double>, <6 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll
new file mode 100644
index 000000000000..034af89112cb
--- /dev/null
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-annotation.ll
@@ -0,0 +1,68 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -passes=lowertypetests %s -o %t.o
+; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-foobar
+; CHECK-foobar: {{llvm.global.annotations = .*[foo|bar], .*[foo|bar],}}
+; RUN: llvm-dis %t.o -o - | FileCheck %s --check-prefix=CHECK-cfi
+; CHECK-cfi-NOT: {{llvm.global.annotations = .*cfi.*}}
+
+target triple = "aarch64-none-linux-gnu"
+
+@.src = private unnamed_addr constant [7 x i8] c"test.c\00", align 1
+@.str = private unnamed_addr constant [30 x i8] c"annotation_string_literal_bar\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", section "llvm.metadata"
+@.str.2 = private unnamed_addr constant [30 x i8] c"annotation_string_literal_foo\00", section "llvm.metadata"
+@llvm.global.annotations = appending global [2 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @bar, ptr @.str, ptr @.str.1, i32 2, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @foo, ptr @.str.2, ptr @.str.1, i32 1, ptr null }], section "llvm.metadata"
+
+define i32 @bar(i32 noundef %0) #0 !type !8 !type !9 {
+ %2 = alloca i32, align 4
+ store i32 %0, ptr %2, align 4
+ %3 = load i32, ptr %2, align 4
+ %4 = call i32 @foo(i32 noundef %3)
+ ret i32 %4
+}
+
+declare !type !8 !type !9 i32 @foo(i32 noundef) #1
+
+define i32 @test(i32 noundef %0) #0 !type !8 !type !9 {
+ %2 = alloca i32, align 4
+ %3 = alloca ptr, align 8
+ store i32 %0, ptr %2, align 4
+ %4 = load i32, ptr %2, align 4
+ %5 = icmp sgt i32 %4, 0
+ %6 = zext i1 %5 to i64
+ %7 = select i1 %5, ptr @foo, ptr @bar
+ store ptr %7, ptr %3, align 8
+ %8 = load ptr, ptr %3, align 8
+ %9 = call i1 @llvm.type.test(ptr %8, metadata !"_ZTSFiiE"), !nosanitize !10
+ br i1 %9, label %11, label %10, !nosanitize !10
+
+10:
+ call void @llvm.ubsantrap(i8 2) #4, !nosanitize !10
+ unreachable, !nosanitize !10
+
+11:
+ %12 = load i32, ptr %2, align 4
+ %13 = call i32 %8(i32 noundef %12)
+ ret i32 %13
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.ubsantrap(i8 immarg)
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" }
+attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" }
+attributes #4 = { noreturn nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 4, !"CFI Canonical Jump Tables", i32 0}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{i32 1, !"ThinLTO", i32 0}
+!6 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!8 = !{i64 0, !"_ZTSFiiE"}
+!9 = !{i64 0, !"_ZTSFiiE.generalized"}
+!10 = !{}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
new file mode 100644
index 000000000000..6132c35c96ca
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx11.0.0"
+
+define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @partial_unroll_forced(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: loop.latch.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]]
+; CHECK: loop.latch.preheader.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[SRC_IDX:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[L:%.*]] = load <8 x half>, ptr [[SRC_IDX]], align 16
+; CHECK-NEXT: [[DST_IDX:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd <8 x half> [[L]], [[L]]
+; CHECK-NEXT: store <8 x half> [[ADD]], ptr [[DST_IDX]], align 16
+; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[SRC_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[L_1:%.*]] = load <8 x half>, ptr [[SRC_IDX_1]], align 16
+; CHECK-NEXT: [[DST_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd <8 x half> [[L_1]], [[L_1]]
+; CHECK-NEXT: store <8 x half> [[ADD_1]], ptr [[DST_IDX_1]], align 16
+; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: exit.loopexit.unr-lcssa:
+; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]]
+; CHECK: loop.latch.epil:
+; CHECK-NEXT: [[SRC_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT: [[L_EPIL:%.*]] = load <8 x half>, ptr [[SRC_IDX_EPIL]], align 16
+; CHECK-NEXT: [[DST_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT: [[ADD_EPIL:%.*]] = fadd <8 x half> [[L_EPIL]], [[L_EPIL]]
+; CHECK-NEXT: store <8 x half> [[ADD_EPIL]], ptr [[DST_IDX_EPIL]], align 16
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %cmp14 = icmp slt i32 %iv, %N
+ br i1 %cmp14, label %loop.latch, label %exit
+
+loop.latch:
+ %iv.ext = zext i32 %iv to i64
+ %src.idx = getelementptr <8 x half>, ptr %src, i64 %iv.ext
+ %l = load <8 x half>, ptr %src.idx, align 16
+ %dst.idx = getelementptr <8 x half>, ptr %dst, i64 %iv.ext
+ %add = fadd <8 x half> %l, %l
+ store <8 x half> %add, ptr %dst.idx, align 16
+ %iv.next = add i32 %iv, 1
+ br label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.unroll.count", i32 2}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
new file mode 100644
index 000000000000..c6c9a52167d5
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx11.0.0"
+
+define i32 @read_only_loop_with_runtime_check(ptr noundef %array, i32 noundef %count, i32 noundef %n) {
+; CHECK-LABEL: define i32 @read_only_loop_with_runtime_check(
+; CHECK-SAME: ptr nocapture noundef readonly [[ARRAY:%.*]], i32 noundef [[COUNT:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i32 [[TMP1]], [[COUNT]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD]], [[IF_END]] ]
+; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: tail call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: if.end:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP2]], [[SUM_07]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+ %array.addr = alloca ptr, align 8
+ %count.addr = alloca i32, align 4
+ %n.addr = alloca i32, align 4
+ %sum = alloca i32, align 4
+ %i = alloca i32, align 4
+ store ptr %array, ptr %array.addr, align 8
+ store i32 %count, ptr %count.addr, align 4
+ store i32 %n, ptr %n.addr, align 4
+ call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #3
+ store i32 0, ptr %sum, align 4
+ call void @llvm.lifetime.start.p0(i64 4, ptr %i) #3
+ store i32 0, ptr %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, ptr %i, align 4
+ %1 = load i32, ptr %n.addr, align 4
+ %cmp = icmp ult i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3
+ br label %for.end
+
+for.body: ; preds = %for.cond
+ %2 = load i32, ptr %i, align 4
+ %3 = load i32, ptr %count.addr, align 4
+ %cmp1 = icmp uge i32 %2, %3
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %4 = load ptr, ptr %array.addr, align 8
+ %5 = load i32, ptr %i, align 4
+ %idxprom = zext i32 %5 to i64
+ %arrayidx = getelementptr inbounds i32, ptr %4, i64 %idxprom
+ %6 = load i32, ptr %arrayidx, align 4
+ %7 = load i32, ptr %sum, align 4
+ %add = add nsw i32 %7, %6
+ store i32 %add, ptr %sum, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %8 = load i32, ptr %i, align 4
+ %inc = add i32 %8, 1
+ store i32 %inc, ptr %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond.cleanup
+ %9 = load i32, ptr %sum, align 4
+ call void @llvm.lifetime.end.p0(i64 4, ptr %sum)
+ ret i32 %9
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+
+declare void @llvm.trap()
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index a7e8e1580411..af24a9ab9e3f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -14,18 +14,81 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[SMAX7:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP2]], i64 32000)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 32000
+; CHECK-NEXT: [[UMIN8:%.*]] = zext i1 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP2]], [[UMIN8]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[SMAX7]], [[TMP4]]
+; CHECK-NEXT: [[UMAX9:%.*]] = tail call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1)
+; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 [[TMP5]], [[UMAX9]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[UMIN8]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP8]], 23
+; CHECK-NEXT: [[IDENT_CHECK_NOT:%.*]] = icmp eq i32 [[XB]], 1
+; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[MIN_ITERS_CHECK]], [[IDENT_CHECK_NOT]]
+; CHECK-NEXT: br i1 [[OR_COND]], label [[VECTOR_MEMCHECK:%.*]], label [[FOR_BODY_PREHEADER13:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP10]], i64 32000)
+; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i64 [[TMP10]], 32000
+; CHECK-NEXT: [[UMIN:%.*]] = zext i1 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP10]], [[UMIN]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[SMAX]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[UMIN]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP0]]
+; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 4
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP6]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP5]], [[SCEVGEP4]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], -8
+; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[N_VEC]], [[TMP1]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP18]], [[TMP0]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP19]], [[TMP0]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i64 16
+; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4, !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD11]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD12]], [[WIDE_LOAD10]]
+; CHECK-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP22]], align 4, !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP23]], align 4, !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]]
+; CHECK: for.body.preheader13:
+; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER13]] ]
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L_A]], [[L_B]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP1]]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -63,6 +126,14 @@ for.end:
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.mustprogress"}
;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]}
+; CHECK: [[META6]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]], [[META7]]}
;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
index 77f53ad56e1c..e514defc91e2 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -21,7 +21,7 @@ define dso_local void @_Z3fooPiii(ptr %A, i32 %N, i32 %M) #0 {
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVAR6]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT: tail call void @_Z1fi(i32 [[TMP2]])
-; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw nsw i64 [[INDVAR6]], 1
+; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw i64 [[INDVAR6]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT7]], [[FLATTEN_TRIPCOUNT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
; CHECK: for.cond.cleanup:
diff --git a/llvm/test/Transforms/SCCP/pr79696.ll b/llvm/test/Transforms/SCCP/pr79696.ll
new file mode 100644
index 000000000000..a860112d5ef3
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/pr79696.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=ipsccp -S | FileCheck %s
+
+; Tests from PR79696
+
+define i1 @constant_range_xor(i64 %a) {
+; CHECK-LABEL: define i1 @constant_range_xor(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], 8192
+; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[CTLZ:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A]], i1 true)
+; CHECK-NEXT: [[CONV:%.*]] = xor i64 [[CTLZ]], 63
+; CHECK-NEXT: ret i1 false
+; CHECK: else:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = icmp ugt i64 %a, 8192
+ br i1 %cmp, label %then, label %else
+then:
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) ;[0, 50]
+ %conv = xor i64 %ctlz, 63 ;[13, 63]
+ %cmp1 = icmp ult i64 %conv, 13
+ ret i1 %cmp1
+else:
+ ret i1 false
+}
+
+define i1 @constant_range_xor_negative(i64 %a) {
+; CHECK-LABEL: define i1 @constant_range_xor_negative(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], 8192
+; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[CTLZ:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A]], i1 true)
+; CHECK-NEXT: [[CONV:%.*]] = xor i64 [[CTLZ]], 62
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CONV]], 13
+; CHECK-NEXT: ret i1 [[CMP1]]
+; CHECK: else:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = icmp ugt i64 %a, 8192
+ br i1 %cmp, label %then, label %else
+then:
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) ;[0, 50]
+ %conv = xor i64 %ctlz, 62 ;[12, 63]
+ %cmp1 = icmp ult i64 %conv, 13
+ ret i1 %cmp1
+else:
+ ret i1 false
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index dc5fb9178863..e167b6a47af5 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-80 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
; CHECK-LABEL: define i32 @test(
@@ -67,305 +67,303 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX20_3]], i32 1
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[ARRAYIDX22_3]], i32 1
+; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX3_3]], i32 0
-; CHECK-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP59]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
-; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 4, i64 6>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
-; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
-; CHECK-NEXT: [[TMP65:%.*]] = shl <2 x i32> [[TMP64]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP66:%.*]] = add <2 x i32> [[TMP65]], [[TMP58]]
-; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 1, i64 3>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
-; CHECK-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
-; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> poison, i8 [[TMP50]], i32 0
-; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP51]], i32 1
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32>
-; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 5, i64 7>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]]
-; CHECK-NEXT: [[TMP78:%.*]] = shl <2 x i32> [[TMP77]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = sub <2 x i32> [[TMP66]], [[TMP79]]
-; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP83:%.*]] = add <2 x i32> [[TMP81]], [[TMP82]]
-; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP84]], [[TMP85]]
-; CHECK-NEXT: [[TMP87:%.*]] = add <2 x i32> [[TMP86]], [[TMP83]]
-; CHECK-NEXT: [[TMP88:%.*]] = sub <2 x i32> [[TMP83]], [[TMP86]]
-; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP80]], i32 0
-; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP80]], i32 1
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
-; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
-; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP91]], [[TMP92]]
-; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP92]], [[TMP91]]
-; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP54]], i32 0
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP93]], 15
+; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT: [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]]
+; CHECK-NEXT: [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
+; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT: [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]]
+; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0
+; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1
+; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
+; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
+; CHECK-NEXT: [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]]
+; CHECK-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]]
+; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]]
+; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]]
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]]
+; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0
+; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]]
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1
+; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
-; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP94]], 15
+; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15
; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0
-; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1
-; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP95]], [[TMP96]]
-; CHECK-NEXT: [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
-; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32>
-; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
-; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
-; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP103:%.*]] = add <2 x i32> [[TMP100]], [[TMP102]]
-; CHECK-NEXT: [[TMP104:%.*]] = sub <2 x i32> [[TMP100]], [[TMP102]]
-; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> [[TMP104]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP106:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
-; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP109]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32>
-; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
-; CHECK-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = shl <2 x i32> [[TMP116]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP118:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP121]], [[TMP123]]
-; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP111]]
-; CHECK-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP117]], [[TMP127]]
-; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP98]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP119]]
-; CHECK-NEXT: [[TMP131:%.*]] = add <2 x i32> [[TMP125]], [[TMP130]]
-; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP128]], i32 1
-; CHECK-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1
-; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP133]], [[TMP132]]
-; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP128]], [[TMP131]]
-; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP128]], i32 0
-; CHECK-NEXT: [[TMP136:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0
-; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP136]], [[TMP135]]
-; CHECK-NEXT: [[TMP137:%.*]] = lshr <2 x i32> [[TMP108]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP138:%.*]] = and <2 x i32> [[TMP137]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP139:%.*]] = mul <2 x i32> [[TMP138]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP134]], i32 0
-; CHECK-NEXT: [[TMP141:%.*]] = extractelement <2 x i32> [[TMP134]], i32 1
-; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP140]], [[TMP141]]
-; CHECK-NEXT: [[TMP142:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32>
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
+; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
+; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]]
+; CHECK-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
+; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
+; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
+; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]]
+; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
+; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
+; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
+; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
+; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
+; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
+; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]]
+; CHECK-NEXT: [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
+; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT: [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32>
+; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]]
+; CHECK-NEXT: [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]]
+; CHECK-NEXT: [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]]
+; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]]
+; CHECK-NEXT: [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]]
+; CHECK-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1
+; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1
+; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]]
+; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]]
+; CHECK-NEXT: [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0
+; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0
+; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]]
+; CHECK-NEXT: [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP137:%.*]] = and <2 x i32> [[TMP136]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
+; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
+; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]]
+; CHECK-NEXT: [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT: [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32>
; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT: [[TMP145:%.*]] = insertelement <2 x ptr> [[TMP144]], ptr [[ARRAYIDX22_1]], i32 1
-; CHECK-NEXT: [[TMP146:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP145]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32>
-; CHECK-NEXT: [[TMP148:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x ptr> [[TMP148]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP150:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP151:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP150]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP152:%.*]] = zext <2 x i8> [[TMP151]] to <2 x i32>
-; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP145]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32>
-; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP156]]
-; CHECK-NEXT: [[TMP158:%.*]] = shl <2 x i32> [[TMP157]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
-; CHECK-NEXT: [[TMP162:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP163:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP162]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32>
-; CHECK-NEXT: [[TMP165:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP166:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP165]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP167:%.*]] = zext <2 x i8> [[TMP166]] to <2 x i32>
-; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP167]]
-; CHECK-NEXT: [[TMP169:%.*]] = shl <2 x i32> [[TMP168]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP170]], [[TMP161]]
-; CHECK-NEXT: [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]]
-; CHECK-NEXT: [[TMP173:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP174:%.*]] = sub <2 x i32> [[TMP173]], [[TMP147]]
-; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP158]], [[TMP174]]
-; CHECK-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP172]], [[TMP175]]
-; CHECK-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP175]], [[TMP172]]
-; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
-; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
-; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP178]], [[TMP179]]
-; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP180]], [[TMP181]]
-; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP177]], i32 0
-; CHECK-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP177]], i32 1
-; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP183]], [[TMP184]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP179]], 15
+; CHECK-NEXT: [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
+; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1
+; CHECK-NEXT: [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
+; CHECK-NEXT: [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
+; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32>
+; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
+; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]]
+; CHECK-NEXT: [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT: [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
+; CHECK-NEXT: [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 x i32>
+; CHECK-NEXT: [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
+; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
+; CHECK-NEXT: [[TMP168:%.*]] = shl <2 x i32> [[TMP167]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
+; CHECK-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
+; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
+; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
+; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
+; CHECK-NEXT: [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
+; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
+; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
+; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
+; CHECK-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
+; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
+; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP143]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP186:%.*]] = and <2 x i32> [[TMP185]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP187:%.*]] = mul <2 x i32> [[TMP186]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP188:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP188]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
-; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP190]], [[TMP191]]
-; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP193:%.*]] = lshr <2 x i32> [[TMP192]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP194:%.*]] = and <2 x i32> [[TMP193]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP194]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP196]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP198:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP198]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP200:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP200]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP203:%.*]] = lshr <2 x i32> [[TMP201]], [[TMP202]]
-; CHECK-NEXT: [[TMP204:%.*]] = sub <2 x i32> [[TMP201]], [[TMP202]]
-; CHECK-NEXT: [[TMP205:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> [[TMP204]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP206]]
-; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
-; CHECK-NEXT: [[TMP208:%.*]] = and <2 x i32> [[TMP205]], [[TMP207]]
-; CHECK-NEXT: [[TMP209:%.*]] = sub <2 x i32> [[TMP205]], [[TMP207]]
-; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP211:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP213:%.*]] = add <2 x i32> [[TMP212]], [[TMP199]]
-; CHECK-NEXT: [[TMP214:%.*]] = sub <2 x i32> [[TMP212]], [[TMP199]]
-; CHECK-NEXT: [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP216:%.*]] = insertelement <2 x i32> [[TMP134]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP217:%.*]] = lshr <2 x i32> [[TMP216]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP218:%.*]] = and <2 x i32> [[TMP217]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP219:%.*]] = mul <2 x i32> [[TMP218]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP220]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP223:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
-; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP224]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP226:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT: [[TMP227:%.*]] = add <2 x i32> [[TMP225]], [[TMP226]]
-; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP230:%.*]] = add <2 x i32> [[TMP228]], [[TMP229]]
-; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP227]], i32 0
-; CHECK-NEXT: [[TMP232:%.*]] = extractelement <2 x i32> [[TMP230]], i32 0
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP232]], [[TMP231]]
-; CHECK-NEXT: [[TMP233:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP234:%.*]] = lshr <2 x i32> [[TMP233]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP235:%.*]] = and <2 x i32> [[TMP234]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP236:%.*]] = mul <2 x i32> [[TMP235]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP227]], i32 1
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP230]], i32 1
-; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP238]], [[TMP237]]
-; CHECK-NEXT: [[TMP239:%.*]] = sub <2 x i32> [[TMP227]], [[TMP230]]
+; CHECK-NEXT: [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
+; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
+; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP192]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP194:%.*]] = mul <2 x i32> [[TMP193]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
+; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
+; CHECK-NEXT: [[TMP206:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
+; CHECK-NEXT: [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
+; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
+; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
+; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
+; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP217:%.*]] = and <2 x i32> [[TMP216]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP218:%.*]] = mul <2 x i32> [[TMP217]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
+; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
+; CHECK-NEXT: [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
+; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
+; CHECK-NEXT: [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
+; CHECK-NEXT: [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP234:%.*]] = and <2 x i32> [[TMP233]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP235:%.*]] = mul <2 x i32> [[TMP234]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
+; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
+; CHECK-NEXT: [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT: [[TMP240:%.*]] = extractelement <2 x i32> [[TMP239]], i32 1
-; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP240]]
+; CHECK-NEXT: [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
+; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP93]]
+; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP94]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP179]]
+; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT: [[TMP241:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP242:%.*]] = insertelement <2 x i32> [[TMP241]], i32 [[SUB102]], i32 1
-; CHECK-NEXT: [[TMP243:%.*]] = add <2 x i32> [[TMP239]], [[TMP242]]
-; CHECK-NEXT: [[TMP244:%.*]] = sub <2 x i32> [[TMP239]], [[TMP242]]
-; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP243]], <2 x i32> [[TMP244]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP236]], [[TMP245]]
-; CHECK-NEXT: [[TMP247:%.*]] = xor <2 x i32> [[TMP246]], [[TMP233]]
-; CHECK-NEXT: [[TMP248:%.*]] = extractelement <2 x i32> [[TMP247]], i32 1
-; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP248]]
-; CHECK-NEXT: [[TMP249:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP249]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP251:%.*]] = add <2 x i32> [[TMP197]], [[TMP250]]
-; CHECK-NEXT: [[TMP252:%.*]] = sub <2 x i32> [[TMP197]], [[TMP250]]
-; CHECK-NEXT: [[TMP253:%.*]] = shufflevector <2 x i32> [[TMP251]], <2 x i32> [[TMP252]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP254:%.*]] = add <2 x i32> [[TMP195]], [[TMP253]]
-; CHECK-NEXT: [[TMP255:%.*]] = xor <2 x i32> [[TMP254]], [[TMP192]]
-; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP247]], i32 0
-; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP256]], [[ADD113]]
-; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP255]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP257]]
-; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x i32> [[TMP255]], i32 1
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP258]]
-; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP260:%.*]] = shufflevector <2 x i32> [[TMP259]], <2 x i32> [[TMP239]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP223]], [[TMP260]]
-; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP223]], [[TMP260]]
-; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP264:%.*]] = add <2 x i32> [[TMP219]], [[TMP263]]
-; CHECK-NEXT: [[TMP265:%.*]] = xor <2 x i32> [[TMP264]], [[TMP216]]
-; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x i32> [[TMP265]], i32 1
-; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP266]]
-; CHECK-NEXT: [[TMP267:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP223]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP268:%.*]] = mul <2 x i32> [[TMP210]], [[TMP267]]
-; CHECK-NEXT: [[TMP269:%.*]] = sub <2 x i32> [[TMP210]], [[TMP267]]
-; CHECK-NEXT: [[TMP270:%.*]] = shufflevector <2 x i32> [[TMP268]], <2 x i32> [[TMP269]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP271:%.*]] = add <2 x i32> [[TMP187]], [[TMP215]]
-; CHECK-NEXT: [[TMP272:%.*]] = xor <2 x i32> [[TMP271]], [[TMP143]]
-; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0
-; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1
-; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP273]], [[TMP274]]
+; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
+; CHECK-NEXT: [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
+; CHECK-NEXT: [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
+; CHECK-NEXT: [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
+; CHECK-NEXT: [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
+; CHECK-NEXT: [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
+; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
+; CHECK-NEXT: [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT: [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
+; CHECK-NEXT: [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
+; CHECK-NEXT: [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
+; CHECK-NEXT: [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
+; CHECK-NEXT: [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
+; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
+; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
+; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
+; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
+; CHECK-NEXT: [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
+; CHECK-NEXT: [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
+; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
+; CHECK-NEXT: [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
+; CHECK-NEXT: [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
+; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
+; CHECK-NEXT: [[TMP266:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP222]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
+; CHECK-NEXT: [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
+; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
+; CHECK-NEXT: [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
+; CHECK-NEXT: [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
+; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
+; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP265]], i32 0
-; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP275]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP272]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP276]]
-; CHECK-NEXT: [[TMP277:%.*]] = extractelement <2 x i32> [[TMP272]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP277]]
+; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
+; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
+; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
+; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP278:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT: [[TMP279:%.*]] = shufflevector <2 x i32> [[TMP278]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP280:%.*]] = add <2 x i32> [[TMP279]], [[TMP189]]
-; CHECK-NEXT: [[TMP281:%.*]] = sub <2 x i32> [[TMP279]], [[TMP189]]
-; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x i32> [[TMP280]], <2 x i32> [[TMP281]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP283:%.*]] = add <2 x i32> [[TMP105]], [[TMP282]]
-; CHECK-NEXT: [[TMP284:%.*]] = sub <2 x i32> [[TMP282]], [[TMP105]]
-; CHECK-NEXT: [[TMP285:%.*]] = add <2 x i32> [[TMP139]], [[TMP283]]
-; CHECK-NEXT: [[TMP286:%.*]] = xor <2 x i32> [[TMP285]], [[TMP108]]
-; CHECK-NEXT: [[TMP287:%.*]] = lshr <2 x i32> [[TMP98]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP288:%.*]] = and <2 x i32> [[TMP287]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP289:%.*]] = mul <2 x i32> [[TMP288]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP290:%.*]] = add <2 x i32> [[TMP289]], [[TMP284]]
-; CHECK-NEXT: [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP98]]
-; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP286]], i32 1
-; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP292]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP286]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP293]]
-; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP294]]
-; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1
-; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP295]]
+; CHECK-NEXT: [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
+; CHECK-NEXT: [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
+; CHECK-NEXT: [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
+; CHECK-NEXT: [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
+; CHECK-NEXT: [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
+; CHECK-NEXT: [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
+; CHECK-NEXT: [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP287:%.*]] = and <2 x i32> [[TMP286]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP288:%.*]] = mul <2 x i32> [[TMP287]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
+; CHECK-NEXT: [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
+; CHECK-NEXT: [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
+; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
+; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
+; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
+; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
+; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index a4cc311d12a2..4b0b41970bbb 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -5,61 +5,12 @@ define void @test(ptr %p, ptr noalias %s) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -262,67 +213,40 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -397,27 +321,12 @@ define void @test3(ptr %p, ptr noalias %s) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]]
-; CHECK-NEXT: store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index 5aba9ea115a4..ec152c707eec 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -8,7 +8,7 @@ define i16 @test() {
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[PPREV_058_I]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[PPREV_058_I]], i32 1
; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
; CHECK: while.body.i:
; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
@@ -17,7 +17,7 @@ define i16 @test() {
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
; CHECK-NEXT: br label [[WHILE_BODY_I]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
index 8f2c72bb4c68..8ab57cc73e64 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
@@ -5,14 +5,11 @@ define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: define i32 @sum_of_abs
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-NEXT: ret i32 [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 64, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
;
entry:
%0 = load i8, ptr %a, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reorder-same-node.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reorder-same-node.ll
new file mode 100644
index 000000000000..b4c8a1c58769
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reorder-same-node.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> zeroinitializer)
+; CHECK-NEXT: store i64 [[TMP1]], ptr null, align 8
+; CHECK-NEXT: ret void
+;
+ %1 = zext i8 0 to i32
+ %2 = lshr i32 0, %1
+ %3 = icmp ult i32 %2, 0
+ %4 = shl i32 0, %1
+ %5 = and i32 %4, 0
+ %narrow = select i1 %3, i32 0, i32 %5
+ %6 = zext i32 %narrow to i64
+ %7 = zext i8 0 to i32
+ %8 = lshr i32 0, %7
+ %9 = icmp ult i32 %8, 0
+ %10 = shl i32 0, %7
+ %11 = and i32 %10, 0
+ %narrow.1 = select i1 %9, i32 0, i32 %11
+ %12 = zext i32 %narrow.1 to i64
+ %13 = xor i64 %6, %12
+ %14 = zext i8 0 to i32
+ %15 = lshr i32 0, %14
+ %16 = icmp ult i32 %15, 0
+ %17 = shl i32 0, %14
+ %18 = and i32 %17, 0
+ %narrow.2 = select i1 %16, i32 0, i32 %18
+ %19 = zext i32 %narrow.2 to i64
+ %20 = xor i64 %13, %19
+ %21 = icmp ult i32 %8, 0
+ %22 = shl i32 0, %7
+ %23 = and i32 %22, 0
+ %narrow.3 = select i1 %21, i32 0, i32 %23
+ %24 = zext i32 %narrow.3 to i64
+ %25 = xor i64 %20, %24
+ %26 = icmp ult i32 %15, 0
+ %27 = shl i32 0, %14
+ %28 = and i32 %27, 0
+ %narrow.4 = select i1 %26, i32 0, i32 %28
+ %29 = zext i32 %narrow.4 to i64
+ %30 = xor i64 %25, %29
+ %31 = icmp ult i32 %8, 0
+ %32 = shl i32 0, %7
+ %33 = and i32 %32, 0
+ %narrow.5 = select i1 %31, i32 0, i32 %33
+ %34 = zext i32 %narrow.5 to i64
+ %35 = xor i64 %30, %34
+ %36 = icmp ult i32 %15, 0
+ %37 = shl i32 0, %14
+ %38 = and i32 %37, 0
+ %narrow.6 = select i1 %36, i32 0, i32 %38
+ %39 = zext i32 %narrow.6 to i64
+ %40 = xor i64 %35, %39
+ %41 = icmp ult i32 %8, 0
+ %42 = shl i32 0, %7
+ %43 = and i32 %42, 0
+ %narrow.7 = select i1 %41, i32 0, i32 %43
+ %44 = zext i32 %narrow.7 to i64
+ %45 = xor i64 %40, %44
+ store i64 %45, ptr null, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
index 96d4c307f1c6..9e43cefef280 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -30,7 +30,7 @@ define void @test() {
; CHECK-SLP-THRESHOLD: bb:
; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> <i64 12, i64 8, i64 4, i64 0>
; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
; CHECK-SLP-THRESHOLD-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 1add732d32e8..3bc6e64606e3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
new file mode 100644
index 000000000000..6560fc6a1452
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -0,0 +1,317 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+
+define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
+; CHECK-LABEL: @v3_load_i32_mul_by_constant_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
+; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
+; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
+; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
+ %l.src.0 = load i32, ptr %gep.src.0, align 4
+ %mul.0 = mul nsw i32 %l.src.0, 10
+
+ %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
+ %l.src.1 = load i32, ptr %gep.src.1, align 4
+ %mul.1 = mul nsw i32 %l.src.1, 10
+
+ %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
+ %l.src.2 = load i32, ptr %gep.src.2, align 4
+ %mul.2 = mul nsw i32 %l.src.2, 10
+
+ store i32 %mul.0, ptr %dst
+
+ %dst.1 = getelementptr i32, ptr %dst, i32 1
+ store i32 %mul.1, ptr %dst.1
+
+ %dst.2 = getelementptr i32, ptr %dst, i32 2
+ store i32 %mul.2, ptr %dst.2
+
+ ret void
+}
+
+define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
+; CHECK-LABEL: @v3_load_i32_mul_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
+; CHECK-NEXT: [[L_SRC_1_0:%.*]] = load i32, ptr [[GEP_SRC_1_0]], align 4
+; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
+; CHECK-NEXT: [[L_SRC_2_0:%.*]] = load i32, ptr [[GEP_SRC_2_0]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_1_0]], [[L_SRC_2_0]]
+; CHECK-NEXT: [[GEP_SRC_1_1:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 1
+; CHECK-NEXT: [[L_SRC_1_1:%.*]] = load i32, ptr [[GEP_SRC_1_1]], align 4
+; CHECK-NEXT: [[GEP_SRC_2_1:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 1
+; CHECK-NEXT: [[L_SRC_2_1:%.*]] = load i32, ptr [[GEP_SRC_2_1]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1_1]], [[L_SRC_2_1]]
+; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
+; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
+; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
+; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
+; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
+ %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
+ %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
+ %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
+ %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
+
+ %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
+ %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
+ %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
+ %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
+ %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
+
+ %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
+ %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
+ %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
+ %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
+ %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
+
+ store i32 %mul.0, ptr %dst
+
+ %dst.1 = getelementptr i32, ptr %dst, i32 1
+ store i32 %mul.1, ptr %dst.1
+
+ %dst.2 = getelementptr i32, ptr %dst, i32 2
+ store i32 %mul.2, ptr %dst.2
+
+ ret void
+}
+
+define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
+; CHECK-LABEL: @v3_load_i32_mul_add_const_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
+; CHECK-NEXT: [[L_SRC_1_0:%.*]] = load i32, ptr [[GEP_SRC_1_0]], align 4
+; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
+; CHECK-NEXT: [[L_SRC_2_0:%.*]] = load i32, ptr [[GEP_SRC_2_0]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_1_0]], [[L_SRC_2_0]]
+; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], 9
+; CHECK-NEXT: [[GEP_SRC_1_1:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 1
+; CHECK-NEXT: [[L_SRC_1_1:%.*]] = load i32, ptr [[GEP_SRC_1_1]], align 4
+; CHECK-NEXT: [[GEP_SRC_2_1:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 1
+; CHECK-NEXT: [[L_SRC_2_1:%.*]] = load i32, ptr [[GEP_SRC_2_1]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1_1]], [[L_SRC_2_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_1]], 9
+; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
+; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
+; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
+; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
+; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
+; CHECK-NEXT: store i32 [[ADD_0]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
+; CHECK-NEXT: store i32 [[ADD_1]], ptr [[DST_1]], align 4
+; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; CHECK-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
+ %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
+ %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
+ %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
+ %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
+ %add.0 = add i32 %mul.0, 9
+
+ %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
+ %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
+ %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
+ %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
+ %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
+ %add.1 = add i32 %mul.1, 9
+
+ %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
+ %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
+ %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
+ %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
+ %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
+ %add.2 = add i32 %mul.2, 9
+
+ store i32 %add.0, ptr %dst
+
+ %dst.1 = getelementptr i32, ptr %dst, i32 1
+ store i32 %add.1, ptr %dst.1
+
+ %dst.2 = getelementptr i32, ptr %dst, i32 2
+ store i32 %add.2, ptr %dst.2
+
+ ret void
+}
+
+define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
+; CHECK-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
+; CHECK-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
+; CHECK-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
+ %l.src.0 = load float , ptr %gep.src.0, align 4
+ %fadd.0 = fadd float %l.src.0, 10.0
+
+ %gep.src.1 = getelementptr inbounds float , ptr %src, i32 1
+ %l.src.1 = load float, ptr %gep.src.1, align 4
+ %fadd.1 = fadd float %l.src.1, 10.0
+
+ %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
+ %l.src.2 = load float, ptr %gep.src.2, align 4
+ %fadd.2 = fadd float %l.src.2, 10.0
+
+ store float %fadd.0, ptr %dst
+
+ %dst.1 = getelementptr float, ptr %dst, i32 1
+ store float %fadd.1, ptr %dst.1
+
+ %dst.2 = getelementptr float, ptr %dst, i32 2
+ store float %fadd.2, ptr %dst.2
+
+ ret void
+}
+
+define void @phi_store3(ptr %dst) {
+; CHECK-LABEL: @phi_store3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: invoke.cont8.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
+; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
+; CHECK-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
+; CHECK-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %exit
+
+invoke.cont8.loopexit: ; No predecessors!
+ br label %exit
+
+exit:
+ %p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
+ %p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
+ %p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]
+
+ %dst.1 = getelementptr i32, ptr %dst, i32 1
+ %dst.2 = getelementptr i32, ptr %dst, i32 2
+
+ store i32 %p.0, ptr %dst, align 4
+ store i32 %p.1, ptr %dst.1, align 4
+ store i32 %p.2, ptr %dst.2, align 4
+ ret void
+}
+
+define void @store_try_reorder(ptr %dst) {
+; CHECK-LABEL: @store_try_reorder(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = add i32 0, 0
+; CHECK-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %add = add i32 0, 0
+ store i32 %add, ptr %dst, align 4
+ %add207 = sub i32 0, 0
+ %arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
+ store i32 %add207, ptr %arrayidx.i1887, align 4
+ %add216 = sub i32 0, 0
+ %arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
+ store i32 %add216, ptr %arrayidx.i1891, align 4
+ ret void
+}
+
+define void @vec3_fpext_cost(ptr %Colour, float %0) {
+; CHECK-LABEL: @vec3_fpext_cost(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
+; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
+; CHECK-NEXT: [[CONV78:%.*]] = fpext float [[TMP0]] to double
+; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
+; CHECK-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx72 = getelementptr float, ptr %Colour, i64 1
+ %arrayidx80 = getelementptr float, ptr %Colour, i64 2
+ %conv62 = fpext float %0 to double
+ %1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
+ %conv66 = fptrunc double %1 to float
+ store float %conv66, ptr %Colour, align 4
+ %conv70 = fpext float %0 to double
+ %2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
+ %conv74 = fptrunc double %2 to float
+ store float %conv74, ptr %arrayidx72, align 4
+ %conv78 = fpext float %0 to double
+ %3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
+ %conv82 = fptrunc double %3 to float
+ store float %conv82, ptr %arrayidx80, align 4
+ ret void
+}
+
+define void @fpext_gather(ptr %dst, double %conv) {
+; CHECK-LABEL: @fpext_gather(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT: store float [[TMP3]], ptr [[LENGTHS]], align 4
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %conv25 = fptrunc double %conv to float
+ %Lengths = getelementptr float, ptr %dst, i64 0
+ store float %conv25, ptr %Lengths, align 4
+ %arrayidx32 = getelementptr float, ptr %dst, i64 1
+ store float %conv25, ptr %arrayidx32, align 4
+ %conv34 = fptrunc double %conv to float
+ %arrayidx37 = getelementptr float, ptr %dst, i64 2
+ store float %conv34, ptr %arrayidx37, align 4
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
new file mode 100644
index 000000000000..71b9315839ec
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+
+define void @vec3_vectorize_call(ptr %Colour, float %0) {
+; CHECK-LABEL: @vec3_vectorize_call(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4
+; CHECK-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %1 = load float, ptr %Colour, align 4
+ %2 = call float @llvm.fmuladd.f32(float %1, float 0.000000e+00, float 0.000000e+00)
+ store float %2, ptr %Colour, align 4
+ %arrayidx91.i = getelementptr float, ptr %Colour, i64 1
+ %3 = load float, ptr %arrayidx91.i, align 4
+ %4 = call float @llvm.fmuladd.f32(float %3, float 0.000000e+00, float 0.000000e+00)
+ store float %4, ptr %arrayidx91.i, align 4
+ %arrayidx99.i1 = getelementptr float, ptr %Colour, i64 2
+ %5 = call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float 0.000000e+00)
+ store float %5, ptr %arrayidx99.i1, align 4
+ ret void
+}
+
+define void @vec3_fmuladd_64(ptr %Colour, double %0) {
+; CHECK-LABEL: @vec3_fmuladd_64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
+; CHECK-NEXT: [[TMP4:%.*]] = fptrunc <2 x double> [[TMP3]] to <2 x float>
+; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[COLOUR]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.fmuladd.f64(double [[TMP0]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP5]] to float
+; CHECK-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx72 = getelementptr float, ptr %Colour, i64 1
+ %arrayidx80 = getelementptr float, ptr %Colour, i64 2
+ %1 = call double @llvm.fmuladd.f64(double %0, double 0.000000e+00, double 0.000000e+00)
+ %conv66 = fptrunc double %1 to float
+ store float %conv66, ptr %Colour, align 4
+ %2 = call double @llvm.fmuladd.f64(double %0, double 0.000000e+00, double 0.000000e+00)
+ %conv74 = fptrunc double %2 to float
+ store float %conv74, ptr %arrayidx72, align 4
+ %3 = call double @llvm.fmuladd.f64(double %0, double 0.000000e+00, double 0.000000e+00)
+ %conv82 = fptrunc double %3 to float
+ store float %conv82, ptr %arrayidx80, align 4
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll
new file mode 100644
index 000000000000..1411f9416f69
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @test_insert_loads(ptr %A, ptr noalias %B, float %0) #0 {
+; CHECK-LABEL: define void @test_insert_loads(
+; CHECK-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MULADD_0:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 1.000000e+00, float 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> <float 3.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 2.000000e+00>)
+; CHECK-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28
+; CHECK-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4
+; CHECK-NEXT: [[A_12:%.*]] = getelementptr i8, ptr [[A]], i64 12
+; CHECK-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4
+; CHECK-NEXT: [[GEP_28:%.*]] = getelementptr i8, ptr [[B]], i64 28
+; CHECK-NEXT: [[GEP_20:%.*]] = getelementptr i8, ptr [[B]], i64 20
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> <float poison, float poison, float poison, float 4.000000e+00>, float [[L_A_12]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[L_A_28]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 4.000000e+00>, float [[L_B_0]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x float> [[TMP9]])
+; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[GEP_4]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[GEP_20]], align 4
+; CHECK-NEXT: store float [[MULADD_0]], ptr [[GEP_28]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %muladd.0 = tail call float @llvm.fmuladd.f32(float %0, float 1.000000e+00, float 1.000000e+00)
+ %muladd.1 = tail call float @llvm.fmuladd.f32(float %0, float 2.000000e+00, float 2.000000e+00)
+ %muladd.2 = tail call float @llvm.fmuladd.f32(float %0, float 3.000000e+00, float 3.000000e+00)
+ %muladd.3 = tail call float @llvm.fmuladd.f32(float %0, float 4.000000e+00, float 4.000000e+00)
+ %A.28 = getelementptr i8, ptr %A, i64 28
+ %l.A.28 = load float, ptr %A.28, align 4
+ %muladd.4 = tail call float @llvm.fmuladd.f32(float %0, float %l.A.28, float 0.000000e+00)
+ %muladd.5 = tail call float @llvm.fmuladd.f32(float %0, float %l.A.28, float 0.000000e+00)
+ %A.12 = getelementptr i8, ptr %A, i64 12
+ %l.A.12 = load float, ptr %A.12, align 4
+ %gep.4 = getelementptr i8, ptr %B, i64 4
+ %gep.12 = getelementptr i8, ptr %B, i64 12
+ %l.B.0 = load float, ptr %B, align 4
+ %muladd.6 = tail call float @llvm.fmuladd.f32(float %0, float %l.A.12, float %l.B.0)
+ %gep.28 = getelementptr i8, ptr %B, i64 28
+ %gep.24 = getelementptr i8, ptr %B, i64 24
+ %gep.20 = getelementptr i8, ptr %B, i64 20
+ %gep.16 = getelementptr i8, ptr %B, i64 16
+ %gep.8 = getelementptr i8, ptr %B, i64 8
+ store float %muladd.6, ptr %gep.4, align 4
+ store float %muladd.5, ptr %gep.8, align 8
+ store float %muladd.4, ptr %gep.12, align 4
+ store float %muladd.3, ptr %gep.16, align 16
+ store float %muladd.2, ptr %gep.20, align 4
+ store float %muladd.1, ptr %gep.24, align 8
+ store float %muladd.0, ptr %gep.28, align 4
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+attributes #0 = { "target-cpu"="skylake-avx512" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
new file mode 100644
index 000000000000..9584a663b2d4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -0,0 +1,513 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+
+%struct.zot = type { i32, i32, i32 }
+
+define i1 @reorder_results(ptr %arg, i1 %arg1, ptr %arg2, i64 %arg3, ptr %arg4) {
+; CHECK-LABEL: define i1 @reorder_results(
+; CHECK-SAME: ptr [[ARG:%.*]], i1 [[ARG1:%.*]], ptr [[ARG2:%.*]], i64 [[ARG3:%.*]], ptr [[ARG4:%.*]]) {
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG4]], align 8
+; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr [[LOAD]], align 4
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr [[LOAD]], i64 4
+; CHECK-NEXT: [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT: [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 4
+; CHECK-NEXT: br i1 [[ARG1]], label [[BB12:%.*]], label [[BB9:%.*]]
+; CHECK: bb8:
+; CHECK-NEXT: ret i1 false
+; CHECK: bb9:
+; CHECK-NEXT: [[FREEZE:%.*]] = freeze ptr [[ARG]]
+; CHECK-NEXT: store i32 [[LOAD4]], ptr [[FREEZE]], align 4
+; CHECK-NEXT: [[GETELEMENTPTR10:%.*]] = getelementptr i8, ptr [[FREEZE]], i64 4
+; CHECK-NEXT: store i32 [[LOAD7]], ptr [[GETELEMENTPTR10]], align 4
+; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr i8, ptr [[FREEZE]], i64 8
+; CHECK-NEXT: store i32 [[LOAD5]], ptr [[GETELEMENTPTR11]], align 4
+; CHECK-NEXT: br label [[BB8:%.*]]
+; CHECK: bb12:
+; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr [[STRUCT_ZOT:%.*]], ptr [[ARG2]], i64 [[ARG3]]
+; CHECK-NEXT: store i32 [[LOAD4]], ptr [[GETELEMENTPTR13]], align 4
+; CHECK-NEXT: [[GETELEMENTPTR14:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR13]], i64 4
+; CHECK-NEXT: store i32 [[LOAD7]], ptr [[GETELEMENTPTR14]], align 4
+; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR13]], i64 8
+; CHECK-NEXT: store i32 [[LOAD5]], ptr [[GETELEMENTPTR15]], align 4
+; CHECK-NEXT: br label [[BB8]]
+;
+bb:
+ %load = load ptr, ptr %arg4, align 8
+ %load4 = load i32, ptr %load, align 4
+ %getelementptr = getelementptr i8, ptr %load, i64 4
+ %load5 = load i32, ptr %getelementptr, align 4
+ %getelementptr6 = getelementptr i8, ptr %load, i64 8
+ %load7 = load i32, ptr %getelementptr6, align 4
+ br i1 %arg1, label %bb12, label %bb9
+
+bb8: ; preds = %bb12, %bb9
+ ret i1 false
+
+bb9: ; preds = %bb
+ %freeze = freeze ptr %arg
+ store i32 %load4, ptr %freeze, align 4
+ %getelementptr10 = getelementptr i8, ptr %freeze, i64 4
+ store i32 %load7, ptr %getelementptr10, align 4
+ %getelementptr11 = getelementptr i8, ptr %freeze, i64 8
+ store i32 %load5, ptr %getelementptr11, align 4
+ br label %bb8
+
+bb12: ; preds = %bb
+ %getelementptr13 = getelementptr %struct.zot, ptr %arg2, i64 %arg3
+ store i32 %load4, ptr %getelementptr13, align 4
+ %getelementptr14 = getelementptr i8, ptr %getelementptr13, i64 4
+ store i32 %load7, ptr %getelementptr14, align 4
+ %getelementptr15 = getelementptr i8, ptr %getelementptr13, i64 8
+ store i32 %load5, ptr %getelementptr15, align 4
+ br label %bb8
+}
+
+define void @extract_mask(ptr %object, double %conv503, double %conv520) {
+; CHECK-LABEL: define void @extract_mask(
+; CHECK-SAME: ptr [[OBJECT:%.*]], double [[CONV503:%.*]], double [[CONV520:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OBJECT]], align 8
+; CHECK-NEXT: [[BBOX483:%.*]] = getelementptr float, ptr [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[BBOX483]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[CONV503]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <2 x double> [[TMP4]], <double 0.000000e+00, double -2.000000e+10>
+; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x double> [[TMP3]], <2 x double> <double 0.000000e+00, double -2.000000e+10>
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <2 x double> [[TMP7]] to <2 x float>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[MUL646:%.*]] = fmul float [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[CMP663:%.*]] = fcmp olt float [[MUL646]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP663]], label [[IF_THEN665:%.*]], label [[IF_END668:%.*]]
+; CHECK: if.then665:
+; CHECK-NEXT: [[ARRAYIDX656:%.*]] = getelementptr float, ptr [[OBJECT]], i64 10
+; CHECK-NEXT: [[BBOX651:%.*]] = getelementptr float, ptr [[OBJECT]]
+; CHECK-NEXT: [[CONV613:%.*]] = fptrunc double [[CONV503]] to float
+; CHECK-NEXT: store float [[CONV613]], ptr [[BBOX651]], align 8
+; CHECK-NEXT: [[BBOX_SROA_6_0_BBOX666_SROA_IDX:%.*]] = getelementptr float, ptr [[OBJECT]], i64 1
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP6]], double [[CONV520]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <2 x double> [[TMP11]] to <2 x float>
+; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[BBOX_SROA_6_0_BBOX666_SROA_IDX]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX656]], align 8
+; CHECK-NEXT: br label [[IF_END668]]
+; CHECK: if.end668:
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load ptr, ptr %object, align 8
+ %bbox483 = getelementptr float, ptr %0
+ %1 = load float, ptr %bbox483, align 8
+ %conv486 = fpext float %1 to double
+ %cmp487 = fcmp ogt double %conv486, -2.000000e+10
+ %conv486.2 = select i1 %cmp487, double %conv486, double -2.000000e+10
+ %arrayidx502 = getelementptr float, ptr %0, i64 1
+ %2 = load float, ptr %arrayidx502, align 4
+ %conv5033 = fpext float %2 to double
+ %cmp504 = fcmp ogt double %conv503, 0.000000e+00
+ %cond514 = select i1 %cmp504, double %conv5033, double 0.000000e+00
+ %sub626 = fsub double 0.000000e+00, %conv486.2
+ %conv627 = fptrunc double %sub626 to float
+ %sub632 = fsub double 0.000000e+00, %cond514
+ %conv633 = fptrunc double %sub632 to float
+ %mul646 = fmul float %conv633, %conv627
+ %cmp663 = fcmp olt float %mul646, 0.000000e+00
+ br i1 %cmp663, label %if.then665, label %if.end668
+
+if.then665: ; preds = %entry
+ %arrayidx656 = getelementptr float, ptr %object, i64 10
+ %lengths652 = getelementptr float, ptr %object, i64 11
+ %bbox651 = getelementptr float, ptr %object
+ %conv621 = fptrunc double %conv520 to float
+ %conv617 = fptrunc double %cond514 to float
+ %conv613 = fptrunc double %conv503 to float
+ store float %conv613, ptr %bbox651, align 8
+ %bbox.sroa.6.0.bbox666.sroa_idx = getelementptr float, ptr %object, i64 1
+ store float %conv617, ptr %bbox.sroa.6.0.bbox666.sroa_idx, align 4
+ %bbox.sroa.8.0.bbox666.sroa_idx = getelementptr float, ptr %object, i64 2
+ store float %conv621, ptr %bbox.sroa.8.0.bbox666.sroa_idx, align 8
+ store float %conv627, ptr %lengths652, align 4
+ store float %conv633, ptr %arrayidx656, align 8
+ br label %if.end668
+
+if.end668: ; preds = %if.then665, %entry
+ ret void
+}
+
+define void @gather_2(ptr %mat1, float %0, float %1) {
+; CHECK-LABEL: define void @gather_2(
+; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
+; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
+; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
+; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %2 = call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float 0.000000e+00)
+ %3 = call float @llvm.fmuladd.f32(float %1, float %0, float 0.000000e+00)
+ %4 = call float @llvm.fmuladd.f32(float %0, float %1, float 0.000000e+00)
+ %5 = fmul float %2, 0.000000e+00
+ %6 = fmul float %3, 0.000000e+00
+ %7 = fmul float %4, 0.000000e+00
+ %arrayidx163 = getelementptr [4 x [4 x float]], ptr %mat1, i64 0, i64 1
+ %arrayidx2.i.i.i278 = getelementptr [4 x [4 x float]], ptr %mat1, i64 0, i64 1, i64 1
+ %arrayidx5.i.i.i280 = getelementptr [4 x [4 x float]], ptr %mat1, i64 0, i64 1, i64 2
+ store float %5, ptr %arrayidx163, align 4
+ store float %6, ptr %arrayidx2.i.i.i278, align 4
+ store float %7, ptr %arrayidx5.i.i.i280, align 4
+ ret void
+}
+
+define i32 @reorder_indices_1(float %0) {
+; CHECK-LABEL: define i32 @reorder_indices_1(
+; CHECK-SAME: float [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; CHECK-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
+; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
+; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %nor1 = alloca [0 x [3 x float]], i32 0, align 4
+ %arrayidx.i = getelementptr float, ptr %nor1, i64 1
+ %1 = load float, ptr %arrayidx.i, align 4
+ %arrayidx2.i265 = getelementptr float, ptr %nor1, i64 2
+ %2 = load float, ptr %arrayidx2.i265, align 4
+ %3 = fneg float %2
+ %neg.i267 = fmul float %3, %0
+ %4 = call float @llvm.fmuladd.f32(float %1, float 0.000000e+00, float %neg.i267)
+ %5 = load float, ptr %nor1, align 4
+ %6 = fneg float %5
+ %neg11.i = fmul float %6, %0
+ %7 = call float @llvm.fmuladd.f32(float %2, float 0.000000e+00, float %neg11.i)
+ %8 = fneg float %1
+ %neg18.i = fmul float %8, %0
+ %9 = call float @llvm.fmuladd.f32(float %5, float 0.000000e+00, float %neg18.i)
+ %10 = call float @llvm.fmuladd.f32(float %0, float %9, float 0.000000e+00)
+ %11 = call float @llvm.fmuladd.f32(float %0, float %4, float 0.000000e+00)
+ %12 = call float @llvm.fmuladd.f32(float %0, float %7, float 0.000000e+00)
+ %mul.i.i.i = fmul float %10, 0.000000e+00
+ %mul3.i.i.i = fmul float %11, 0.000000e+00
+ %mul6.i.i.i = fmul float %12, 0.000000e+00
+ store float %mul.i.i.i, ptr %nor1, align 4
+ store float %mul3.i.i.i, ptr %arrayidx.i, align 4
+ store float %mul6.i.i.i, ptr %arrayidx2.i265, align 4
+ ret i32 0
+}
+
+define void @reorder_indices_2(ptr %spoint) {
+; CHECK-LABEL: define void @reorder_indices_2(
+; CHECK-SAME: ptr [[SPOINT:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
+; CHECK-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
+; CHECK-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
+; CHECK-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = extractelement <3 x float> zeroinitializer, i64 1
+ %1 = extractelement <3 x float> zeroinitializer, i64 2
+ %2 = extractelement <3 x float> zeroinitializer, i64 0
+ %3 = tail call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float 0.000000e+00)
+ %4 = tail call float @llvm.fmuladd.f32(float %1, float 0.000000e+00, float 0.000000e+00)
+ %5 = tail call float @llvm.fmuladd.f32(float %2, float 0.000000e+00, float 0.000000e+00)
+ %mul.i457 = fmul float %3, 0.000000e+00
+ %mul2.i459 = fmul float %4, 0.000000e+00
+ %mul4.i461 = fmul float %5, 0.000000e+00
+ %dsco = getelementptr float, ptr %spoint, i64 0
+ store float %mul.i457, ptr %dsco, align 4
+ %arrayidx3.i474 = getelementptr float, ptr %spoint, i64 1
+ store float %mul2.i459, ptr %arrayidx3.i474, align 4
+ %arrayidx5.i476 = getelementptr float, ptr %spoint, i64 2
+ store float %mul4.i461, ptr %arrayidx5.i476, align 4
+ ret void
+}
+
+define void @reorder_indices_2x_load(ptr %png_ptr, ptr %info_ptr) {
+; CHECK-LABEL: define void @reorder_indices_2x_load(
+; CHECK-SAME: ptr [[PNG_PTR:%.*]], ptr [[INFO_PTR:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BIT_DEPTH:%.*]] = getelementptr i8, ptr [[INFO_PTR]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[BIT_DEPTH]], align 4
+; CHECK-NEXT: [[COLOR_TYPE:%.*]] = getelementptr i8, ptr [[INFO_PTR]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[COLOR_TYPE]], align 1
+; CHECK-NEXT: [[BIT_DEPTH37_I:%.*]] = getelementptr i8, ptr [[PNG_PTR]], i64 11
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[BIT_DEPTH37_I]], align 1
+; CHECK-NEXT: [[COLOR_TYPE39_I:%.*]] = getelementptr i8, ptr [[PNG_PTR]], i64 10
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[COLOR_TYPE39_I]], align 2
+; CHECK-NEXT: [[USR_BIT_DEPTH_I:%.*]] = getelementptr i8, ptr [[PNG_PTR]], i64 12
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[USR_BIT_DEPTH_I]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %bit_depth = getelementptr i8, ptr %info_ptr, i64 0
+ %0 = load i8, ptr %bit_depth, align 4
+ %color_type = getelementptr i8, ptr %info_ptr, i64 1
+ %1 = load i8, ptr %color_type, align 1
+ %bit_depth37.i = getelementptr i8, ptr %png_ptr, i64 11
+ store i8 %0, ptr %bit_depth37.i, align 1
+ %color_type39.i = getelementptr i8, ptr %png_ptr, i64 10
+ store i8 %1, ptr %color_type39.i, align 2
+ %usr_bit_depth.i = getelementptr i8, ptr %png_ptr, i64 12
+ store i8 %0, ptr %usr_bit_depth.i, align 8
+ ret void
+}
+
+define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) {
+; CHECK-LABEL: define void @reuse_shuffle_indidces_1(
+; CHECK-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
+; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
+; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
+; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %mul24 = fmul float %1, 0.000000e+00
+ %2 = fadd float %mul24, 0.000000e+00
+ store float %2, ptr %col, align 4
+ %arrayidx26 = getelementptr float, ptr %col, i64 1
+ %mul31 = fmul float %0, 0.000000e+00
+ %3 = fadd float %mul31, 0.000000e+00
+ store float %3, ptr %arrayidx26, align 4
+ %arrayidx33 = getelementptr float, ptr %col, i64 2
+ %mul38 = fmul float %0, 0.000000e+00
+ %4 = fadd float %mul38, 0.000000e+00
+ store float %4, ptr %arrayidx33, align 4
+ ret void
+}
+
+define void @reuse_shuffle_indices_2(ptr %inertia, double %0) {
+; CHECK-LABEL: define void @reuse_shuffle_indices_2(
+; CHECK-SAME: ptr [[INERTIA:%.*]], double [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP6]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: store <3 x float> [[TMP9]], ptr [[INERTIA]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %1 = insertelement <2 x double> poison, double %0, i32 0
+ %2 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> zeroinitializer
+ %3 = fptrunc <2 x double> %2 to <2 x float>
+ %4 = fmul <2 x float> %3, zeroinitializer
+ %5 = shufflevector <2 x float> %4, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
+ %6 = fadd <4 x float> %5, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>
+ %7 = fmul <4 x float> %6, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>
+ %8 = fadd <4 x float> %7, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>
+ %9 = shufflevector <4 x float> %8, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+ store <3 x float> %9, ptr %inertia, align 4
+ ret void
+}
+
+define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
+; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
+; CHECK-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[BEZT]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
+; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX8_I831]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %fneg = fmul float %0, 0.000000e+00
+ %1 = tail call float @llvm.fmuladd.f32(float %0, float %fneg, float 0.000000e+00)
+ store float %1, ptr %bezt, align 4
+ %2 = tail call float @llvm.fmuladd.f32(float %0, float %fneg, float 0.000000e+00)
+ %arrayidx5.i = getelementptr float, ptr %bezt, i64 1
+ store float %2, ptr %arrayidx5.i, align 4
+ %3 = tail call float @llvm.fmuladd.f32(float %fneg, float 0.000000e+00, float 0.000000e+00)
+ %arrayidx8.i831 = getelementptr float, ptr %bezt, i64 2
+ store float %3, ptr %arrayidx8.i831, align 4
+ ret void
+}
+
+define void @reuse_shuffle_indices_cost_crash_3(ptr %m, double %conv, double %conv2) {
+; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_3(
+; CHECK-SAME: ptr [[M:%.*]], double [[CONV:%.*]], double [[CONV2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SUB19:%.*]] = fsub double 0.000000e+00, [[CONV2]]
+; CHECK-NEXT: [[CONV20:%.*]] = fptrunc double [[SUB19]] to float
+; CHECK-NEXT: store float [[CONV20]], ptr [[M]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 0.000000e+00
+; CHECK-NEXT: [[CONV239:%.*]] = fptrunc double [[ADD]] to float
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 1
+; CHECK-NEXT: store float [[CONV239]], ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT: [[ADD26:%.*]] = fsub double [[CONV]], [[CONV]]
+; CHECK-NEXT: [[CONV27:%.*]] = fptrunc double [[ADD26]] to float
+; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 2
+; CHECK-NEXT: store float [[CONV27]], ptr [[ARRAYIDX29]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %sub19 = fsub double 0.000000e+00, %conv2
+ %conv20 = fptrunc double %sub19 to float
+ store float %conv20, ptr %m, align 4
+ %add = fadd double %conv, 0.000000e+00
+ %conv239 = fptrunc double %add to float
+ %arrayidx25 = getelementptr [4 x float], ptr %m, i64 0, i64 1
+ store float %conv239, ptr %arrayidx25, align 4
+ %add26 = fsub double %conv, %conv
+ %conv27 = fptrunc double %add26 to float
+ %arrayidx29 = getelementptr [4 x float], ptr %m, i64 0, i64 2
+ store float %conv27, ptr %arrayidx29, align 4
+ ret void
+}
+
+define void @reuse_shuffle_indices_cost_crash_4(double %conv7.i) {
+; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_4(
+; CHECK-SAME: double [[CONV7_I:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DATA_I111:%.*]] = alloca [0 x [0 x [0 x [3 x float]]]], i32 0, align 4
+; CHECK-NEXT: [[ARRAYIDX_2_I:%.*]] = getelementptr [3 x float], ptr [[DATA_I111]], i64 0, i64 2
+; CHECK-NEXT: [[MUL17_I_US:%.*]] = fmul double [[CONV7_I]], 0.000000e+00
+; CHECK-NEXT: [[MUL_2_I_I_US:%.*]] = fmul double [[MUL17_I_US]], 0.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV7_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[ADD_2_I_I_US:%.*]] = fadd double [[MUL_2_I_I_US]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[DATA_I111]], align 4
+; CHECK-NEXT: [[CONV_2_I46_US:%.*]] = fptrunc double [[ADD_2_I_I_US]] to float
+; CHECK-NEXT: store float [[CONV_2_I46_US]], ptr [[ARRAYIDX_2_I]], align 4
+; CHECK-NEXT: [[CALL2_I_US:%.*]] = load volatile ptr, ptr [[DATA_I111]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %data.i111 = alloca [0 x [0 x [0 x [3 x float]]]], i32 0, align 4
+ %arrayidx.1.i = getelementptr [3 x float], ptr %data.i111, i64 0, i64 1
+ %arrayidx.2.i = getelementptr [3 x float], ptr %data.i111, i64 0, i64 2
+ %mul17.i.us = fmul double %conv7.i, 0.000000e+00
+ %mul.2.i.i.us = fmul double %mul17.i.us, 0.000000e+00
+ %add.i.i82.i.us = fadd double %conv7.i, 0.000000e+00
+ %add.1.i.i84.i.us = fadd double %conv7.i, 0.000000e+00
+ %mul.i.i91.i.us = fmul double %add.i.i82.i.us, %conv7.i
+ %mul.1.i.i92.i.us = fmul double %add.1.i.i84.i.us, %conv7.i
+ %add.i96.i.us = fadd double %mul.i.i91.i.us, 0.000000e+00
+ %add.1.i.i.us = fadd double %mul.1.i.i92.i.us, 0.000000e+00
+ %add.2.i.i.us = fadd double %mul.2.i.i.us, 0.000000e+00
+ %conv.i42.us = fptrunc double %add.i96.i.us to float
+ store float %conv.i42.us, ptr %data.i111, align 4
+ %conv.1.i44.us = fptrunc double %add.1.i.i.us to float
+ store float %conv.1.i44.us, ptr %arrayidx.1.i, align 4
+ %conv.2.i46.us = fptrunc double %add.2.i.i.us to float
+ store float %conv.2.i46.us, ptr %arrayidx.2.i, align 4
+ %call2.i.us = load volatile ptr, ptr %data.i111, align 8
+ ret void
+}
+
+define void @common_mask(ptr %m, double %conv, double %conv2) {
+; CHECK-LABEL: define void @common_mask(
+; CHECK-SAME: ptr [[M:%.*]], double [[CONV:%.*]], double [[CONV2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SUB19:%.*]] = fsub double [[CONV]], [[CONV]]
+; CHECK-NEXT: [[CONV20:%.*]] = fptrunc double [[SUB19]] to float
+; CHECK-NEXT: store float [[CONV20]], ptr [[M]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[CONV2]], 0.000000e+00
+; CHECK-NEXT: [[CONV239:%.*]] = fptrunc double [[ADD]] to float
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 1
+; CHECK-NEXT: store float [[CONV239]], ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT: [[ADD26:%.*]] = fsub double 0.000000e+00, [[CONV]]
+; CHECK-NEXT: [[CONV27:%.*]] = fptrunc double [[ADD26]] to float
+; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 2
+; CHECK-NEXT: store float [[CONV27]], ptr [[ARRAYIDX29]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %sub19 = fsub double %conv, %conv
+ %conv20 = fptrunc double %sub19 to float
+ store float %conv20, ptr %m, align 4
+ %add = fadd double %conv2, 0.000000e+00
+ %conv239 = fptrunc double %add to float
+ %arrayidx25 = getelementptr [4 x float], ptr %m, i64 0, i64 1
+ store float %conv239, ptr %arrayidx25, align 4
+ %add26 = fsub double 0.000000e+00, %conv
+ %conv27 = fptrunc double %add26 to float
+ %arrayidx29 = getelementptr [4 x float], ptr %m, i64 0, i64 2
+ store float %conv27, ptr %arrayidx29, align 4
+ ret void
+}
+
+define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
+; CHECK-LABEL: define void @vec3_extract(
+; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: store <2 x i16> [[TMP0]], ptr [[CALL3_I536]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
+ %red668 = getelementptr i16, ptr %call3.i536, i64 2
+ store i16 %pixel.sroa.0.4.vec.extract, ptr %red668, align 2
+ %pixel.sroa.0.2.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 1
+ %green670 = getelementptr i16, ptr %call3.i536, i64 1
+ store i16 %pixel.sroa.0.2.vec.extract, ptr %green670, align 2
+ %pixel.sroa.0.0.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 0
+ store i16 %pixel.sroa.0.0.vec.extract, ptr %call3.i536, align 2
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
diff --git a/llvm/test/Transforms/SROA/select-gep.ll b/llvm/test/Transforms/SROA/select-gep.ll
index 56924a0a771b..1342a2ca4ea2 100644
--- a/llvm/test/Transforms/SROA/select-gep.ll
+++ b/llvm/test/Transforms/SROA/select-gep.ll
@@ -155,14 +155,24 @@ bb:
ret i32 %load
}
-
+; Test gep of index select unfolding on an alloca that is splittable, but not
+; promotable. The allocas here will be optimized away by subsequent passes.
define i32 @test_select_idx_memcpy(i1 %c, ptr %p) {
; CHECK-LABEL: @test_select_idx_memcpy(
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOCA]], ptr [[P:%.*]], i64 160, i1 false)
+; CHECK-NEXT: [[ALLOCA_SROA_0:%.*]] = alloca [4 x i8], align 8
+; CHECK-NEXT: [[ALLOCA_SROA_2:%.*]] = alloca [20 x i8], align 4
+; CHECK-NEXT: [[ALLOCA_SROA_22:%.*]] = alloca [4 x i8], align 8
+; CHECK-NEXT: [[ALLOCA_SROA_3:%.*]] = alloca [132 x i8], align 4
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ALLOCA_SROA_0]], ptr align 1 [[P:%.*]], i64 4, i1 false)
+; CHECK-NEXT: [[ALLOCA_SROA_2_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ALLOCA_SROA_2]], ptr align 1 [[ALLOCA_SROA_2_0_P_SROA_IDX]], i64 20, i1 false)
+; CHECK-NEXT: [[ALLOCA_SROA_22_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ALLOCA_SROA_22]], ptr align 1 [[ALLOCA_SROA_22_0_P_SROA_IDX]], i64 4, i1 false)
+; CHECK-NEXT: [[ALLOCA_SROA_3_0_P_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ALLOCA_SROA_3]], ptr align 1 [[ALLOCA_SROA_3_0_P_SROA_IDX]], i64 132, i1 false)
; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]]
-; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT: [[IDX_SROA_SEL:%.*]] = select i1 [[C]], ptr [[ALLOCA_SROA_22]], ptr [[ALLOCA_SROA_0]]
+; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[IDX_SROA_SEL]], align 4
; CHECK-NEXT: ret i32 [[RES]]
;
%alloca = alloca [20 x i64], align 8
@@ -173,16 +183,13 @@ define i32 @test_select_idx_memcpy(i1 %c, ptr %p) {
ret i32 %res
}
+; Test gep of index select unfolding on an alloca that is splittable and
+; promotable.
define i32 @test_select_idx_mem2reg(i1 %c) {
; CHECK-LABEL: @test_select_idx_mem2reg(
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8
-; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24
-; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4
; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]]
-; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP2]], align 4
-; CHECK-NEXT: ret i32 [[RES]]
+; CHECK-NEXT: [[RES_SROA_SPECULATED:%.*]] = select i1 [[C]], i32 2, i32 1
+; CHECK-NEXT: ret i32 [[RES_SROA_SPECULATED]]
;
%alloca = alloca [20 x i64], align 8
store i32 1, ptr %alloca
@@ -194,6 +201,9 @@ define i32 @test_select_idx_mem2reg(i1 %c) {
ret i32 %res
}
+; Test gep of index select unfolding on an alloca that escaped, and as such
+; is not splittable or promotable.
+; FIXME: Ideally, no transform would take place in this case.
define i32 @test_select_idx_escaped(i1 %c, ptr %p) {
; CHECK-LABEL: @test_select_idx_escaped(
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8
@@ -202,8 +212,10 @@ define i32 @test_select_idx_escaped(i1 %c, ptr %p) {
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24
; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4
; CHECK-NEXT: [[IDX:%.*]] = select i1 [[C:%.*]], i64 24, i64 0
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX]]
-; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[DOTSROA_GEP:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24
+; CHECK-NEXT: [[DOTSROA_GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 0
+; CHECK-NEXT: [[IDX_SROA_SEL:%.*]] = select i1 [[C]], ptr [[DOTSROA_GEP]], ptr [[DOTSROA_GEP1]]
+; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[IDX_SROA_SEL]], align 4
; CHECK-NEXT: ret i32 [[RES]]
;
%alloca = alloca [20 x i64], align 8
@@ -217,6 +229,38 @@ define i32 @test_select_idx_escaped(i1 %c, ptr %p) {
ret i32 %res
}
+; FIXME: Should we allow recursive select unfolding if all the leaves are
+; constants?
+define i32 @test_select_idx_nested(i1 %c, i1 %c2) {
+; CHECK-LABEL: @test_select_idx_nested(
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8
+; CHECK-NEXT: store i32 1, ptr [[ALLOCA]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 8
+; CHECK-NEXT: store i32 2, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 24
+; CHECK-NEXT: store i32 3, ptr [[GEP2]], align 4
+; CHECK-NEXT: [[IDX1:%.*]] = select i1 [[C:%.*]], i64 24, i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = select i1 [[C2:%.*]], i64 [[IDX1]], i64 8
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[ALLOCA]], i64 [[IDX2]]
+; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[GEP3]], align 4
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %alloca = alloca [20 x i64], align 8
+ store i32 1, ptr %alloca
+ %gep1 = getelementptr inbounds i8, ptr %alloca, i64 8
+ store i32 2, ptr %gep1
+ %gep2 = getelementptr inbounds i8, ptr %alloca, i64 24
+ store i32 3, ptr %gep2
+ %idx1 = select i1 %c, i64 24, i64 0
+ %idx2 = select i1 %c2, i64 %idx1, i64 8
+ %gep3 = getelementptr inbounds i8, ptr %alloca, i64 %idx2
+ %res = load i32, ptr %gep3, align 4
+ ret i32 %res
+}
+
+; The following cases involve non-constant indices and should not be
+; transformed.
+
define i32 @test_select_idx_not_constant1(i1 %c, ptr %p, i64 %arg) {
; CHECK-LABEL: @test_select_idx_not_constant1(
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [20 x i64], align 8
diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll
new file mode 100644
index 000000000000..4910e0e07ae3
--- /dev/null
+++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p sroa -S %s | FileCheck %s
+
+define void @load_store_transfer_split_struct_tbaa_2_float(ptr dereferenceable(24) %res, float %a, float %b) {
+; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_float(
+; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[B]] to i32
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[RES]], align 4
+; CHECK-NEXT: [[RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 4
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES_SROA_IDX]], align 4
+; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[RES]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = alloca { float, float }, align 4
+ store float %a, ptr %tmp, align 4
+ %tmp.4 = getelementptr inbounds i8, ptr %tmp, i64 4
+ store float %b, ptr %tmp.4, align 4
+ %l1 = load i64, ptr %tmp, !tbaa.struct !0
+ store i64 %l1, ptr %res, !tbaa.struct !0
+ %p = load ptr, ptr %res, align 8
+ ret void
+}
+
+define void @memcpy_transfer(ptr dereferenceable(24) %res, float %a, float %b) {
+; CHECK-LABEL: define void @memcpy_transfer(
+; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8
+; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa.struct [[TBAA_STRUCT0:![0-9]+]]
+; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4
+; CHECK-NEXT: store float [[B]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = alloca { float, float }, align 4
+ store float %a, ptr %tmp, align 4
+ %__im_.i.i = getelementptr inbounds i8, ptr %tmp, i64 4
+ store float %b, ptr %__im_.i.i, align 4
+ %l.ptr = load ptr, ptr %res, align 8
+ call void @llvm.memcpy.p0.p0.i64(ptr %l.ptr, ptr %tmp, i64 8, i1 false), !tbaa.struct !0
+ ret void
+}
+
+define void @memcpy_transfer_tbaa_field_and_size_do_not_align(ptr dereferenceable(24) %res, float %a, float %b) {
+; CHECK-LABEL: define void @memcpy_transfer_tbaa_field_and_size_do_not_align(
+; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[RES]], align 8
+; CHECK-NEXT: store float [[A]], ptr [[L_PTR]], align 1, !tbaa.struct [[TBAA_STRUCT0]]
+; CHECK-NEXT: [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B]] to i32
+; CHECK-NEXT: [[TMP_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+; CHECK-NEXT: store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT5]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = alloca { float, float }, align 4
+ store float %a, ptr %tmp, align 4
+ %__im_.i.i = getelementptr inbounds i8, ptr %tmp, i64 4
+ store float %b, ptr %__im_.i.i, align 4
+ %l.ptr = load ptr, ptr %res, align 8
+ call void @llvm.memcpy.p0.p0.i64(ptr %l.ptr, ptr %tmp, i64 6, i1 false), !tbaa.struct !0
+ ret void
+}
+
+define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24) %res, i31 %a, i31 %b) {
+; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_i31(
+; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], i31 [[A:%.*]], i31 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP:%.*]] = alloca { i31, i31 }, align 4
+; CHECK-NEXT: store i31 [[A]], ptr [[TMP]], align 4
+; CHECK-NEXT: [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4
+; CHECK-NEXT: store i31 [[B]], ptr [[TMP_4_TMP_4_SROA_IDX]], align 4
+; CHECK-NEXT: [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT0]]
+; CHECK-NEXT: store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT0]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = alloca { i31 , i31 }, align 4
+ store i31 %a, ptr %tmp, align 4
+ %tmp.4 = getelementptr inbounds i8, ptr %tmp, i64 4
+ store i31 %b, ptr %tmp.4, align 4
+ %l1 = load i62, ptr %tmp, !tbaa.struct !0
+ store i62 %l1, ptr %res, !tbaa.struct !0
+ ret void
+}
+
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
+
+!0 = !{i64 0, i64 4, !1, i64 4, i64 4, !1}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
+;.
+; CHECK: [[TBAA_STRUCT0]] = !{i64 0, i64 4, [[META1:![0-9]+]], i64 4, i64 4, [[META1]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"float", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 4, [[META1]]}
+;.
diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
index e2aa1e2ee1c7..e48dd5bb3920 100644
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -2,6 +2,10 @@
; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG
+;; Ensure that these work with non-intrinsic variable locations.
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
+; RUN: opt < %s -passes='sroa<modify-cfg>' -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
+; RUN: opt < %s -passes=debugify,sroa -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=DEBUG
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
%S1 = type { i64, [42 x float] }
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected
index 2ab769f8d60e..cd135ce9e011 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.generated.expected
@@ -75,31 +75,31 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
; CHECK-NEXT: .cfi_offset r31, -4
; CHECK-NEXT: .cfi_offset r30, -8
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#20) = #0
+; CHECK-NEXT: memw(r29+#4) = #0
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#16) = #0
-; CHECK-NEXT: memw(r29+#16) = #1
+; CHECK-NEXT: memw(r29+#8) = #0
+; CHECK-NEXT: memw(r29+#8) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = memw(r29+#16)
+; CHECK-NEXT: r1 = memw(r29+#8)
; CHECK-NEXT: memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#8) = #3
-; CHECK-NEXT: memw(r29+#4) = #4
+; CHECK-NEXT: memw(r29+#16) = #3
+; CHECK-NEXT: memw(r29+#20) = #4
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p0 = cmp.eq(r1,#0)
-; CHECK-NEXT: if (p0.new) memw(r29+#8) = #3
+; CHECK-NEXT: if (p0.new) memw(r29+#16) = #3
; CHECK-NEXT: if (p0.new) memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) memw(r29+#4) = #4
-; CHECK-NEXT: if (p0) memw(r29+#16) = #1
+; CHECK-NEXT: if (p0) memw(r29+#20) = #4
+; CHECK-NEXT: if (p0) memw(r29+#8) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (!p0) memw(r29+#8) = #1
+; CHECK-NEXT: if (!p0) memw(r29+#16) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
@@ -116,27 +116,27 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
; CHECK-NEXT: .cfi_offset r31, -4
; CHECK-NEXT: .cfi_offset r30, -8
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#20) = #0
+; CHECK-NEXT: memw(r29+#4) = #0
; CHECK-NEXT: memw(r0+#0) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#16) = #1
+; CHECK-NEXT: memw(r29+#8) = #1
; CHECK-NEXT: memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#8) = #3
-; CHECK-NEXT: memw(r29+#4) = #4
+; CHECK-NEXT: memw(r29+#16) = #3
+; CHECK-NEXT: memw(r29+#20) = #4
; CHECK-NEXT: }
; CHECK-NEXT: //# InlineAsm Start
; CHECK-NEXT: //# InlineAsm End
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #0
-; CHECK-NEXT: memw(r29+#16) = #1
+; CHECK-NEXT: memw(r29+#8) = #1
; CHECK-NEXT: memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#8) = #3
-; CHECK-NEXT: memw(r29+#4) = #4
+; CHECK-NEXT: memw(r29+#16) = #3
+; CHECK-NEXT: memw(r29+#20) = #4
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected
index 52dd5f150560..833bf68fc03d 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/hexagon_generated_funcs.ll.nogenerated.expected
@@ -16,31 +16,31 @@ define dso_local i32 @check_boundaries() #0 {
; CHECK-NEXT: .cfi_offset r31, -4
; CHECK-NEXT: .cfi_offset r30, -8
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#20) = #0
+; CHECK-NEXT: memw(r29+#4) = #0
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#16) = #0
-; CHECK-NEXT: memw(r29+#16) = #1
+; CHECK-NEXT: memw(r29+#8) = #0
+; CHECK-NEXT: memw(r29+#8) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = memw(r29+#16)
+; CHECK-NEXT: r1 = memw(r29+#8)
; CHECK-NEXT: memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#8) = #3
-; CHECK-NEXT: memw(r29+#4) = #4
+; CHECK-NEXT: memw(r29+#16) = #3
+; CHECK-NEXT: memw(r29+#20) = #4
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p0 = cmp.eq(r1,#0)
-; CHECK-NEXT: if (p0.new) memw(r29+#8) = #3
+; CHECK-NEXT: if (p0.new) memw(r29+#16) = #3
; CHECK-NEXT: if (p0.new) memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) memw(r29+#4) = #4
-; CHECK-NEXT: if (p0) memw(r29+#16) = #1
+; CHECK-NEXT: if (p0) memw(r29+#20) = #4
+; CHECK-NEXT: if (p0) memw(r29+#8) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (!p0) memw(r29+#8) = #1
+; CHECK-NEXT: if (!p0) memw(r29+#16) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
@@ -93,27 +93,27 @@ define dso_local i32 @main() #0 {
; CHECK-NEXT: .cfi_offset r31, -4
; CHECK-NEXT: .cfi_offset r30, -8
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#20) = #0
+; CHECK-NEXT: memw(r29+#4) = #0
; CHECK-NEXT: memw(r0+#0) = #1
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#16) = #1
+; CHECK-NEXT: memw(r29+#8) = #1
; CHECK-NEXT: memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#8) = #3
-; CHECK-NEXT: memw(r29+#4) = #4
+; CHECK-NEXT: memw(r29+#16) = #3
+; CHECK-NEXT: memw(r29+#20) = #4
; CHECK-NEXT: }
; CHECK-NEXT: //# InlineAsm Start
; CHECK-NEXT: //# InlineAsm End
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #0
-; CHECK-NEXT: memw(r29+#16) = #1
+; CHECK-NEXT: memw(r29+#8) = #1
; CHECK-NEXT: memw(r29+#12) = #2
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw(r29+#8) = #3
-; CHECK-NEXT: memw(r29+#4) = #4
+; CHECK-NEXT: memw(r29+#16) = #3
+; CHECK-NEXT: memw(r29+#20) = #4
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
diff --git a/llvm/test/tools/llvm-cov/mcdc-const.test b/llvm/test/tools/llvm-cov/mcdc-const.test
index 0b2c9c98d535..5424625cf6a6 100644
--- a/llvm/test/tools/llvm-cov/mcdc-const.test
+++ b/llvm/test/tools/llvm-cov/mcdc-const.test
@@ -61,8 +61,8 @@
// CHECKFULLCASE: | C1-Pair: constant folded
// CHECKFULLCASE-NEXT: | C2-Pair: not covered
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
-// CHECKFULLCASE: | 1 { T, C = T }
-// CHECKFULLCASE-NEXT: | 2 { F, C = T }
+// CHECKFULLCASE: | 1 { F, C = T }
+// CHECKFULLCASE-NEXT: | 2 { T, C = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: constant folded
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
@@ -106,8 +106,8 @@
// CHECKFULLCASE-NEXT: | C2-Pair: not covered
// CHECKFULLCASE-NEXT: | C3-Pair: not covered
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
-// CHECKFULLCASE: | 1 { T, C, - = T }
-// CHECKFULLCASE-NEXT: | 2 { F, C, - = T }
+// CHECKFULLCASE: | 1 { F, C, - = T }
+// CHECKFULLCASE-NEXT: | 2 { T, C, - = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: constant folded
// CHECKFULLCASE-NEXT: | C3-Pair: not covered
@@ -118,8 +118,8 @@
// CHECKFULLCASE-NEXT: | C2-Pair: not covered
// CHECKFULLCASE-NEXT: | C3-Pair: not covered
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
-// CHECKFULLCASE: | 1 { T, C, - = T }
-// CHECKFULLCASE-NEXT: | 2 { F, C, T = T }
+// CHECKFULLCASE: | 1 { F, C, T = T }
+// CHECKFULLCASE-NEXT: | 2 { T, C, - = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: constant folded
// CHECKFULLCASE-NEXT: | C3-Pair: not covered
@@ -151,26 +151,26 @@
// CHECKFULLCASE-NEXT: | C2-Pair: constant folded
// CHECKFULLCASE-NEXT: | C3-Pair: covered: (2,3)
// CHECKFULLCASE: | MC/DC Coverage for Decision: 100.00%
-// CHECKFULLCASE: | 1 { T, -, C = T }
-// CHECKFULLCASE-NEXT: | 2 { F, T, C = T }
+// CHECKFULLCASE: | 1 { F, T, C = T }
+// CHECKFULLCASE-NEXT: | 2 { T, -, C = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: not covered
// CHECKFULLCASE-NEXT: | C3-Pair: constant folded
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
-// CHECKFULLCASE: | 1 { T, C, - = T }
-// CHECKFULLCASE-NEXT: | 2 { F, C, - = T }
+// CHECKFULLCASE: | 1 { F, C, - = T }
+// CHECKFULLCASE-NEXT: | 2 { T, C, - = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: constant folded
// CHECKFULLCASE-NEXT: | C3-Pair: not covered
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
-// CHECKFULLCASE: | 1 { T, -, C = T }
-// CHECKFULLCASE-NEXT: | 2 { F, T, C = T }
+// CHECKFULLCASE: | 1 { F, T, C = T }
+// CHECKFULLCASE-NEXT: | 2 { T, -, C = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: not covered
// CHECKFULLCASE-NEXT: | C3-Pair: constant folded
// CHECKFULLCASE: | MC/DC Coverage for Decision: 0.00%
-// CHECKFULLCASE: | 1 { T, C, - = T }
-// CHECKFULLCASE-NEXT: | 2 { F, C, T = T }
+// CHECKFULLCASE: | 1 { F, C, T = T }
+// CHECKFULLCASE-NEXT: | 2 { T, C, - = T }
// CHECKFULLCASE: | C1-Pair: not covered
// CHECKFULLCASE-NEXT: | C2-Pair: constant folded
// CHECKFULLCASE-NEXT: | C3-Pair: not covered
diff --git a/llvm/test/tools/llvm-cov/mcdc-general.test b/llvm/test/tools/llvm-cov/mcdc-general.test
index 753036bedaf1..4b59ce59d638 100644
--- a/llvm/test/tools/llvm-cov/mcdc-general.test
+++ b/llvm/test/tools/llvm-cov/mcdc-general.test
@@ -19,16 +19,16 @@
// CHECK-NEXT: |
// CHECK-NEXT: | C1, C2, C3, C4 Result
// CHECK-NEXT: | 1 { F, -, F, - = F }
-// CHECK-NEXT: | 2 { T, F, F, - = F }
-// CHECK-NEXT: | 3 { F, -, T, F = F }
+// CHECK-NEXT: | 2 { F, -, T, F = F }
+// CHECK-NEXT: | 3 { T, F, F, - = F }
// CHECK-NEXT: | 4 { T, F, T, F = F }
-// CHECK-NEXT: | 5 { T, T, -, - = T }
-// CHECK-NEXT: | 6 { T, F, T, T = T }
+// CHECK-NEXT: | 5 { T, F, T, T = T }
+// CHECK-NEXT: | 6 { T, T, -, - = T }
// CHECK-NEXT: |
-// CHECK-NEXT: | C1-Pair: covered: (1,5)
-// CHECK-NEXT: | C2-Pair: covered: (2,5)
-// CHECK-NEXT: | C3-Pair: covered: (2,6)
-// CHECK-NEXT: | C4-Pair: covered: (4,6)
+// CHECK-NEXT: | C1-Pair: covered: (1,6)
+// CHECK-NEXT: | C2-Pair: covered: (3,6)
+// CHECK-NEXT: | C3-Pair: covered: (3,5)
+// CHECK-NEXT: | C4-Pair: covered: (4,5)
// CHECK-NEXT: | MC/DC Coverage for Decision: 100.00%
// CHECK-NEXT: |
// CHECK-NEXT: ------------------
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug-str-offsets-mixed-dwarf-4-5.yaml b/llvm/test/tools/llvm-dwarfdump/X86/debug-str-offsets-mixed-dwarf-4-5.yaml
new file mode 100644
index 000000000000..d10460896171
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/X86/debug-str-offsets-mixed-dwarf-4-5.yaml
@@ -0,0 +1,57 @@
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-dwarfdump -debug-str-offsets -verify %t.o | FileCheck %s
+
+# CHECK: Verifying .debug_str_offsets...
+# CHECK: No errors
+
+# Check that when mixing standard DWARF 4 debug information with standard DWARF
+# 5 debug information, the verifier correctly interprets the debug_str_offsets
+# section as a standards-conforming DWARF 5 section.
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+DWARF:
+ debug_str:
+ - 'cu1'
+ - 'cu2'
+ debug_str_offsets:
+ - Offsets:
+ - 0x0
+ debug_abbrev:
+ - Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Code: 0x2
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strx1
+ - Attribute: DW_AT_str_offsets_base
+ Form: DW_FORM_sec_offset
+ debug_info:
+ - Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x4
+ - Version: 5
+ UnitType: DW_UT_compile
+ AbbrOffset: 0x0
+ AddrSize: 8
+ AbbrevTableID: 0
+ Entries:
+ - AbbrCode: 0x2
+ Values:
+ - Value: 0x0
+ - Value: 0x8 # str offsets base
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml
new file mode 100644
index 000000000000..1bab2c205cd5
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_empty_debug_line_sequence.yaml
@@ -0,0 +1,55 @@
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s
+
+# CHECK: Verifying .debug_line...
+# CHECK: No errors
+
+# In a line table like the one below, with no rows (other than the
+# end_sequence), we should never verify the file index because the state
+# machine initializes the file index to 1, which is invalid in DWARF 5 due to
+# its 0-based indexing.
+
+# file_names[ 0]:
+# name: "/home/umb/tests_2018/106_rnglists2"
+# dir_index: 0
+# Address Line Column File ISA Discriminator OpIndex Flags
+# ------------------ ------ ------ ------ --- ------------- ------- -------------
+# 0x0000000000000000 1 0 1 0 0 0 is_stmt end_sequence
+
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+DWARF:
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_stmt_list
+ Form: DW_FORM_sec_offset
+ debug_info:
+ - Length: 0xd
+ Version: 5
+ UnitType: DW_UT_compile
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x0
+Sections:
+ - Name: .debug_line
+ Type: SHT_PROGBITS
+ AddressAlign: 0x1
+ Content: 300000000500080025000000010101fb0e0d00010101010000000100000101011f010000000002011f020b010000000000000101
+ - Name: .debug_line_str
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ Content: 2F686F6D652F756D622F74657374735F323031382F3130365F726E676C697374733200746573742E63707000
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_str_offsets.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_str_offsets.yaml
index 37f376352a19..1bdc640acae6 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_str_offsets.yaml
+++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_str_offsets.yaml
@@ -13,7 +13,7 @@
# CHECK-NEXT: error: .debug_str_offsets: contribution 0x29: length exceeds available space (contribution offset (0x29) + length field space (0x4) + length (0x5000000) == 0x500002D > section size 0x30)
# Errors detected.
-# V4: error: .debug_str_offsets: contribution 0x0: index 0x2: invalid string offset *0x8 == 0x2, is neither zero nor immediately following a null character
+# V4: error: .debug_str_offsets.dwo: contribution 0x0: index 0x2: invalid string offset *0x8 == 0x2, is neither zero nor immediately following a null character
#--- v4.yaml
@@ -23,16 +23,17 @@ FileHeader:
Data: ELFDATA2LSB
Type: ET_EXEC
DWARF:
- debug_str:
- - 'foo'
- - 'bar'
- debug_info:
- - Version: 4
- AddrSize: 4
Sections:
- - Name: '.debug_str_offsets'
+ - Name: '.debug_info.dwo'
+ Type: SHT_PROGBITS
+ Content: "0700000004000000000004"
+ - Name: '.debug_str_offsets.dwo'
Type: SHT_PROGBITS
Content: "000000000400000002000000"
+ - Name: 'debug_str.dwo'
+ Type: SHT_PROGBITS
+ Content: "666F6F0062617200"
+
#--- v5.yaml
--- !ELF
diff --git a/llvm/test/tools/llvm-gsymutil/X86/elf-dwo.yaml b/llvm/test/tools/llvm-gsymutil/X86/elf-dwo.yaml
index 109f2fe73e69..b430e8573b31 100644
--- a/llvm/test/tools/llvm-gsymutil/X86/elf-dwo.yaml
+++ b/llvm/test/tools/llvm-gsymutil/X86/elf-dwo.yaml
@@ -15,8 +15,8 @@
## WARNING-QUIET: Input file: {{.*\.yaml\.tmp}}
## WARNING-QUIET: Output file (x86_64): {{.*\.yaml\.tmp\.gsym}}
-## WARNING-QUIET: warning: Unable to retrieve DWO .debug_info section for some object files. (Remove the --quiet flag for full output)
## WARNING-QUIET: Pruned 0 functions, ended with 10 total
+## WARNING-QUIET: warning: Unable to retrieve DWO .debug_info section for some object files. (Remove the --quiet flag for full output)
diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test
index 4250c775daa6..77bdc23589fd 100644
--- a/llvm/test/tools/llvm-lib/arm64ec-implib.test
+++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test
@@ -11,9 +11,23 @@ ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll
ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll
ARMAP-EMPTY:
ARMAP-NEXT: Archive EC map
+ARMAP-NEXT: #expname in test.dll
+ARMAP-NEXT: #funcexp in test.dll
+ARMAP-NEXT: #mangledfunc in test.dll
+ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll
+ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAP-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAP-NEXT: __imp_aux_expname in test.dll
+ARMAP-NEXT: __imp_aux_funcexp in test.dll
+ARMAP-NEXT: __imp_aux_mangledfunc in test.dll
ARMAP-NEXT: __imp_dataexp in test.dll
+ARMAP-NEXT: __imp_expname in test.dll
ARMAP-NEXT: __imp_funcexp in test.dll
+ARMAP-NEXT: __imp_mangledfunc in test.dll
+ARMAP-NEXT: expname in test.dll
ARMAP-NEXT: funcexp in test.dll
+ARMAP-NEXT: mangledfunc in test.dll
RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s
@@ -35,10 +49,42 @@ READOBJ-EMPTY:
READOBJ-NEXT: File: test.dll
READOBJ-NEXT: Format: COFF-import-file-ARM64EC
READOBJ-NEXT: Type: code
-READOBJ-NEXT: Name type: name
+READOBJ-NEXT: Name type: export as
READOBJ-NEXT: Export name: funcexp
READOBJ-NEXT: Symbol: __imp_funcexp
READOBJ-NEXT: Symbol: funcexp
+READOBJ-NEXT: Symbol: __imp_aux_funcexp
+READOBJ-NEXT: Symbol: #funcexp
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: mangledfunc
+READOBJ-NEXT: Symbol: __imp_mangledfunc
+READOBJ-NEXT: Symbol: mangledfunc
+READOBJ-NEXT: Symbol: __imp_aux_mangledfunc
+READOBJ-NEXT: Symbol: #mangledfunc
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z
+READOBJ-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z
+READOBJ-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z
+READOBJ-NEXT: Symbol: __imp_aux_?test_cpp_func@@YAHPEAX@Z
+READOBJ-NEXT: Symbol: ?test_cpp_func@@$$hYAHPEAX@Z
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: expname
+READOBJ-NEXT: Symbol: __imp_expname
+READOBJ-NEXT: Symbol: expname
+READOBJ-NEXT: Symbol: __imp_aux_expname
+READOBJ-NEXT: Symbol: #expname
READOBJ-EMPTY:
READOBJ-NEXT: File: test.dll
READOBJ-NEXT: Format: COFF-import-file-ARM64EC
@@ -51,8 +97,308 @@ Creating a new lib containing the existing lib:
RUN: llvm-lib -machine:arm64ec test.lib -out:test2.lib
RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s
+
+RUN: llvm-lib -machine:arm64ec -def:test.def -defArm64Native:test.def -out:testx.lib
+
+RUN: llvm-nm --print-armap testx.lib | FileCheck -check-prefix=ARMAPX %s
+
+ARMAPX: Archive map
+ARMAPX-NEXT: #mangledfunc in test.dll
+ARMAPX-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAPX-NEXT: __IMPORT_DESCRIPTOR_test in test.dll
+ARMAPX-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll
+ARMAPX-NEXT: __imp_#mangledfunc in test.dll
+ARMAPX-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAPX-NEXT: __imp_dataexp in test.dll
+ARMAPX-NEXT: __imp_expname in test.dll
+ARMAPX-NEXT: __imp_funcexp in test.dll
+ARMAPX-NEXT: expname in test.dll
+ARMAPX-NEXT: funcexp in test.dll
+ARMAPX-NEXT: test_NULL_THUNK_DATA in test.dll
+ARMAPX-EMPTY:
+ARMAPX-NEXT: Archive EC map
+ARMAPX-NEXT: #expname in test.dll
+ARMAPX-NEXT: #funcexp in test.dll
+ARMAPX-NEXT: #mangledfunc in test.dll
+ARMAPX-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll
+ARMAPX-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAPX-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAPX-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll
+ARMAPX-NEXT: __imp_aux_expname in test.dll
+ARMAPX-NEXT: __imp_aux_funcexp in test.dll
+ARMAPX-NEXT: __imp_aux_mangledfunc in test.dll
+ARMAPX-NEXT: __imp_dataexp in test.dll
+ARMAPX-NEXT: __imp_expname in test.dll
+ARMAPX-NEXT: __imp_funcexp in test.dll
+ARMAPX-NEXT: __imp_mangledfunc in test.dll
+ARMAPX-NEXT: expname in test.dll
+ARMAPX-NEXT: funcexp in test.dll
+ARMAPX-NEXT: mangledfunc in test.dll
+
+RUN: llvm-readobj testx.lib | FileCheck -check-prefix=READOBJX %s
+
+READOBJX: File: testx.lib(test.dll)
+READOBJX-NEXT: Format: COFF-ARM64
+READOBJX-NEXT: Arch: aarch64
+READOBJX-NEXT: AddressSize: 64bit
+READOBJX-EMPTY:
+READOBJX-NEXT: File: testx.lib(test.dll)
+READOBJX-NEXT: Format: COFF-ARM64
+READOBJX-NEXT: Arch: aarch64
+READOBJX-NEXT: AddressSize: 64bit
+READOBJX-EMPTY:
+READOBJX-NEXT: File: testx.lib(test.dll)
+READOBJX-NEXT: Format: COFF-ARM64
+READOBJX-NEXT: Arch: aarch64
+READOBJX-NEXT: AddressSize: 64bit
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: export as
+READOBJX-NEXT: Export name: funcexp
+READOBJX-NEXT: Symbol: __imp_funcexp
+READOBJX-NEXT: Symbol: funcexp
+READOBJX-NEXT: Symbol: __imp_aux_funcexp
+READOBJX-NEXT: Symbol: #funcexp
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: export as
+READOBJX-NEXT: Export name: mangledfunc
+READOBJX-NEXT: Symbol: __imp_mangledfunc
+READOBJX-NEXT: Symbol: mangledfunc
+READOBJX-NEXT: Symbol: __imp_aux_mangledfunc
+READOBJX-NEXT: Symbol: #mangledfunc
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: export as
+READOBJX-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z
+READOBJX-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z
+READOBJX-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z
+READOBJX-NEXT: Symbol: __imp_aux_?test_cpp_func@@YAHPEAX@Z
+READOBJX-NEXT: Symbol: ?test_cpp_func@@$$hYAHPEAX@Z
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: export as
+READOBJX-NEXT: Export name: expname
+READOBJX-NEXT: Symbol: __imp_expname
+READOBJX-NEXT: Symbol: expname
+READOBJX-NEXT: Symbol: __imp_aux_expname
+READOBJX-NEXT: Symbol: #expname
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: data
+READOBJX-NEXT: Name type: name
+READOBJX-NEXT: Export name: dataexp
+READOBJX-NEXT: Symbol: __imp_dataexp
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: name
+READOBJX-NEXT: Export name: funcexp
+READOBJX-NEXT: Symbol: __imp_funcexp
+READOBJX-NEXT: Symbol: funcexp
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: name
+READOBJX-NEXT: Export name: #mangledfunc
+READOBJX-NEXT: Symbol: __imp_#mangledfunc
+READOBJX-NEXT: Symbol: #mangledfunc
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: name
+READOBJX-NEXT: Export name: ?test_cpp_func@@YAHPEAX@Z
+READOBJX-NEXT: Symbol: __imp_?test_cpp_func@@YAHPEAX@Z
+READOBJX-NEXT: Symbol: ?test_cpp_func@@YAHPEAX@Z
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: name
+READOBJX-NEXT: Export name: expname
+READOBJX-NEXT: Symbol: __imp_expname
+READOBJX-NEXT: Symbol: expname
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: data
+READOBJX-NEXT: Name type: name
+READOBJX-NEXT: Export name: dataexp
+READOBJX-NEXT: Symbol: __imp_dataexp
+
+
+RUN: llvm-lib -machine:arm64ec -def:test.def -defArm64Native:test2.def -out:test2.lib
+RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAPX2 %s
+
+ARMAPX2: Archive map
+ARMAPX2-NEXT: __IMPORT_DESCRIPTOR_test2 in test2.dll
+ARMAPX2-NEXT: __NULL_IMPORT_DESCRIPTOR in test2.dll
+ARMAPX2-NEXT: __imp_otherfunc in test2.dll
+ARMAPX2-NEXT: otherfunc in test2.dll
+ARMAPX2-NEXT: test2_NULL_THUNK_DATA in test2.dll
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: Archive EC map
+ARMAPX2-NEXT: #expname in test2.dll
+ARMAPX2-NEXT: #funcexp in test2.dll
+ARMAPX2-NEXT: #mangledfunc in test2.dll
+ARMAPX2-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test2.dll
+ARMAPX2-NEXT: ?test_cpp_func@@YAHPEAX@Z in test2.dll
+ARMAPX2-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test2.dll
+ARMAPX2-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test2.dll
+ARMAPX2-NEXT: __imp_aux_expname in test2.dll
+ARMAPX2-NEXT: __imp_aux_funcexp in test2.dll
+ARMAPX2-NEXT: __imp_aux_mangledfunc in test2.dll
+ARMAPX2-NEXT: __imp_dataexp in test2.dll
+ARMAPX2-NEXT: __imp_expname in test2.dll
+ARMAPX2-NEXT: __imp_funcexp in test2.dll
+ARMAPX2-NEXT: __imp_mangledfunc in test2.dll
+ARMAPX2-NEXT: expname in test2.dll
+ARMAPX2-NEXT: funcexp in test2.dll
+ARMAPX2-NEXT: mangledfunc in test2.dll
+
+ARMAPX2: test2.dll:
+ARMAPX2: 00000000 T #funcexp
+ARMAPX2-NEXT: 00000000 T __imp_aux_funcexp
+ARMAPX2-NEXT: 00000000 T __imp_funcexp
+ARMAPX2-NEXT: 00000000 T funcexp
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 T #mangledfunc
+ARMAPX2-NEXT: 00000000 T __imp_aux_mangledfunc
+ARMAPX2-NEXT: 00000000 T __imp_mangledfunc
+ARMAPX2-NEXT: 00000000 T mangledfunc
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 T ?test_cpp_func@@$$hYAHPEAX@Z
+ARMAPX2-NEXT: 00000000 T ?test_cpp_func@@YAHPEAX@Z
+ARMAPX2-NEXT: 00000000 T __imp_?test_cpp_func@@YAHPEAX@Z
+ARMAPX2-NEXT: 00000000 T __imp_aux_?test_cpp_func@@YAHPEAX@Z
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 T #expname
+ARMAPX2-NEXT: 00000000 T __imp_aux_expname
+ARMAPX2-NEXT: 00000000 T __imp_expname
+ARMAPX2-NEXT: 00000000 T expname
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 D __imp_dataexp
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 T __imp_otherfunc
+ARMAPX2-NEXT: 00000000 T otherfunc
+
+
+RUN: llvm-lib -machine:arm64ec -def:exportas.def -out:exportas.lib
+RUN: llvm-nm --print-armap exportas.lib | FileCheck -check-prefix=EXPAS-ARMAP %s
+RUN: llvm-readobj exportas.lib | FileCheck -check-prefix=EXPAS-READOBJ %s
+
+EXPAS-ARMAP: Archive EC map
+EXPAS-ARMAP-NEXT: #func1 in test.dll
+EXPAS-ARMAP-NEXT: #func2 in test.dll
+EXPAS-ARMAP-NEXT: #func3 in test.dll
+EXPAS-ARMAP-NEXT: #func4 in test.dll
+EXPAS-ARMAP-NEXT: __imp_aux_func1 in test.dll
+EXPAS-ARMAP-NEXT: __imp_aux_func2 in test.dll
+EXPAS-ARMAP-NEXT: __imp_aux_func3 in test.dll
+EXPAS-ARMAP-NEXT: __imp_aux_func4 in test.dll
+EXPAS-ARMAP-NEXT: __imp_data1 in test.dll
+EXPAS-ARMAP-NEXT: __imp_data2 in test.dll
+EXPAS-ARMAP-NEXT: __imp_func1 in test.dll
+EXPAS-ARMAP-NEXT: __imp_func2 in test.dll
+EXPAS-ARMAP-NEXT: __imp_func3 in test.dll
+EXPAS-ARMAP-NEXT: __imp_func4 in test.dll
+EXPAS-ARMAP-NEXT: func1 in test.dll
+EXPAS-ARMAP-NEXT: func2 in test.dll
+EXPAS-ARMAP-NEXT: func3 in test.dll
+EXPAS-ARMAP-NEXT: func4 in test.dll
+
+EXPAS-READOBJ: File: test.dll
+EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+EXPAS-READOBJ-NEXT: Type: code
+EXPAS-READOBJ-NEXT: Name type: export as
+EXPAS-READOBJ-NEXT: Export name: func1
+EXPAS-READOBJ-NEXT: Symbol: __imp_func1
+EXPAS-READOBJ-NEXT: Symbol: func1
+EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func1
+EXPAS-READOBJ-NEXT: Symbol: #func1
+EXPAS-READOBJ-EMPTY:
+EXPAS-READOBJ-NEXT: File: test.dll
+EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+EXPAS-READOBJ-NEXT: Type: code
+EXPAS-READOBJ-NEXT: Name type: export as
+EXPAS-READOBJ-NEXT: Export name: func2
+EXPAS-READOBJ-NEXT: Symbol: __imp_func2
+EXPAS-READOBJ-NEXT: Symbol: func2
+EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func2
+EXPAS-READOBJ-NEXT: Symbol: #func2
+EXPAS-READOBJ-EMPTY:
+EXPAS-READOBJ-NEXT: File: test.dll
+EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+EXPAS-READOBJ-NEXT: Type: code
+EXPAS-READOBJ-NEXT: Name type: export as
+EXPAS-READOBJ-NEXT: Export name: #func3
+EXPAS-READOBJ-NEXT: Symbol: __imp_func3
+EXPAS-READOBJ-NEXT: Symbol: func3
+EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func3
+EXPAS-READOBJ-NEXT: Symbol: #func3
+EXPAS-READOBJ-EMPTY:
+EXPAS-READOBJ-NEXT: File: test.dll
+EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+EXPAS-READOBJ-NEXT: Type: code
+EXPAS-READOBJ-NEXT: Name type: export as
+EXPAS-READOBJ-NEXT: Export name: #func4
+EXPAS-READOBJ-NEXT: Symbol: __imp_func4
+EXPAS-READOBJ-NEXT: Symbol: func4
+EXPAS-READOBJ-NEXT: Symbol: __imp_aux_func4
+EXPAS-READOBJ-NEXT: Symbol: #func4
+EXPAS-READOBJ-EMPTY:
+EXPAS-READOBJ-NEXT: File: test.dll
+EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+EXPAS-READOBJ-NEXT: Type: data
+EXPAS-READOBJ-NEXT: Name type: export as
+EXPAS-READOBJ-NEXT: Export name: #data1
+EXPAS-READOBJ-NEXT: Symbol: __imp_data1
+EXPAS-READOBJ-EMPTY:
+EXPAS-READOBJ-NEXT: File: test.dll
+EXPAS-READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+EXPAS-READOBJ-NEXT: Type: data
+EXPAS-READOBJ-NEXT: Name type: export as
+EXPAS-READOBJ-NEXT: Export name: data2
+EXPAS-READOBJ-NEXT: Symbol: __imp_data2
+
+
#--- test.def
LIBRARY test.dll
EXPORTS
funcexp
+ #mangledfunc
+ ?test_cpp_func@@YAHPEAX@Z
+ expname=impname
dataexp DATA
+
+#--- test2.def
+LIBRARY test2.dll
+EXPORTS
+ otherfunc
+
+#--- exportas.def
+LIBRARY test.dll
+EXPORTS
+ #func1 EXPORTAS func1
+ func2 EXPORTAS func2
+ func3 EXPORTAS #func3
+ #func4 EXPORTAS #func4
+ data1 DATA EXPORTAS #data1
+ #data2 DATA EXPORTAS data2
diff --git a/llvm/test/tools/llvm-lib/exportas.test b/llvm/test/tools/llvm-lib/exportas.test
new file mode 100644
index 000000000000..f6e845ca1746
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/exportas.test
@@ -0,0 +1,94 @@
+Test EXPORTAS in importlibs.
+
+RUN: split-file %s %t.dir && cd %t.dir
+RUN: llvm-lib -machine:amd64 -def:test.def -out:test.lib
+
+RUN: llvm-nm --print-armap test.lib | FileCheck --check-prefix=ARMAP %s
+
+ARMAP: Archive map
+ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll
+ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll
+ARMAP-NEXT: __imp_func in test.dll
+ARMAP-NEXT: __imp_func2 in test.dll
+ARMAP-NEXT: __imp_func3 in test.dll
+ARMAP-NEXT: __imp_mydata in test.dll
+ARMAP-NEXT: func in test.dll
+ARMAP-NEXT: func2 in test.dll
+ARMAP-NEXT: func3 in test.dll
+ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll
+
+RUN: llvm-readobj test.lib | FileCheck --check-prefix=READOBJ %s
+
+READOBJ: File: test.lib(test.dll)
+READOBJ-NEXT: Format: COFF-x86-64
+READOBJ-NEXT: Arch: x86_64
+READOBJ-NEXT: AddressSize: 64bit
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.lib(test.dll)
+READOBJ-NEXT: Format: COFF-x86-64
+READOBJ-NEXT: Arch: x86_64
+READOBJ-NEXT: AddressSize: 64bit
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.lib(test.dll)
+READOBJ-NEXT: Format: COFF-x86-64
+READOBJ-NEXT: Arch: x86_64
+READOBJ-NEXT: AddressSize: 64bit
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-x86-64
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: expfunc
+READOBJ-NEXT: Symbol: __imp_func
+READOBJ-NEXT: Symbol: func
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-x86-64
+READOBJ-NEXT: Type: data
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: expdata
+READOBJ-NEXT: Symbol: __imp_mydata
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-x86-64
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: expfunc2
+READOBJ-NEXT: Symbol: __imp_func2
+READOBJ-NEXT: Symbol: func2
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-x86-64
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: export as
+READOBJ-NEXT: Export name: expfunc3
+READOBJ-NEXT: Symbol: __imp_func3
+READOBJ-NEXT: Symbol: func3
+
+
+EXPORTAS must be at the end of entry declaration.
+RUN: not llvm-lib -machine:amd64 -def:test2.def -out:test2.lib 2>&1 \
+RUN: | FileCheck --check-prefix=ERROR %s
+RUN: not llvm-lib -machine:amd64 -def:test3.def -out:test3.lib 2>&1 \
+RUN: | FileCheck --check-prefix=ERROR %s
+ERROR: Invalid data was encountered while parsing the file
+
+
+#--- test.def
+LIBRARY test.dll
+EXPORTS
+ func EXPORTAS expfunc
+ mydata DATA EXPORTAS expdata
+ func2 = myfunc2 EXPORTAS expfunc2
+ func3 = otherdll.otherfunc3 EXPORTAS expfunc3
+
+#--- test2.def
+LIBRARY test.dll
+EXPORTS
+ func EXPORTAS expfunc
+ mydata EXPORTAS expdata DATA
+
+#--- test3.def
+LIBRARY test.dll
+EXPORTS
+ mydata EXPORTAS
diff --git a/llvm/test/tools/llvm-mca/RISCV/XiangShan/cascade-fma.s b/llvm/test/tools/llvm-mca/RISCV/XiangShan/cascade-fma.s
new file mode 100644
index 000000000000..d44eb55ebf75
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/XiangShan/cascade-fma.s
@@ -0,0 +1,53 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=xiangshan-nanhu < %s | FileCheck %s
+
+# Test XiangShan FuDian's cascade FMA, CPI = 3
+fmadd.s fa0, fa1, fa2, fa0
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 100
+# CHECK-NEXT: Total Cycles: 305
+# CHECK-NEXT: Total uOps: 100
+
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 0.25 fmadd.s fa0, fa1, fa2, fa0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - XS2ALU
+# CHECK-NEXT: [0.1] - XS2ALU
+# CHECK-NEXT: [0.2] - XS2ALU
+# CHECK-NEXT: [0.3] - XS2ALU
+# CHECK-NEXT: [1.0] - XS2FMAC
+# CHECK-NEXT: [1.1] - XS2FMAC
+# CHECK-NEXT: [1.2] - XS2FMAC
+# CHECK-NEXT: [1.3] - XS2FMAC
+# CHECK-NEXT: [2.0] - XS2FMISC
+# CHECK-NEXT: [2.1] - XS2FMISC
+# CHECK-NEXT: [3.0] - XS2LD
+# CHECK-NEXT: [3.1] - XS2LD
+# CHECK-NEXT: [4.0] - XS2MDU
+# CHECK-NEXT: [4.1] - XS2MDU
+# CHECK-NEXT: [5] - XS2MISC
+# CHECK-NEXT: [6.0] - XS2ST
+# CHECK-NEXT: [6.1] - XS2ST
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1]
+# CHECK-NEXT: - - - - 0.25 0.25 0.25 0.25 - - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1] Instructions:
+# CHECK-NEXT: - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - fmadd.s fa0, fa1, fa2, fa0
diff --git a/llvm/test/tools/llvm-mca/RISCV/XiangShan/gpr-bypass.s b/llvm/test/tools/llvm-mca/RISCV/XiangShan/gpr-bypass.s
new file mode 100644
index 000000000000..677fece1535a
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/XiangShan/gpr-bypass.s
@@ -0,0 +1,527 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=xiangshan-nanhu -timeline \
+# RUN: -timeline-max-cycles=1000 -iterations=1 < %s | FileCheck %s
+
+lui a0, 1
+auipc a1, 1
+add a0, a0, a1
+addi a0, a0, 1
+addw a0, a0, a0
+addiw a0, a0, 1
+sub a0, a0, a0
+subw a0, a0, a0
+and a0, a0, a0
+andi a0, a0, 1
+or a0, a0, a0
+ori a0, a0, 1
+xor a0, a0, a0
+xori a0, a0, 1
+sll a0, a0, a0
+slli a0, a0, 1
+sllw a0, a0, a0
+slliw a0, a0, 1
+srl a0, a0, a0
+srli a0, a0, 1
+srlw a0, a0, a0
+srliw a0, a0, 1
+sra a0, a0, a0
+srai a0, a0, 1
+sraw a0, a0, a0
+sraiw a0, a0, 1
+slt a0, a0, a0
+slti a0, a0, 1
+sltu a0, a0, a0
+sltiu a0, a0, 1
+mul a0, a0, a0
+add a0, a0, a0
+mulw a0, a0, a0
+add a0, a0, a0
+beq a0, a0, 1f
+1:
+add a0, a0, a0
+bne a0, a0, 1f
+1:
+add a0, a0, a0
+blt a0, a0, 1f
+1:
+add a0, a0, a0
+bltu a0, a0, 1f
+1:
+add a0, a0, a0
+bge a0, a0, 1f
+1:
+add a0, a0, a0
+bgeu a0, a0, 1f
+1:
+# zba
+add.uw a0, a0, a0
+slli.uw a0, a0, 1
+sh1add.uw a0, a0, a0
+sh2add.uw a0, a0, a0
+sh3add.uw a0, a0, a0
+sh1add a0, a0, a0
+sh2add a0, a0, a0
+sh3add a0, a0, a0
+# zbb
+andn a0, a0, a0
+orn a0, a0, a0
+xnor a0, a0, a0
+sext.b a0, a0
+sext.h a0, a0
+zext.h a0, a0
+min a0, a0, a0
+minu a0, a0, a0
+max a0, a0, a0
+maxu a0, a0, a0
+rol a0, a0, a0
+ror a0, a0, a0
+rori a0, a0, 1
+clz a0, a0
+clzw a0, a0
+ctz a0, a0
+ctzw a0, a0
+cpop a0, a0
+add a0, a0, a0
+cpopw a0, a0
+add a0, a0, a0
+rev8 a0, a0
+orc.b a0, a0
+lb a0, 0(a0)
+add a0, a0, a0
+lh a0, 0(a0)
+and a0, a0, a0
+lw a0, 0(a0)
+or a0, a0, a0
+ld a0, 0(a0)
+xor a0, a0, a0
+lbu a0, 0(a0)
+addi a0, a0, 1
+lhu a0, 0(a0)
+sub a0, a0, a0
+lwu a0, 0(a0)
+addw a0, a0, a0
+jr a0
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 91
+# CHECK-NEXT: Total Cycles: 124
+# CHECK-NEXT: Total uOps: 91
+
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 0.73
+# CHECK-NEXT: IPC: 0.73
+# CHECK-NEXT: Block RThroughput: 17.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 1 0.25 lui a0, 1
+# CHECK-NEXT: 1 1 0.25 auipc a1, 1
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a1
+# CHECK-NEXT: 1 1 0.25 addi a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 addw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 addiw a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sub a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 subw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 and a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 andi a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 or a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 ori a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 xor a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 xori a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sll a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 slli a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sllw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 slliw a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 srl a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 srli a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 srlw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 srliw a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sra a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 srai a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sraw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sraiw a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 slt a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 slti a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sltu a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 seqz a0, a0
+# CHECK-NEXT: 1 3 0.50 mul a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 3 0.50 mulw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 beq a0, a0, .Ltmp0
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 bne a0, a0, .Ltmp1
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 blt a0, a0, .Ltmp2
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 bltu a0, a0, .Ltmp3
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 bge a0, a0, .Ltmp4
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 bgeu a0, a0, .Ltmp5
+# CHECK-NEXT: 1 1 0.25 add.uw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 slli.uw a0, a0, 1
+# CHECK-NEXT: 1 1 0.25 sh1add.uw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sh2add.uw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sh3add.uw a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sh1add a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sh2add a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sh3add a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 andn a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 orn a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 xnor a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 sext.b a0, a0
+# CHECK-NEXT: 1 1 0.25 sext.h a0, a0
+# CHECK-NEXT: 1 1 0.25 zext.h a0, a0
+# CHECK-NEXT: 1 1 0.25 min a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 minu a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 max a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 maxu a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 rol a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 ror a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 rori a0, a0, 1
+# CHECK-NEXT: 1 3 0.50 clz a0, a0
+# CHECK-NEXT: 1 3 0.50 clzw a0, a0
+# CHECK-NEXT: 1 3 0.50 ctz a0, a0
+# CHECK-NEXT: 1 3 0.50 ctzw a0, a0
+# CHECK-NEXT: 1 3 0.50 cpop a0, a0
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 3 0.50 cpopw a0, a0
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 1 0.25 rev8 a0, a0
+# CHECK-NEXT: 1 1 0.25 orc.b a0, a0
+# CHECK-NEXT: 1 5 0.50 * lb a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 add a0, a0, a0
+# CHECK-NEXT: 1 5 0.50 * lh a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 and a0, a0, a0
+# CHECK-NEXT: 1 5 0.50 * lw a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 or a0, a0, a0
+# CHECK-NEXT: 1 5 0.50 * ld a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 xor a0, a0, a0
+# CHECK-NEXT: 1 5 0.50 * lbu a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 addi a0, a0, 1
+# CHECK-NEXT: 1 5 0.50 * lhu a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 sub a0, a0, a0
+# CHECK-NEXT: 1 5 0.50 * lwu a0, 0(a0)
+# CHECK-NEXT: 1 1 0.25 addw a0, a0, a0
+# CHECK-NEXT: 1 1 1.00 jr a0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - XS2ALU
+# CHECK-NEXT: [0.1] - XS2ALU
+# CHECK-NEXT: [0.2] - XS2ALU
+# CHECK-NEXT: [0.3] - XS2ALU
+# CHECK-NEXT: [1.0] - XS2FMAC
+# CHECK-NEXT: [1.1] - XS2FMAC
+# CHECK-NEXT: [1.2] - XS2FMAC
+# CHECK-NEXT: [1.3] - XS2FMAC
+# CHECK-NEXT: [2.0] - XS2FMISC
+# CHECK-NEXT: [2.1] - XS2FMISC
+# CHECK-NEXT: [3.0] - XS2LD
+# CHECK-NEXT: [3.1] - XS2LD
+# CHECK-NEXT: [4.0] - XS2MDU
+# CHECK-NEXT: [4.1] - XS2MDU
+# CHECK-NEXT: [5] - XS2MISC
+# CHECK-NEXT: [6.0] - XS2ST
+# CHECK-NEXT: [6.1] - XS2ST
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1]
+# CHECK-NEXT: 17.00 17.00 17.00 18.00 - - - - - - 3.00 4.00 4.00 4.00 7.00 - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1] Instructions:
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - lui a0, 1
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - auipc a1, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - add a0, a0, a1
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - addi a0, a0, 1
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - addw a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - addiw a0, a0, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - sub a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - subw a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - and a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - andi a0, a0, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - or a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - ori a0, a0, 1
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - xor a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - xori a0, a0, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - sll a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - slli a0, a0, 1
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - sllw a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - slliw a0, a0, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - srl a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - srli a0, a0, 1
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - srlw a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - srliw a0, a0, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - sra a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - srai a0, a0, 1
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - sraw a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - sraiw a0, a0, 1
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - slt a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - slti a0, a0, 1
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - sltu a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - seqz a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - 1.00 - - - mul a0, a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - 1.00 - - - - mulw a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - beq a0, a0, .Ltmp0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - bne a0, a0, .Ltmp1
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - blt a0, a0, .Ltmp2
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - bltu a0, a0, .Ltmp3
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - bge a0, a0, .Ltmp4
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - bgeu a0, a0, .Ltmp5
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - add.uw a0, a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - slli.uw a0, a0, 1
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - sh1add.uw a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - sh2add.uw a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - sh3add.uw a0, a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - sh1add a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - sh2add a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - sh3add a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - andn a0, a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - orn a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - xnor a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - sext.b a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - sext.h a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - zext.h a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - min a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - minu a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - max a0, a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - maxu a0, a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - rol a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - ror a0, a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - rori a0, a0, 1
+# CHECK-NEXT: - - - - - - - - - - - - - 1.00 - - - clz a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - 1.00 - - - - clzw a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - 1.00 - - - ctz a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - 1.00 - - - - ctzw a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - 1.00 - - - cpop a0, a0
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - 1.00 - - - - cpopw a0, a0
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - rev8 a0, a0
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - orc.b a0, a0
+# CHECK-NEXT: - - - - - - - - - - - 1.00 - - - - - lb a0, 0(a0)
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - add a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - 1.00 - - - - - - lh a0, 0(a0)
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - and a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - 1.00 - - - - - lw a0, 0(a0)
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - or a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - 1.00 - - - - - - ld a0, 0(a0)
+# CHECK-NEXT: - - 1.00 - - - - - - - - - - - - - - xor a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - 1.00 - - - - - lbu a0, 0(a0)
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - addi a0, a0, 1
+# CHECK-NEXT: - - - - - - - - - - 1.00 - - - - - - lhu a0, 0(a0)
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - sub a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - 1.00 - - - - - lwu a0, 0(a0)
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - addw a0, a0, a0
+# CHECK-NEXT: - - - - - - - - - - - - - - 1.00 - - jr a0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123
+
+# CHECK: [0,0] DeER . . . . . . . . . . . . . . . . . . . . . . . . . lui a0, 1
+# CHECK-NEXT: [0,1] DeER . . . . . . . . . . . . . . . . . . . . . . . . . auipc a1, 1
+# CHECK-NEXT: [0,2] D=eER. . . . . . . . . . . . . . . . . . . . . . . . . add a0, a0, a1
+# CHECK-NEXT: [0,3] D==eER . . . . . . . . . . . . . . . . . . . . . . . . addi a0, a0, 1
+# CHECK-NEXT: [0,4] D===eER . . . . . . . . . . . . . . . . . . . . . . . . addw a0, a0, a0
+# CHECK-NEXT: [0,5] D====eER . . . . . . . . . . . . . . . . . . . . . . . . addiw a0, a0, 1
+# CHECK-NEXT: [0,6] .D====eER . . . . . . . . . . . . . . . . . . . . . . . . sub a0, a0, a0
+# CHECK-NEXT: [0,7] .D=====eER. . . . . . . . . . . . . . . . . . . . . . . . subw a0, a0, a0
+# CHECK-NEXT: [0,8] .D======eER . . . . . . . . . . . . . . . . . . . . . . . and a0, a0, a0
+# CHECK-NEXT: [0,9] .D=======eER . . . . . . . . . . . . . . . . . . . . . . . andi a0, a0, 1
+# CHECK-NEXT: [0,10] .D========eER . . . . . . . . . . . . . . . . . . . . . . . or a0, a0, a0
+# CHECK-NEXT: [0,11] .D=========eER . . . . . . . . . . . . . . . . . . . . . . . ori a0, a0, 1
+# CHECK-NEXT: [0,12] . D=========eER. . . . . . . . . . . . . . . . . . . . . . . xor a0, a0, a0
+# CHECK-NEXT: [0,13] . D==========eER . . . . . . . . . . . . . . . . . . . . . . xori a0, a0, 1
+# CHECK-NEXT: [0,14] . D===========eER . . . . . . . . . . . . . . . . . . . . . . sll a0, a0, a0
+# CHECK-NEXT: [0,15] . D============eER . . . . . . . . . . . . . . . . . . . . . . slli a0, a0, 1
+# CHECK-NEXT: [0,16] . D=============eER . . . . . . . . . . . . . . . . . . . . . . sllw a0, a0, a0
+# CHECK-NEXT: [0,17] . D==============eER. . . . . . . . . . . . . . . . . . . . . . slliw a0, a0, 1
+# CHECK-NEXT: [0,18] . D==============eER . . . . . . . . . . . . . . . . . . . . . srl a0, a0, a0
+# CHECK-NEXT: [0,19] . D===============eER . . . . . . . . . . . . . . . . . . . . . srli a0, a0, 1
+# CHECK-NEXT: [0,20] . D===============eER . . . . . . . . . . . . . . . . . . . . . srlw a0, a0, a0
+# CHECK-NEXT: [0,21] . D===============eER . . . . . . . . . . . . . . . . . . . . . srliw a0, a0, 1
+# CHECK-NEXT: [0,22] . .D===============eER. . . . . . . . . . . . . . . . . . . . . sra a0, a0, a0
+# CHECK-NEXT: [0,23] . . D===============eER . . . . . . . . . . . . . . . . . . . . srai a0, a0, 1
+# CHECK-NEXT: [0,24] . . D===============eER . . . . . . . . . . . . . . . . . . . . sraw a0, a0, a0
+# CHECK-NEXT: [0,25] . . D===============eER . . . . . . . . . . . . . . . . . . . . sraiw a0, a0, 1
+# CHECK-NEXT: [0,26] . . D===============eER . . . . . . . . . . . . . . . . . . . . slt a0, a0, a0
+# CHECK-NEXT: [0,27] . . .D===============eER. . . . . . . . . . . . . . . . . . . . slti a0, a0, 1
+# CHECK-NEXT: [0,28] . . . D===============eER . . . . . . . . . . . . . . . . . . . sltu a0, a0, a0
+# CHECK-NEXT: [0,29] . . . D===============eER . . . . . . . . . . . . . . . . . . . seqz a0, a0
+# CHECK-NEXT: [0,30] . . . D================eeeER. . . . . . . . . . . . . . . . . . . mul a0, a0, a0
+# CHECK-NEXT: [0,31] . . . D==================eER . . . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,32] . . . D===================eeeER . . . . . . . . . . . . . . . . . . mulw a0, a0, a0
+# CHECK-NEXT: [0,33] . . . D=====================eER. . . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,34] . . . D======================eER . . . . . . . . . . . . . . . . . beq a0, a0, .Ltmp0
+# CHECK-NEXT: [0,35] . . . .D=====================eER . . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,36] . . . .D======================eER . . . . . . . . . . . . . . . . . bne a0, a0, .Ltmp1
+# CHECK-NEXT: [0,37] . . . . D=====================eER . . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,38] . . . . D======================eER . . . . . . . . . . . . . . . . . blt a0, a0, .Ltmp2
+# CHECK-NEXT: [0,39] . . . . D=====================eER . . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,40] . . . . D======================eER . . . . . . . . . . . . . . . . . bltu a0, a0, .Ltmp3
+# CHECK-NEXT: [0,41] . . . . D=====================eER . . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,42] . . . . D======================eER. . . . . . . . . . . . . . . . . bge a0, a0, .Ltmp4
+# CHECK-NEXT: [0,43] . . . . D=====================eER. . . . . . . . . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,44] . . . . D======================eER . . . . . . . . . . . . . . . . bgeu a0, a0, .Ltmp5
+# CHECK-NEXT: [0,45] . . . . .D=====================eER . . . . . . . . . . . . . . . . add.uw a0, a0, a0
+# CHECK-NEXT: [0,46] . . . . . D=====================eER . . . . . . . . . . . . . . . . slli.uw a0, a0, 1
+# CHECK-NEXT: [0,47] . . . . . D=====================eER . . . . . . . . . . . . . . . . sh1add.uw a0, a0, a0
+# CHECK-NEXT: [0,48] . . . . . D=====================eER . . . . . . . . . . . . . . . . sh2add.uw a0, a0, a0
+# CHECK-NEXT: [0,49] . . . . . D=====================eER. . . . . . . . . . . . . . . . sh3add.uw a0, a0, a0
+# CHECK-NEXT: [0,50] . . . . . .D=====================eER . . . . . . . . . . . . . . . sh1add a0, a0, a0
+# CHECK-NEXT: [0,51] . . . . . . D=====================eER . . . . . . . . . . . . . . . sh2add a0, a0, a0
+# CHECK-NEXT: [0,52] . . . . . . D=====================eER . . . . . . . . . . . . . . . sh3add a0, a0, a0
+# CHECK-NEXT: [0,53] . . . . . . D=====================eER . . . . . . . . . . . . . . . andn a0, a0, a0
+# CHECK-NEXT: [0,54] . . . . . . . D==================eER. . . . . . . . . . . . . . . orn a0, a0, a0
+# CHECK-NEXT: [0,55] . . . . . . . . D===============eER . . . . . . . . . . . . . . xnor a0, a0, a0
+# CHECK-NEXT: [0,56] . . . . . . . . D===============eER . . . . . . . . . . . . . . sext.b a0, a0
+# CHECK-NEXT: [0,57] . . . . . . . . D===============eER . . . . . . . . . . . . . . sext.h a0, a0
+# CHECK-NEXT: [0,58] . . . . . . . . D===============eER . . . . . . . . . . . . . . zext.h a0, a0
+# CHECK-NEXT: [0,59] . . . . . . . . .D===============eER. . . . . . . . . . . . . . min a0, a0, a0
+# CHECK-NEXT: [0,60] . . . . . . . . . D===============eER . . . . . . . . . . . . . minu a0, a0, a0
+# CHECK-NEXT: [0,61] . . . . . . . . . D===============eER . . . . . . . . . . . . . max a0, a0, a0
+# CHECK-NEXT: [0,62] . . . . . . . . . D===============eER . . . . . . . . . . . . . maxu a0, a0, a0
+# CHECK-NEXT: [0,63] . . . . . . . . . D===============eER . . . . . . . . . . . . . rol a0, a0, a0
+# CHECK-NEXT: [0,64] . . . . . . . . . .D===============eER. . . . . . . . . . . . . ror a0, a0, a0
+# CHECK-NEXT: [0,65] . . . . . . . . . . D===============eER . . . . . . . . . . . . rori a0, a0, 1
+# CHECK-NEXT: [0,66] . . . . . . . . . . D================eeeER . . . . . . . . . . . . clz a0, a0
+# CHECK-NEXT: [0,67] . . . . . . . . . . D===================eeeER . . . . . . . . . . . clzw a0, a0
+# CHECK-NEXT: [0,68] . . . . . . . . . . D======================eeeER. . . . . . . . . . . ctz a0, a0
+# CHECK-NEXT: [0,69] . . . . . . . . . . D=========================eeeER . . . . . . . . . . ctzw a0, a0
+# CHECK-NEXT: [0,70] . . . . . . . . . . D============================eeeER . . . . . . . . . cpop a0, a0
+# CHECK-NEXT: [0,71] . . . . . . . . . . D==============================eER . . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,72] . . . . . . . . . . D===============================eeeER. . . . . . . . . cpopw a0, a0
+# CHECK-NEXT: [0,73] . . . . . . . . . . D=================================eER . . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,74] . . . . . . . . . . D=================================eER . . . . . . . . rev8 a0, a0
+# CHECK-NEXT: [0,75] . . . . . . . . . . .D=================================eER . . . . . . . . orc.b a0, a0
+# CHECK-NEXT: [0,76] . . . . . . . . . . .D==================================eeeeeER . . . . . . . lb a0, 0(a0)
+# CHECK-NEXT: [0,77] . . . . . . . . . . . D=====================================eER . . . . . . . add a0, a0, a0
+# CHECK-NEXT: [0,78] . . . . . . . . . . . D======================================eeeeeER . . . . . . lh a0, 0(a0)
+# CHECK-NEXT: [0,79] . . . . . . . . . . . D=========================================eER . . . . . . and a0, a0, a0
+# CHECK-NEXT: [0,80] . . . . . . . . . . . D==========================================eeeeeER . . . . . lw a0, 0(a0)
+# CHECK-NEXT: [0,81] . . . . . . . . . . . D=============================================eER . . . . . or a0, a0, a0
+# CHECK-NEXT: [0,82] . . . . . . . . . . . D==============================================eeeeeER . . . . ld a0, 0(a0)
+# CHECK-NEXT: [0,83] . . . . . . . . . . . D=================================================eER . . . . xor a0, a0, a0
+# CHECK-NEXT: [0,84] . . . . . . . . . . . D==================================================eeeeeER . . . lbu a0, 0(a0)
+# CHECK-NEXT: [0,85] . . . . . . . . . . . .D=====================================================eER . . . addi a0, a0, 1
+# CHECK-NEXT: [0,86] . . . . . . . . . . . .D======================================================eeeeeER . . lhu a0, 0(a0)
+# CHECK-NEXT: [0,87] . . . . . . . . . . . . D=========================================================eER . . sub a0, a0, a0
+# CHECK-NEXT: [0,88] . . . . . . . . . . . . D==========================================================eeeeeER. lwu a0, 0(a0)
+# CHECK-NEXT: [0,89] . . . . . . . . . . . . D=============================================================eER. addw a0, a0, a0
+# CHECK-NEXT: [0,90] . . . . . . . . . . . . D==============================================================eER jr a0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 lui a0, 1
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 auipc a1, 1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 add a0, a0, a1
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 addi a0, a0, 1
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 addw a0, a0, a0
+# CHECK-NEXT: 5. 1 5.0 0.0 0.0 addiw a0, a0, 1
+# CHECK-NEXT: 6. 1 5.0 0.0 0.0 sub a0, a0, a0
+# CHECK-NEXT: 7. 1 6.0 0.0 0.0 subw a0, a0, a0
+# CHECK-NEXT: 8. 1 7.0 0.0 0.0 and a0, a0, a0
+# CHECK-NEXT: 9. 1 8.0 0.0 0.0 andi a0, a0, 1
+# CHECK-NEXT: 10. 1 9.0 0.0 0.0 or a0, a0, a0
+# CHECK-NEXT: 11. 1 10.0 0.0 0.0 ori a0, a0, 1
+# CHECK-NEXT: 12. 1 10.0 0.0 0.0 xor a0, a0, a0
+# CHECK-NEXT: 13. 1 11.0 0.0 0.0 xori a0, a0, 1
+# CHECK-NEXT: 14. 1 12.0 0.0 0.0 sll a0, a0, a0
+# CHECK-NEXT: 15. 1 13.0 0.0 0.0 slli a0, a0, 1
+# CHECK-NEXT: 16. 1 14.0 0.0 0.0 sllw a0, a0, a0
+# CHECK-NEXT: 17. 1 15.0 0.0 0.0 slliw a0, a0, 1
+# CHECK-NEXT: 18. 1 15.0 0.0 0.0 srl a0, a0, a0
+# CHECK-NEXT: 19. 1 16.0 0.0 0.0 srli a0, a0, 1
+# CHECK-NEXT: 20. 1 16.0 0.0 0.0 srlw a0, a0, a0
+# CHECK-NEXT: 21. 1 16.0 0.0 0.0 srliw a0, a0, 1
+# CHECK-NEXT: 22. 1 16.0 0.0 0.0 sra a0, a0, a0
+# CHECK-NEXT: 23. 1 16.0 0.0 0.0 srai a0, a0, 1
+# CHECK-NEXT: 24. 1 16.0 0.0 0.0 sraw a0, a0, a0
+# CHECK-NEXT: 25. 1 16.0 0.0 0.0 sraiw a0, a0, 1
+# CHECK-NEXT: 26. 1 16.0 0.0 0.0 slt a0, a0, a0
+# CHECK-NEXT: 27. 1 16.0 0.0 0.0 slti a0, a0, 1
+# CHECK-NEXT: 28. 1 16.0 0.0 0.0 sltu a0, a0, a0
+# CHECK-NEXT: 29. 1 16.0 0.0 0.0 seqz a0, a0
+# CHECK-NEXT: 30. 1 17.0 0.0 0.0 mul a0, a0, a0
+# CHECK-NEXT: 31. 1 19.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 32. 1 20.0 0.0 0.0 mulw a0, a0, a0
+# CHECK-NEXT: 33. 1 22.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 34. 1 23.0 0.0 0.0 beq a0, a0, .Ltmp0
+# CHECK-NEXT: 35. 1 22.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 36. 1 23.0 0.0 0.0 bne a0, a0, .Ltmp1
+# CHECK-NEXT: 37. 1 22.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 38. 1 23.0 0.0 0.0 blt a0, a0, .Ltmp2
+# CHECK-NEXT: 39. 1 22.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 40. 1 23.0 0.0 0.0 bltu a0, a0, .Ltmp3
+# CHECK-NEXT: 41. 1 22.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 42. 1 23.0 0.0 0.0 bge a0, a0, .Ltmp4
+# CHECK-NEXT: 43. 1 22.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 44. 1 23.0 0.0 0.0 bgeu a0, a0, .Ltmp5
+# CHECK-NEXT: 45. 1 22.0 0.0 0.0 add.uw a0, a0, a0
+# CHECK-NEXT: 46. 1 22.0 0.0 0.0 slli.uw a0, a0, 1
+# CHECK-NEXT: 47. 1 22.0 0.0 0.0 sh1add.uw a0, a0, a0
+# CHECK-NEXT: 48. 1 22.0 0.0 0.0 sh2add.uw a0, a0, a0
+# CHECK-NEXT: 49. 1 22.0 0.0 0.0 sh3add.uw a0, a0, a0
+# CHECK-NEXT: 50. 1 22.0 0.0 0.0 sh1add a0, a0, a0
+# CHECK-NEXT: 51. 1 22.0 0.0 0.0 sh2add a0, a0, a0
+# CHECK-NEXT: 52. 1 22.0 0.0 0.0 sh3add a0, a0, a0
+# CHECK-NEXT: 53. 1 22.0 0.0 0.0 andn a0, a0, a0
+# CHECK-NEXT: 54. 1 19.0 0.0 0.0 orn a0, a0, a0
+# CHECK-NEXT: 55. 1 16.0 0.0 0.0 xnor a0, a0, a0
+# CHECK-NEXT: 56. 1 16.0 0.0 0.0 sext.b a0, a0
+# CHECK-NEXT: 57. 1 16.0 0.0 0.0 sext.h a0, a0
+# CHECK-NEXT: 58. 1 16.0 0.0 0.0 zext.h a0, a0
+# CHECK-NEXT: 59. 1 16.0 0.0 0.0 min a0, a0, a0
+# CHECK-NEXT: 60. 1 16.0 0.0 0.0 minu a0, a0, a0
+# CHECK-NEXT: 61. 1 16.0 0.0 0.0 max a0, a0, a0
+# CHECK-NEXT: 62. 1 16.0 0.0 0.0 maxu a0, a0, a0
+# CHECK-NEXT: 63. 1 16.0 0.0 0.0 rol a0, a0, a0
+# CHECK-NEXT: 64. 1 16.0 0.0 0.0 ror a0, a0, a0
+# CHECK-NEXT: 65. 1 16.0 0.0 0.0 rori a0, a0, 1
+# CHECK-NEXT: 66. 1 17.0 0.0 0.0 clz a0, a0
+# CHECK-NEXT: 67. 1 20.0 0.0 0.0 clzw a0, a0
+# CHECK-NEXT: 68. 1 23.0 0.0 0.0 ctz a0, a0
+# CHECK-NEXT: 69. 1 26.0 0.0 0.0 ctzw a0, a0
+# CHECK-NEXT: 70. 1 29.0 0.0 0.0 cpop a0, a0
+# CHECK-NEXT: 71. 1 31.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 72. 1 32.0 0.0 0.0 cpopw a0, a0
+# CHECK-NEXT: 73. 1 34.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 74. 1 34.0 0.0 0.0 rev8 a0, a0
+# CHECK-NEXT: 75. 1 34.0 0.0 0.0 orc.b a0, a0
+# CHECK-NEXT: 76. 1 35.0 0.0 0.0 lb a0, 0(a0)
+# CHECK-NEXT: 77. 1 38.0 0.0 0.0 add a0, a0, a0
+# CHECK-NEXT: 78. 1 39.0 0.0 0.0 lh a0, 0(a0)
+# CHECK-NEXT: 79. 1 42.0 0.0 0.0 and a0, a0, a0
+# CHECK-NEXT: 80. 1 43.0 0.0 0.0 lw a0, 0(a0)
+# CHECK-NEXT: 81. 1 46.0 0.0 0.0 or a0, a0, a0
+# CHECK-NEXT: 82. 1 47.0 0.0 0.0 ld a0, 0(a0)
+# CHECK-NEXT: 83. 1 50.0 0.0 0.0 xor a0, a0, a0
+# CHECK-NEXT: 84. 1 51.0 0.0 0.0 lbu a0, 0(a0)
+# CHECK-NEXT: 85. 1 54.0 0.0 0.0 addi a0, a0, 1
+# CHECK-NEXT: 86. 1 55.0 0.0 0.0 lhu a0, 0(a0)
+# CHECK-NEXT: 87. 1 58.0 0.0 0.0 sub a0, a0, a0
+# CHECK-NEXT: 88. 1 59.0 0.0 0.0 lwu a0, 0(a0)
+# CHECK-NEXT: 89. 1 62.0 0.0 0.0 addw a0, a0, a0
+# CHECK-NEXT: 90. 1 63.0 0.0 0.0 jr a0
+# CHECK-NEXT: 1 22.7 0.0 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/RISCV/XiangShan/load-to-alu.s b/llvm/test/tools/llvm-mca/RISCV/XiangShan/load-to-alu.s
new file mode 100644
index 000000000000..e1925e7647e3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/XiangShan/load-to-alu.s
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=xiangshan-nanhu -timeline -iterations=1 < %s | FileCheck %s
+
+# Test XiangShan load to ALU (4 cycles)
+ld a1, 0(a0)
+addi a2, a1, 1
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 2
+# CHECK-NEXT: Total Cycles: 8
+# CHECK-NEXT: Total uOps: 2
+
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 0.50 * ld a1, 0(a0)
+# CHECK-NEXT: 1 1 0.25 addi a2, a1, 1
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - XS2ALU
+# CHECK-NEXT: [0.1] - XS2ALU
+# CHECK-NEXT: [0.2] - XS2ALU
+# CHECK-NEXT: [0.3] - XS2ALU
+# CHECK-NEXT: [1.0] - XS2FMAC
+# CHECK-NEXT: [1.1] - XS2FMAC
+# CHECK-NEXT: [1.2] - XS2FMAC
+# CHECK-NEXT: [1.3] - XS2FMAC
+# CHECK-NEXT: [2.0] - XS2FMISC
+# CHECK-NEXT: [2.1] - XS2FMISC
+# CHECK-NEXT: [3.0] - XS2LD
+# CHECK-NEXT: [3.1] - XS2LD
+# CHECK-NEXT: [4.0] - XS2MDU
+# CHECK-NEXT: [4.1] - XS2MDU
+# CHECK-NEXT: [5] - XS2MISC
+# CHECK-NEXT: [6.0] - XS2ST
+# CHECK-NEXT: [6.1] - XS2ST
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1]
+# CHECK-NEXT: - - - 1.00 - - - - - - - 1.00 - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - 1.00 - - - - - ld a1, 0(a0)
+# CHECK-NEXT: - - - 1.00 - - - - - - - - - - - - - addi a2, a1, 1
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 01234567
+
+# CHECK: [0,0] DeeeeeER ld a1, 0(a0)
+# CHECK-NEXT: [0,1] D====eER addi a2, a1, 1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld a1, 0(a0)
+# CHECK-NEXT: 1. 1 5.0 0.0 0.0 addi a2, a1, 1
+# CHECK-NEXT: 1 3.0 0.5 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s
index 6d33fdb3359b..5c12c520b04a 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s
@@ -1084,24 +1084,24 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: 1 4 1.00 kshiftlw $2, %k1, %k2
# CHECK-NEXT: 1 4 1.00 kshiftrw $2, %k1, %k2
# CHECK-NEXT: 1 4 1.00 kunpckbw %k0, %k1, %k2
-# CHECK-NEXT: 1 4 0.50 vaddpd %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vaddpd (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vaddpd (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vaddpd %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vaddpd (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vaddpd %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vaddpd (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vaddps %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vaddps (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vaddps (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vaddps %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vaddps (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vaddps %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vaddps (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vaddpd %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vaddpd (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vaddpd (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vaddpd %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vaddpd (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vaddpd %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vaddpd (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vaddps %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vaddps (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vaddps %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vaddps (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vaddps %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vaddps (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 3 1.00 valignd $1, %zmm16, %zmm17, %zmm19
# CHECK-NEXT: 2 10 1.00 * valignd $1, (%rax), %zmm17, %zmm19
# CHECK-NEXT: 2 10 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19
@@ -1510,24 +1510,24 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: 2 1 0.50 * vmovups %zmm16, (%rax) {%k1}
# CHECK-NEXT: 1 1 0.50 vmovups %zmm16, %zmm19 {%k1} {z}
# CHECK-NEXT: 2 8 0.50 * vmovups (%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vmulpd %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vmulpd (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vmulpd (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vmulpd %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vmulpd (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vmulpd %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vmulpd (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vmulps %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vmulps (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vmulps (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vmulps %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vmulps (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vmulps %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vmulps (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vmulpd %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vmulpd (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vmulpd (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vmulpd %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vmulpd (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vmulpd %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vmulpd (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vmulps %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vmulps (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vmulps (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vmulps %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vmulps (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vmulps %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vmulps (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 1 1.00 vpabsd %zmm16, %zmm19
# CHECK-NEXT: 2 8 1.00 * vpabsd (%rax), %zmm19
# CHECK-NEXT: 2 8 1.00 * vpabsd (%rax){1to16}, %zmm19
@@ -1958,24 +1958,24 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: 1 4 1.00 vptestnmq %zmm0, %zmm1, %k2 {%k3}
# CHECK-NEXT: 2 11 1.00 * vptestnmq (%rax), %zmm1, %k2 {%k3}
# CHECK-NEXT: 2 11 1.00 * vptestnmq (%rax){1to8}, %zmm1, %k2 {%k3}
-# CHECK-NEXT: 1 4 0.50 vsubpd %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vsubpd (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vsubpd (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vsubpd %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vsubpd (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vsubpd %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vsubpd (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vsubps %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vsubps (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vsubps (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vsubps %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vsubps (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vsubps %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vsubps (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vsubpd %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vsubpd (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vsubpd (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vsubpd %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vsubpd (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vsubpd %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vsubpd (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vsubps %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vsubps (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vsubps (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vsubps %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vsubps (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vsubps %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vsubps (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 2 1.00 vucomiss %xmm16, %xmm17
# CHECK-NEXT: 2 7 1.00 * vucomiss (%rax), %xmm17
# CHECK-NEXT: 1 1 1.00 vunpckhpd %zmm16, %zmm17, %zmm19
@@ -2031,7 +2031,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
-# CHECK-NEXT: - 612.00 371.17 99.67 327.50 327.50 8.00 612.17 2.00 8.00 8.00 8.00
+# CHECK-NEXT: - 612.00 398.17 99.67 327.50 327.50 8.00 585.17 2.00 8.00 8.00 8.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
@@ -2044,24 +2044,24 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: - - - - - - - 1.00 - - - - kshiftlw $2, %k1, %k2
# CHECK-NEXT: - - - - - - - 1.00 - - - - kshiftrw $2, %k1, %k2
# CHECK-NEXT: - - - - - - - 1.00 - - - - kunpckbw %k0, %k1, %k2
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vaddpd %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddpd (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddpd (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vaddpd %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddpd (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vaddpd %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddpd (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vaddps %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddps (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddps (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vaddps %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddps (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vaddps %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddps (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vaddpd %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddpd (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddpd (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vaddpd %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddpd (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vaddpd %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddpd (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vaddps %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddps (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddps (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vaddps %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddps (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vaddps %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddps (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: - - - - - - - 1.00 - - - - valignd $1, %zmm16, %zmm17, %zmm19
# CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - valignd $1, (%rax), %zmm17, %zmm19
# CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - valignd $1, (%rax){1to16}, %zmm17, %zmm19
@@ -2470,24 +2470,24 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vmovups %zmm16, (%rax) {%k1}
# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmovups %zmm16, %zmm19 {%k1} {z}
# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vmovups (%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulpd %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulpd %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulpd %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulps %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulps (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulps (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulps %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulps (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vmulps %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulps (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vmulpd %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulpd (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vmulpd %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulpd (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vmulpd %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulpd (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vmulps %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulps (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulps (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vmulps %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulps (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vmulps %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulps (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vmulps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: - - 1.00 - - - - - - - - - vpabsd %zmm16, %zmm19
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vpabsd (%rax), %zmm19
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vpabsd (%rax){1to16}, %zmm19
@@ -2918,24 +2918,24 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: - - - - - - - 1.00 - - - - vptestnmq %zmm0, %zmm1, %k2 {%k3}
# CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vptestnmq (%rax), %zmm1, %k2 {%k3}
# CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vptestnmq (%rax){1to8}, %zmm1, %k2 {%k3}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vsubpd %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubpd (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubpd (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vsubpd %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubpd (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vsubpd %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubpd (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vsubps %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubps (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubps (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vsubps %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubps (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vsubps %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubps (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vsubpd %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubpd (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubpd (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vsubpd %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubpd (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vsubpd %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubpd (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubpd (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vsubps %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubps (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubps (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vsubps %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubps (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vsubps %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubps (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vsubps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: - - 1.00 - - - - - - - - - vucomiss %xmm16, %xmm17
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vucomiss (%rax), %xmm17
# CHECK-NEXT: - - - - - - - 1.00 - - - - vunpckhpd %zmm16, %zmm17, %zmm19
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s
index 4da4cebd6099..42041cd389fc 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s
@@ -753,30 +753,30 @@ vxorps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: 4 22 3.00 * vpmullq (%rax), %zmm17, %zmm19 {%k1}
# CHECK-NEXT: 3 15 3.00 vpmullq %zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 4 22 3.00 * vpmullq (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vrangepd $ab, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vrangepd $ab, (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vrangeps $ab, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vrangeps $ab, (%rax), %zmm17, %zmm19
-# CHECK-NEXT: 2 11 0.50 * vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 2 11 0.50 * vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 2 11 0.50 * vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 1 4 0.50 vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: 1 4 0.50 vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vrangepd $ab, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vrangepd $ab, (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vrangeps $ab, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vrangeps $ab, (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 2 11 1.00 * vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 2 11 1.00 * vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 2 11 1.00 * vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: 1 4 1.00 vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 1 4 1.00 vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: 1 4 1.00 vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 4 0.50 vrangesd $ab, %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 2 10 0.50 * vrangesd $ab, (%rax), %xmm17, %xmm19
# CHECK-NEXT: 1 4 0.50 vrangesd $ab, %xmm16, %xmm17, %xmm19 {%k1}
@@ -872,7 +872,7 @@ vxorps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
-# CHECK-NEXT: - - 197.25 73.25 102.00 102.00 5.50 209.25 0.25 5.50 5.50 5.50
+# CHECK-NEXT: - - 209.25 73.25 102.00 102.00 5.50 197.25 0.25 5.50 5.50 5.50
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
@@ -1162,30 +1162,30 @@ vxorps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: - - 3.00 - 0.50 0.50 - - - - - - vpmullq (%rax), %zmm17, %zmm19 {%k1}
# CHECK-NEXT: - - 3.00 - - - - - - - - - vpmullq %zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: - - 3.00 - 0.50 0.50 - - - - - - vpmullq (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangepd $ab, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangepd $ab, (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangeps $ab, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangeps $ab, (%rax), %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - - - vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT: - - 0.50 - - - - 0.50 - - - - vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangepd $ab, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangepd $ab, (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangepd $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangepd $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangepd $ab, (%rax){1to8}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangepd $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangeps $ab, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangeps $ab, (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangeps $ab, %zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangeps $ab, (%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vrangeps $ab, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT: - - 1.00 - - - - - - - - - vrangeps $ab, {sae}, %zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vrangesd $ab, %xmm16, %xmm17, %xmm19
# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vrangesd $ab, (%rax), %xmm17, %xmm19
# CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vrangesd $ab, %xmm16, %xmm17, %xmm19 {%k1}
diff --git a/llvm/test/tools/llvm-nm/wasm/linked.yaml b/llvm/test/tools/llvm-nm/wasm/linked.yaml
index 992c1811743b..6aee4b9fc184 100644
--- a/llvm/test/tools/llvm-nm/wasm/linked.yaml
+++ b/llvm/test/tools/llvm-nm/wasm/linked.yaml
@@ -1,10 +1,15 @@
# RUN: yaml2obj %s -o %t.wasm
# RUN: llvm-nm %t.wasm | FileCheck %s
+# RUN: llvm-nm -P %t.wasm | FileCheck %s --check-prefix=POSIX
# CHECK: 0000009f T my_func_export
# CHECK-NEXT: 0000002a D my_global_export
# CHECK-NEXT: 00000000 D my_table_export
+# POSIX: my_func_export T 9f 3
+# POSIX-NEXT: my_global_export D 2a 0
+# POSIX-NEXT: my_table_export D 0 0
+
--- !WASM
FileHeader:
Version: 0x1
diff --git a/llvm/test/tools/llvm-nm/wasm/print-size.test b/llvm/test/tools/llvm-nm/wasm/print-size.test
index c166edb4641c..610929b959b5 100644
--- a/llvm/test/tools/llvm-nm/wasm/print-size.test
+++ b/llvm/test/tools/llvm-nm/wasm/print-size.test
@@ -43,4 +43,4 @@ Sections:
Size: 32
# CHECK: 00000000 00000020 D a_data_symbol
-# CHECK: 00000001 00000000 T a_func
+# CHECK: 00000001 0000000d T a_func
diff --git a/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test b/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
index 09ff8ae951d4..6c07f9f88bda 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/ihex-writer.test
@@ -70,8 +70,8 @@
# SIGN_EXTENDED-NEXT: :051000000001020304E1
# SIGN_EXTENDED-NEXT: :00000001FF
-# BAD-ADDR: error: {{.*}}: Section '.text2' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
-# BAD-ADDR2: error: {{.*}}: Section '.text3' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
+# BAD-ADDR: error: {{.*}}: section '.text2' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
+# BAD-ADDR2: error: {{.*}}: section '.text3' address range [0x{{.*}}, 0x{{.*}}] is not 32 bit
# There shouldn't be 'ExtendedAddr' nor 'Data' records
# ZERO_SIZE_SEC-NOT: :02000004
@@ -81,4 +81,4 @@
# START1: :040000030000FFFFFB
# START2: :0400000500100000E7
# START3: :040000058000100067
-# BAD-START: error: {{.*}}: Entry point address 0x{{.*}} overflows 32 bits
+# BAD-START: error: {{.*}}: entry point address 0x{{.*}} overflows 32 bits
diff --git a/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test b/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test
new file mode 100644
index 000000000000..e96b87b85924
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/srec-writer.test
@@ -0,0 +1,196 @@
+## Check for basic functionality using an input file with
+## various section types, addresses, data, and no segments.
+# RUN: yaml2obj %s --docnum=1 -o %t
+# RUN: llvm-objcopy -O srec %t - | \
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=SREC
+
+## The record type for the header should be S0 with a 2 byte address
+## of 0. For an output file named "-" the header data field should contain "2D".
+## The byte count field should therefore have a value of 4: 2 bytes for address,
+## 1 byte for output file and 1 byte for checksum.
+ # SREC:S00400002DCE
+# SREC-NEXT:S31500001000000102030405060708090A0B0C0D0E0F62
+# SREC-NEXT:S30A0000101010111213147B
+# SREC-NEXT:S30F00EFFFFF1111111111111111111159
+# SREC-NEXT:S31000FFFFF83031323334353637383940AC
+# SREC-NEXT:S30A8000100000010203045B
+# SREC-NEXT:S70500000000FA
+
+## Terminator should contain the entry point.
+# RUN: llvm-objcopy -O srec --set-start=0xF0000000 %t --only-section=.dummy - 2>&1 | \
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY
+## Sign-extended entry point is OK.
+# RUN: llvm-objcopy -O srec --set-start=0xFFFFFFFFF0000000 %t --only-section=.dummy - 2>&1 | \
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY
+
+ # ENTRY:S00400002DCE
+# ENTRY-NEXT:S705F00000000A
+
+## Start address which exceeds 32 bit range triggers an error.
+# RUN: not llvm-objcopy -O srec --set-start=0xF00000000 %t - 2>&1 | \
+# RUN: FileCheck %s --check-prefix=BAD_START
+
+# BAD_START: entry point address 0xf00000000 overflows 32 bits
+
+## Sign-extended start address which exceeds 32 bit range triggers an error.
+# RUN: not llvm-objcopy -O srec --set-start=0xFFFFFFFF0F000000 %t - 2>&1 | \
+# RUN: FileCheck %s --check-prefix=BAD_EXTENDED_START
+
+# BAD_EXTENDED_START: entry point address 0xffffffff0f000000 overflows 32 bits
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .data1
+## Records for this section should come last.
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "11111111111111111111"
+ Address: 0xEFFFFF
+ - Name: .data2
+## This section overlaps 24-bit address boundary, so we expect
+## its record type to be S3.
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "3031323334353637383940"
+ Address: 0xFFFFF8
+## Sign-extended addresses are OK.
+ - Name: .data3
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0xFFFFFFFF80001000
+ Content: "0001020304"
+ - Name: .text
+## This section's contents exceed default line length of 16 bytes
+## so we expect two lines created for it. Records for this section
+## should appear before records for the previous section.
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x1000
+ Content: "000102030405060708090A0B0C0D0E0F1011121314"
+ - Name: .bss
+## NOBITS sections are not written.
+ Type: SHT_NOBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x10100
+ Size: 0x1000
+ - Name: .dummy
+## Non-allocatable sections are not written.
+ Type: SHT_PROGBITS
+ Flags: [ ]
+ Address: 0x20FFF8
+ Size: 65536
+
+## Check for various error cases.
+
+## Check that section address range overlapping 32 bit range
+## triggers an error.
+# RUN: yaml2obj %s --docnum=2 -o %t.err
+# RUN: not llvm-objcopy -O srec --only-section=.text1 %t.err - 2>&1 | \
+# RUN: FileCheck %s --check-prefix=BAD-ADDR
+# RUN: not llvm-objcopy -O srec --only-section=.text2 %t.err - 2>&1 | \
+# RUN: FileCheck %s --check-prefix=BAD-ADDR2
+
+# BAD-ADDR: section '.text1' address range [0xfffffff8, 0x100000000] is not 32 bit
+# BAD-ADDR2: section '.text2' address range [0xffffffff0, 0xffffffff4] is not 32 bit
+
+## Check that zero length section is not written.
+# RUN: llvm-objcopy -O srec --only-section=.text %t.err - | \
+# RUN: FileCheck --match-full-lines --strict-whitespace --implicit-check-not={{.}} %s --check-prefix=ZERO_SIZE_SEC
+
+## There should be no records besides header and terminator.
+# ZERO_SIZE_SEC:S00400002DCE
+# ZERO_SIZE_SEC-NEXT:S9030000FC
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .text1
+## Part of section data is in 32-bit address range and part isn't.
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC]
+ Address: 0xFFFFFFF8
+ Content: "000102030405060708"
+ - Name: .text2
+  ## Entire section is outside of 32-bit range.
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0xFFFFFFFF0
+ Content: "0001020304"
+
+## This tests an input file with segments and expects
+## physical addresses instead of virtual addresses.
+# RUN: yaml2obj %s --docnum=3 -o %t.seg
+# RUN: llvm-objcopy -O srec %t.seg - | \
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=PADDR
+
+ # PADDR:S00400002DCE
+# PADDR-NEXT:S214100000000102030405060708090A0B0C0D0E0F63
+# PADDR-NEXT:S20910001010111213147C
+# PADDR-NEXT:S20F10001530313233343536373839407E
+# PADDR-NEXT:S20810002040414243C1
+# PADDR-NEXT:S20F10002450515253545556575859600F
+# PADDR-NEXT:S20720FFF8000000E1
+# PADDR-NEXT:S804100000EB
+
+--- !ELF
+## This file has a non-contiguous section layout with large gaps.
+## These sections are all tightly packed into one PT_LOAD segment
+## starting at physical address 0x100000. Records should use physical addresses.
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ Entry: 0x100000
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x0
+ Content: "000102030405060708090A0B0C0D0E0F1011121314"
+ - Name: .data1
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "3031323334353637383940"
+ Address: 0xFFF8
+ - Name: .data2
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "40414243"
+ Address: 0x10100
+ - Name: .data3
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "5051525354555657585960"
+ Address: 0x10FFF8
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x10100
+ Size: 0x1000
+ - Name: .dummy
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x20FFF8
+ Size: 3
+ - Name: .nonalloc
+ Type: SHT_PROGBITS
+ Flags: [ ]
+ Address: 0x300000
+ Size: 1
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ VAddr: 0xF00000000
+ PAddr: 0x100000
+ FirstSec: .text
+ LastSec: .bss
diff --git a/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test b/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test
index 1df63406a413..96dcd72b672c 100644
--- a/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test
+++ b/llvm/test/tools/llvm-objcopy/XCOFF/invalid-read.test
@@ -5,7 +5,7 @@
# RUN: yaml2obj %s --docnum=1 -o %t1
# RUN: not llvm-objcopy %t1 %t1.out 2>&1 | FileCheck %s -DFILE=%t1 --check-prefix=ERROR1
-# ERROR1: error: '[[FILE]]': The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x4 goes past the end of the file
+# ERROR1: error: '[[FILE]]': The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x20 goes past the end of the file
--- !XCOFF
FileHeader:
@@ -13,6 +13,7 @@ FileHeader:
Sections:
- SectionData: '00007400'
FileOffsetToData: 0x70
+ Size: 0x20
## Failed to read relocations.
# RUN: yaml2obj %s --docnum=2 -o %t2
@@ -35,12 +36,13 @@ Sections:
# RUN: yaml2obj %s --docnum=3 -o %t3
# RUN: not llvm-objcopy %t3 %t3.out 2>&1 | FileCheck %s -DFILE=%t3 --check-prefix=ERROR3
-# ERROR3: error: '[[FILE]]': The end of the file was unexpectedly encountered: symbol table with offset 0x15 and size 0x24 goes past the end of the file
+# ERROR3: error: '[[FILE]]': The end of the file was unexpectedly encountered: symbol table with offset 0x15 and size 0x36 goes past the end of the file
--- !XCOFF
FileHeader:
MagicNumber: 0x01DF
OffsetToSymbolTable: 0x15
+ EntriesInSymbolTable: 3
Symbols:
- Name: foo
AuxEntries:
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
index e296d7fb1fc8..ca136a6a0d5b 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
@@ -18,6 +18,11 @@ define amdgpu_kernel void @test_kernel() {
; ----------------------------------GFX11--------------------------------------
;
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx11-generic %t.o > %t-specify.txt
+; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1151 -filetype=obj -O0 -o %t.o %s
; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx1151 %t.o > %t-specify.txt
; RUN: llvm-objdump -D %t.o > %t-detect.txt
@@ -49,6 +54,11 @@ define amdgpu_kernel void @test_kernel() {
; RUN: diff %t-specify.txt %t-detect.txt
; ----------------------------------GFX10--------------------------------------
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx10.3-generic -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx10.3-generic %t.o > %t-specify.txt
+; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1036 -filetype=obj -O0 -o %t.o %s
; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx1036 %t.o > %t-specify.txt
; RUN: llvm-objdump -D %t.o > %t-detect.txt
@@ -84,6 +94,11 @@ define amdgpu_kernel void @test_kernel() {
; RUN: llvm-objdump -D %t.o > %t-detect.txt
; RUN: diff %t-specify.txt %t-detect.txt
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx10.1-generic -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx10.1-generic %t.o > %t-specify.txt
+; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -filetype=obj -O0 -o %t.o %s
; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx1013 %t.o > %t-specify.txt
; RUN: llvm-objdump -D %t.o > %t-detect.txt
@@ -107,6 +122,11 @@ define amdgpu_kernel void @test_kernel() {
; ----------------------------------GFX9---------------------------------------
;
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-generic -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-generic %t.o > %t-specify.txt
+; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj -O0 -o %t.o %s
; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx942 %t.o > %t-specify.txt
; RUN: llvm-objdump -D %t.o > %t-detect.txt
diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test
index 91354f5a951e..96cac6b849fd 100644
--- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test
+++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-traceback-table.test
@@ -112,4 +112,4 @@ Symbols:
# CHECK-NEXT: 70: 00 00 00 00
# CHECK-NEXT: ...
# CHECK-NEXT: 7c: 00 12 34 00
-# CHECK-NEXT: 80: 00 00 00 00
+# CHECK-NEXT: 80: 00 00 00
diff --git a/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test b/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test
index e80d5f6a711b..1a110fb9642b 100644
--- a/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test
+++ b/llvm/test/tools/llvm-objdump/XCOFF/section-headers.test
@@ -10,7 +10,7 @@
# CHECK-NEXT: 1 .data 00000004 00000000 DATA
# CHECK-NEXT: 2 .bss 00000000 00000010 BSS
# CHECK-NEXT: 3 .tdata 00000004 00000000 DATA
-# CHECK-NEXT: 4 .tbss 00000000 00000000 BSS
+# CHECK-NEXT: 4 .tbss 00000000 00000004 BSS
# CHECK-NEXT: 5 .dwline 00000046 00000000 DEBUG
# CHECK-NEXT: 6 .debug 00000046 00000000 DEBUG
diff --git a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml
index 9c1e90a2d896..f4abf1253d4b 100644
--- a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml
+++ b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml
@@ -2,8 +2,8 @@
# RUN: llvm-objdump -t %t.so | FileCheck %s
#
# CHECK: SYMBOL TABLE:
-# CHECK-NEXT: 00000001 g F CODE my_func_export
-# CHECK-NEXT: 0000002a g O DATA my_global_export
+# CHECK-NEXT: 00000001 g F CODE 00000003 my_func_export
+# CHECK-NEXT: 0000002a g O DATA 00000000 my_global_export
--- !WASM
FileHeader:
diff --git a/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml
new file mode 100644
index 000000000000..dc87e62bcaac
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/wasm/linked-symbol-table-namesec.yaml
@@ -0,0 +1,87 @@
+# RUN: yaml2obj %s -o %t.wasm
+# RUN: llvm-objdump -t %t.wasm | FileCheck %s
+#
+# CHECK: SYMBOL TABLE:
+# CHECK-NEXT: 00000000 F *UND* 00000000 my_func_import_name
+# CHECK-NEXT: 00000083 g F CODE 00000003 my_func_export_name
+# CHECK-NEXT: 00000086 l F CODE 00000003 my_func_local_name
+# CHECK-NEXT: 00000000 *UND* 00000000 my_global_import_name
+# CHECK-NEXT: 00000001 g GLOBAL 00000000 my_global_export_name
+# CHECK-NEXT: 00000000 l O DATA 00000004 my_datasegment_name
+
+--- !WASM
+FileHeader:
+ Version: 0x1
+Sections:
+ - Type: TYPE
+ Signatures:
+ - Index: 0
+ ParamTypes: []
+ ReturnTypes: []
+ - Type: IMPORT
+ Imports:
+ - Module: env
+ Field: foo
+ Kind: FUNCTION
+ SigIndex: 0
+ - Module: env
+ Field: bar
+ Kind: GLOBAL
+ GlobalType: I32
+ GlobalMutable: true
+ - Module: env
+ Field: memory
+ Kind: MEMORY
+ Memory:
+ Minimum: 0x1
+ - Type: FUNCTION
+ FunctionTypes: [ 0, 0 ]
+ - Type: GLOBAL
+ Globals:
+ - Index: 1
+ Mutable: false
+ Type: I32
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 42
+ - Type: EXPORT
+ Exports:
+ - Name: my_func_export
+ Kind: FUNCTION
+ Index: 1
+ - Name: my_global_export
+ Kind: GLOBAL
+ Index: 1
+ - Type: CODE
+ Functions:
+ - Index: 1
+ Locals:
+ Body: 00
+ - Index: 2
+ Locals:
+ Body: 00
+ - Type: DATA
+ Segments:
+ - SectionOffset: 0
+ InitFlags: 0
+ Offset:
+ Opcode: I32_CONST
+ Value: 0
+ Content: 'abcd1234'
+ - Type: CUSTOM
+ Name: name
+ FunctionNames:
+ - Index: 0
+ Name: my_func_import_name
+ - Index: 1
+ Name: my_func_export_name
+ - Index: 2
+ Name: my_func_local_name
+ GlobalNames:
+ - Index: 0
+ Name: my_global_import_name
+ - Index: 1
+ Name: my_global_export_name
+ DataSegmentNames:
+ - Index: 0
+ Name: my_datasegment_name
diff --git a/llvm/test/tools/llvm-objdump/wasm/symbol-table.test b/llvm/test/tools/llvm-objdump/wasm/symbol-table.test
index b7301a201fdd..ccb07461545a 100644
--- a/llvm/test/tools/llvm-objdump/wasm/symbol-table.test
+++ b/llvm/test/tools/llvm-objdump/wasm/symbol-table.test
@@ -1,9 +1,9 @@
RUN: llvm-objdump -t %p/Inputs/trivial.obj.wasm | FileCheck %s
CHECK: SYMBOL TABLE:
-CHECK-NEXT: 00000001 g F CODE main
-CHECK-NEXT: 00000000 l O DATA .L.str
-CHECK-NEXT: 00000000 F *UND* puts
-CHECK-NEXT: 00000019 l F CODE .LSomeOtherFunction_bitcast
-CHECK-NEXT: 00000000 F *UND* SomeOtherFunction
-CHECK-NEXT: 00000010 g O DATA var
+CHECK-NEXT: 00000001 g F CODE 00000018 main
+CHECK-NEXT: 00000000 l O DATA 0000000d .L.str
+CHECK-NEXT: 00000000 F *UND* 00000000 puts
+CHECK-NEXT: 00000019 l F CODE 0000000b .LSomeOtherFunction_bitcast
+CHECK-NEXT: 00000000 F *UND* 00000000 SomeOtherFunction
+CHECK-NEXT: 00000010 g O DATA 00000004 var
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
index e2266d81d1a5..7fbf4aae6c24 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
@@ -253,6 +253,9 @@
# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1013
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1013 -DFLAG_VALUE=0x42
+# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC -DFLAG_VALUE=0x52
+
# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1013
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1013 -DFLAG_VALUE=0x42
@@ -322,6 +325,9 @@
# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1036
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1036 -DFLAG_VALUE=0x45
+# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC -DFLAG_VALUE=0x53
+
# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME="EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_FEATURE_XNACK_V3"
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,DOUBLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_0="EF_AMDGPU_FEATURE_XNACK_V3 (0x100)" -DFLAG_1="EF_AMDGPU_MACH_AMDGCN_GFX90A (0x3F)" -DFLAG_VALUE=0x13F
@@ -355,6 +361,9 @@
# RUN: yaml2obj %s -o %t -DABI_VERSION=16 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX90A
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,UNKNOWN-ABI-VERSION --match-full-lines -DABI_VERSION=16 -DFILE=%t -DFLAG_VALUE=0x3F
+# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC -DFLAG_VALUE=0x51
+
# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100 -DFLAG_VALUE=0x41
@@ -391,6 +400,9 @@
# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1103
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1103 -DFLAG_VALUE=0x44
+# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC -DFLAG_VALUE=0x54
+
# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1150
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1150 -DFLAG_VALUE=0x43
diff --git a/llvm/test/tools/llvm-readobj/XCOFF/file-header.test b/llvm/test/tools/llvm-readobj/XCOFF/file-header.test
index 8cbd84749fd4..2407aef72648 100644
--- a/llvm/test/tools/llvm-readobj/XCOFF/file-header.test
+++ b/llvm/test/tools/llvm-readobj/XCOFF/file-header.test
@@ -23,7 +23,6 @@ FileHeader:
CreationTime: [[CREATTIME=1]]
EntriesInSymbolTable: [[SYMBOLCOUNT=1]]
NumberOfSections: 1
- OffsetToSymbolTable: 0x3C
AuxiliaryHeaderSize: 0
Flags: 0x12
Sections:
@@ -42,7 +41,7 @@ Symbols:
# FILEHEADER64-NEXT: Magic: 0x1F7
# FILEHEADER64-NEXT: NumberOfSections: 1
# FILEHEADER64-NEXT: TimeStamp: None (0x0)
-# FILEHEADER64-NEXT: SymbolTableOffset: 0x3C
+# FILEHEADER64-NEXT: SymbolTableOffset: 0x60
# FILEHEADER64-NEXT: SymbolTableEntries: 1
# FILEHEADER64-NEXT: OptionalHeaderSize: 0x0
# FILEHEADER64-NEXT: Flags: 0x12
diff --git a/llvm/test/tools/llvm-readobj/XCOFF/sections.test b/llvm/test/tools/llvm-readobj/XCOFF/sections.test
index be098939ce77..36e85d603365 100644
--- a/llvm/test/tools/llvm-readobj/XCOFF/sections.test
+++ b/llvm/test/tools/llvm-readobj/XCOFF/sections.test
@@ -13,7 +13,7 @@
# SEC32-NEXT: Name: .text
# SEC32-NEXT: PhysicalAddress: 0x0
# SEC32-NEXT: VirtualAddress: 0x0
-# SEC32-NEXT: Size: 0x4
+# SEC32-NEXT: Size: 0x2
# SEC32-NEXT: RawDataOffset: 0x64
# SEC32-NEXT: RelocationPointer: 0x0
# SEC32-NEXT: LineNumberPointer: 0x0
@@ -24,11 +24,11 @@
# SEC32-NEXT: Section {
# SEC32-NEXT: Index: 2
# SEC32-NEXT: Name: .data
-# SEC32-NEXT: PhysicalAddress: 0x4
-# SEC32-NEXT: VirtualAddress: 0x4
-# SEC32-NEXT: Size: 0x4
+# SEC32-NEXT: PhysicalAddress: 0x0
+# SEC32-NEXT: VirtualAddress: 0x0
+# SEC32-NEXT: Size: 0x2
# SEC32-NEXT: RawDataOffset: 0x68
-# SEC32-NEXT: RelocationPointer: 0x6C
+# SEC32-NEXT: RelocationPointer: 0x6A
# SEC32-NEXT: LineNumberPointer: 0x0
# SEC32-NEXT: NumberOfRelocations: 1
# SEC32-NEXT: NumberOfLineNumbers: 0
@@ -65,7 +65,7 @@ Sections:
# SEC64-NEXT: Name: .text
# SEC64-NEXT: PhysicalAddress: 0x0
# SEC64-NEXT: VirtualAddress: 0x0
-# SEC64-NEXT: Size: 0x4
+# SEC64-NEXT: Size: 0x2
# SEC64-NEXT: RawDataOffset: 0xA8
# SEC64-NEXT: RelocationPointer: 0x0
# SEC64-NEXT: LineNumberPointer: 0x0
@@ -76,11 +76,11 @@ Sections:
# SEC64-NEXT: Section {
# SEC64-NEXT: Index: 2
# SEC64-NEXT: Name: .data
-# SEC64-NEXT: PhysicalAddress: 0x4
-# SEC64-NEXT: VirtualAddress: 0x4
-# SEC64-NEXT: Size: 0x4
+# SEC64-NEXT: PhysicalAddress: 0x0
+# SEC64-NEXT: VirtualAddress: 0x0
+# SEC64-NEXT: Size: 0x2
# SEC64-NEXT: RawDataOffset: 0xAC
-# SEC64-NEXT: RelocationPointer: 0xB0
+# SEC64-NEXT: RelocationPointer: 0xAE
# SEC64-NEXT: LineNumberPointer: 0x0
# SEC64-NEXT: NumberOfRelocations: 1
# SEC64-NEXT: NumberOfLineNumbers: 0
diff --git a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml
index fbd5fa0629d1..12f44d04f720 100644
--- a/llvm/test/tools/obj2yaml/XCOFF/aix.yaml
+++ b/llvm/test/tools/obj2yaml/XCOFF/aix.yaml
@@ -31,7 +31,7 @@
# CHECK32-NEXT: Info: 0xF
# CHECK32-NEXT: Type: 0x3
# CHECK32-NEXT: - Name: .data
-# CHECK32-NEXT: Address: 0x4
+# CHECK32-NEXT: Address: 0x0
# CHECK32-NEXT: Size: 0x4
# CHECK32-NEXT: FileOffsetToData: 0x68
# CHECK32-NEXT: FileOffsetToRelocations: 0x76
@@ -56,7 +56,8 @@
# CHECK32-NEXT: - Type: AUX_CSECT
# CHECK32-NEXT: ParameterHashIndex: 0
# CHECK32-NEXT: TypeChkSectNum: 0
-# CHECK32-NEXT: SymbolAlignmentAndType: 0
+# CHECK32-NEXT: SymbolType: XTY_ER
+# CHECK32-NEXT: SymbolAlignment: 0
# CHECK32-NEXT: StorageMappingClass: XMC_PR
# CHECK32-NEXT: SectionOrLength: 0
# CHECK32-NEXT: StabInfoIndex: 0
@@ -71,7 +72,8 @@
# CHECK32-NEXT: - Type: AUX_CSECT
# CHECK32-NEXT: ParameterHashIndex: 0
# CHECK32-NEXT: TypeChkSectNum: 0
-# CHECK32-NEXT: SymbolAlignmentAndType: 0
+# CHECK32-NEXT: SymbolType: XTY_ER
+# CHECK32-NEXT: SymbolAlignment: 0
# CHECK32-NEXT: StorageMappingClass: XMC_PR
# CHECK32-NEXT: SectionOrLength: 0
# CHECK32-NEXT: StabInfoIndex: 0
@@ -103,7 +105,7 @@
# CHECK64-NEXT: Info: 0xF
# CHECK64-NEXT: Type: 0x3
# CHECK64-NEXT: - Name: .data
-# CHECK64-NEXT: Address: 0x4
+# CHECK64-NEXT: Address: 0x0
# CHECK64-NEXT: Size: 0x4
# CHECK64-NEXT: FileOffsetToData: 0xAC
# CHECK64-NEXT: FileOffsetToRelocations: 0xBE
@@ -128,7 +130,8 @@
# CHECK64-NEXT: - Type: AUX_CSECT
# CHECK64-NEXT: ParameterHashIndex: 0
# CHECK64-NEXT: TypeChkSectNum: 0
-# CHECK64-NEXT: SymbolAlignmentAndType: 0
+# CHECK64-NEXT: SymbolType: XTY_ER
+# CHECK64-NEXT: SymbolAlignment: 0
# CHECK64-NEXT: StorageMappingClass: XMC_PR
# CHECK64-NEXT: SectionOrLengthLo: 0
# CHECK64-NEXT: SectionOrLengthHi: 0
@@ -142,7 +145,8 @@
# CHECK64-NEXT: - Type: AUX_CSECT
# CHECK64-NEXT: ParameterHashIndex: 0
# CHECK64-NEXT: TypeChkSectNum: 0
-# CHECK64-NEXT: SymbolAlignmentAndType: 0
+# CHECK64-NEXT: SymbolType: XTY_ER
+# CHECK64-NEXT: SymbolAlignment: 0
# CHECK64-NEXT: StorageMappingClass: XMC_PR
# CHECK64-NEXT: SectionOrLengthLo: 0
# CHECK64-NEXT: SectionOrLengthHi: 0
diff --git a/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml b/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml
index 7f93b8dae0ca..8155ac1acd18 100644
--- a/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml
+++ b/llvm/test/tools/obj2yaml/XCOFF/aux-symbols.yaml
@@ -34,7 +34,8 @@
# CHECK32-NEXT: - Type: AUX_CSECT
# CHECK32-NEXT: ParameterHashIndex: 1
# CHECK32-NEXT: TypeChkSectNum: 2
-# CHECK32-NEXT: SymbolAlignmentAndType: 41
+# CHECK32-NEXT: SymbolType: XTY_SD
+# CHECK32-NEXT: SymbolAlignment: 5
# CHECK32-NEXT: StorageMappingClass: XMC_PR
# CHECK32-NEXT: SectionOrLength: 3
# CHECK32-NEXT: StabInfoIndex: 4
@@ -54,7 +55,8 @@
# CHECK32-NEXT: - Type: AUX_CSECT
# CHECK32-NEXT: ParameterHashIndex: 1
# CHECK32-NEXT: TypeChkSectNum: 2
-# CHECK32-NEXT: SymbolAlignmentAndType: 17
+# CHECK32-NEXT: SymbolType: XTY_SD
+# CHECK32-NEXT: SymbolAlignment: 2
# CHECK32-NEXT: StorageMappingClass: XMC_PR
# CHECK32-NEXT: SectionOrLength: 4
# CHECK32-NEXT: StabInfoIndex: 5
@@ -174,7 +176,8 @@ Symbols:
# CHECK64-NEXT: - Type: AUX_CSECT
# CHECK64-NEXT: ParameterHashIndex: 1
# CHECK64-NEXT: TypeChkSectNum: 2
-# CHECK64-NEXT: SymbolAlignmentAndType: 41
+# CHECK64-NEXT: SymbolType: XTY_SD
+# CHECK64-NEXT: SymbolAlignment: 5
# CHECK64-NEXT: StorageMappingClass: XMC_PR
# CHECK64-NEXT: SectionOrLengthLo: 3
# CHECK64-NEXT: SectionOrLengthHi: 4
@@ -196,7 +199,8 @@ Symbols:
# CHECK64-NEXT: - Type: AUX_CSECT
# CHECK64-NEXT: ParameterHashIndex: 1
# CHECK64-NEXT: TypeChkSectNum: 2
-# CHECK64-NEXT: SymbolAlignmentAndType: 17
+# CHECK64-NEXT: SymbolType: XTY_SD
+# CHECK64-NEXT: SymbolAlignment: 2
# CHECK64-NEXT: StorageMappingClass: XMC_PR
# CHECK64-NEXT: SectionOrLengthLo: 3
# CHECK64-NEXT: SectionOrLengthHi: 4
diff --git a/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml b/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml
index 1e16c5f66baa..0e16a47ae6a9 100644
--- a/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml
+++ b/llvm/test/tools/obj2yaml/XCOFF/invalid-section.yaml
@@ -5,7 +5,7 @@
# RUN: yaml2obj %s --docnum=1 -o %t1
# RUN: not obj2yaml %t1 2>&1 | FileCheck %s -DFILE=%t1 --check-prefix=ERROR1
-# ERROR1: The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x4 goes past the end of the file
+# ERROR1: The end of the file was unexpectedly encountered: section data with offset 0x70 and size 0x20 goes past the end of the file
--- !XCOFF
FileHeader:
@@ -13,6 +13,7 @@ FileHeader:
Sections:
- SectionData: '00007400'
FileOffsetToData: 0x70
+ Size: 0x20
## Error2: failed to get relocations.
# RUN: yaml2obj %s --docnum=2 -o %t2
diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml
new file mode 100644
index 000000000000..190224dd6206
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/XCOFF/aux-aligntype.yaml
@@ -0,0 +1,114 @@
+## Check that yaml2obj can parse SymbolAlignmentAndType, SymbolAlignment,
+## and SymbolType.
+
+# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x01DF -o %t32
+# RUN: obj2yaml %t32 | FileCheck %s --check-prefix=CHECK
+# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x01F7 -o %t64
+# RUN: obj2yaml %t64 | FileCheck %s --check-prefix=CHECK
+
+# CHECK: --- !XCOFF
+# CHECK-NEXT: FileHeader:
+# CHECK-NEXT: MagicNumber:
+# CHECK: Symbols:
+# CHECK: - Name: .fcn1
+# CHECK: NumberOfAuxEntries: 1
+# CHECK-NEXT: AuxEntries:
+# CHECK-NEXT: - Type: AUX_CSECT
+# CHECK: SymbolType: XTY_ER
+# CHECK-NEXT: SymbolAlignment: 4
+# CHECK: - Name: .fcn2
+# CHECK: NumberOfAuxEntries: 1
+# CHECK-NEXT: AuxEntries:
+# CHECK-NEXT: - Type: AUX_CSECT
+# CHECK: SymbolType: XTY_SD
+# CHECK-NEXT: SymbolAlignment: 2
+# CHECK: - Name: .fcn3
+# CHECK: NumberOfAuxEntries: 1
+# CHECK-NEXT: AuxEntries:
+# CHECK-NEXT: - Type: AUX_CSECT
+# CHECK: SymbolType: XTY_SD
+# CHECK-NEXT: SymbolAlignment: 0
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: [[MAGIC]]
+Symbols:
+ - StorageClass: C_EXT
+ Name: .fcn1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignment: 4
+ - StorageClass: C_EXT
+ Name: .fcn2
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignment: 2
+ SymbolType: XTY_SD
+ - StorageClass: C_EXT
+ Name: .fcn3
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolType: XTY_SD
+
+## Ensure that SymbolAlignment is in range.
+# RUN: not yaml2obj %s --docnum=2 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR1
+# ERROR1: symbol alignment must be less than 32
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: 0x1F7
+Symbols:
+ - StorageClass: C_EXT
+ Name: .fcn1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolType: XTY_SD
+ SymbolAlignment: 32
+ SectionOrLengthLo: 4
+
+## Ensure that neither SymbolAlignment nor SymbolType can be specified if
+## SymbolAlignmentAndType is specified.
+# RUN: not yaml2obj %s --docnum=3 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2
+# ERROR2: cannot specify SymbolType or SymbolAlignment if SymbolAlignmentAndType is specified
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: 0x1DF
+Symbols:
+ - StorageClass: C_EXT
+ Name: .fcn1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignmentAndType: 17
+ SymbolAlignment: 4
+ SectionOrLength: 4
+
+# RUN: not yaml2obj %s --docnum=4 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: 0x1DF
+Symbols:
+ - StorageClass: C_EXT
+ Name: .fcn1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignmentAndType: 17
+ SymbolAlignment: 4
+ SymbolType: XTY_CM
+ SectionOrLength: 4
+
+# RUN: not yaml2obj %s --docnum=5 -o %t 2>&1 | FileCheck %s --check-prefix=ERROR2
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: 0x1F7
+Symbols:
+ - StorageClass: C_EXT
+ - StorageClass: C_EXT
+ Name: .fcn2
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignmentAndType: 18
+ SymbolType: XTY_SD
+ SectionOrLengthLo: 4
diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml
index f6d6193755ef..a93123b231ba 100644
--- a/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml
+++ b/llvm/test/tools/yaml2obj/XCOFF/aux-hdr-defaults.yaml
@@ -10,12 +10,12 @@
# CASE1: AuxiliaryHeader {
# CASE1-NEXT: Magic: 0x10B
# CASE1-NEXT: Version: 0x1
-# CASE1-NEXT: Size of .text section: 0x8
-# CASE1-NEXT: Size of .data section: 0x8
-# CASE1-NEXT: Size of .bss section: 0x8
+# CASE1-NEXT: Size of .text section: 0x5
+# CASE1-NEXT: Size of .data section: 0x5
+# CASE1-NEXT: Size of .bss section: 0x5
# CASE1-NEXT: Entry point address: 0x0
-# CASE1-NEXT: .text section start address: 0x4
-# CASE1-NEXT: .data section start address: 0x10
+# CASE1-NEXT: .text section start address: 0x0
+# CASE1-NEXT: .data section start address: 0x0
# CASE1-NEXT: TOC anchor address: 0x0
# CASE1-NEXT: Section number of entryPoint: 0
# CASE1-NEXT: Section number of .text: 2
@@ -79,16 +79,12 @@ Sections:
# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x1F7 -o %t2
# RUN: llvm-readobj --auxiliary-header %t2 | FileCheck %s --check-prefix=CASE2
-## Case2: same as case1, except it is 64-bit.
-# RUN: yaml2obj %s --docnum=1 -DMAGIC=0x1F7 -o %t2
-# RUN: llvm-readobj --auxiliary-header %t2 | FileCheck %s --check-prefix=CASE2
-
# CASE2: AuxiliaryHeader {
# CASE2-NEXT: Magic: 0x10B
# CASE2-NEXT: Version: 0x1
# CASE2-NEXT: Reserved for debugger: 0x0
-# CASE2-NEXT: .text section start address: 0x2
-# CASE2-NEXT: .data section start address: 0xE
+# CASE2-NEXT: .text section start address: 0x0
+# CASE2-NEXT: .data section start address: 0x0
# CASE2-NEXT: TOC anchor address: 0x0
# CASE2-NEXT: Section number of entryPoint: 0
# CASE2-NEXT: Section number of .text: 2
@@ -106,9 +102,9 @@ Sections:
# CASE2-NEXT: Stack page size: 0x0
# CASE2-NEXT: Flag: 0x80
# CASE2-NEXT: Alignment of thread-local storage: 0x0
-# CASE2-NEXT: Size of .text section: 0x8
-# CASE2-NEXT: Size of .data section: 0x8
-# CASE2-NEXT: Size of .bss section: 0x8
+# CASE2-NEXT: Size of .text section: 0x5
+# CASE2-NEXT: Size of .data section: 0x5
+# CASE2-NEXT: Size of .bss section: 0x5
# CASE2-NEXT: Entry point address: 0x0
# CASE2-NEXT: Maximum stack size: 0x0
# CASE2-NEXT: Maximum data size: 0x0
diff --git a/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml b/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml
index fe75c1941bc1..04c774dcc3ae 100644
--- a/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml
+++ b/llvm/test/tools/yaml2obj/XCOFF/aux-symbols.yaml
@@ -579,3 +579,28 @@ Symbols:
AuxEntries:
- Type: AUX_FILE
FileNameOrString: foo
+
+## Case10: Specify a SymbolType outside the range of field definition.
+# RUN: not yaml2obj %s -DSYMTYPE=8 --docnum=8 -o %t10 2>&1 | \
+# RUN: FileCheck %s --check-prefix BADSYMTYPE
+
+# BADSYMTYPE: error: symbol type must be less than 8
+
+## Case11: Specify a SymbolType outside the range of its enumeration.
+# RUN: yaml2obj %s -DSYMTYPE=7 --docnum=8 -o %t11
+# RUN: llvm-readobj --syms %t11 | FileCheck %s --check-prefix=STYPE
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: 0x1DF
+Symbols:
+ - Name: aux_fcn_csect
+ StorageClass: C_EXT
+ Type: 0x20
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignment: 4
+ SymbolType: [[SYMTYPE=<none>]]
+
+# STYPE: SymbolAlignmentLog2: 4
+# STYPE-NEXT: SymbolType: 0x7
diff --git a/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml b/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml
index ed85bc639d0a..8c3d77d842b0 100644
--- a/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml
+++ b/llvm/test/tools/yaml2obj/XCOFF/basic-doc.yaml
@@ -65,8 +65,8 @@ Symbols:
# CHECK-NEXT: Section {
# CHECK-NEXT: Index: 2
# CHECK-NEXT: Name: .data
-# CHECK-NEXT: PhysicalAddress: 0x8
-# CHECK-NEXT: VirtualAddress: 0x8
+# CHECK-NEXT: PhysicalAddress: 0x0
+# CHECK-NEXT: VirtualAddress: 0x0
# CHECK-NEXT: Size: 0x8
# CHECK-NEXT: RawDataOffset: 0xE4
# CHECK-NEXT: RelocationPointer: 0xF0
diff --git a/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml b/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml
new file mode 100644
index 000000000000..ee23a16d8c16
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/XCOFF/offset-check.yaml
@@ -0,0 +1,91 @@
+## Report errors when specified file offsets are invalid.
+
+# RUN: not yaml2obj %s -DTEXTRAWDATAOFFSET=0xFFFFFFF0 -o %t 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERROR1
+# ERROR1: current file offset (4294967288) is bigger than the specified FileOffsetToData for the .data section (108)
+
+# RUN: not yaml2obj %s -DDATARAWDATAOFFSET=0xFFFFFFF0 -o %t 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERROR2
+# ERROR2: current file offset (4294967284) is bigger than the specified FileOffsetToRelocations for the .text section (112)
+
+# RUN: not yaml2obj %s -DRELOCOFFSET=0xFFFFFFF0 -o %t 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERROR3
+# ERROR3: current file offset (4294967290) is bigger than the specified SymbolTableOffset (122)
+
+# RUN: not yaml2obj %s -DSYMTABOFFSET=0x100000000 -o %t 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERROR4
+# ERROR4: maximum object size of 4294967295 exceeded when writing symbols
+
+# RUN: not yaml2obj %s -DRELOCOFFSET=0x100000000 -o %t 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERROR5
+# ERROR5: maximum object size (4294967295) exceeded when writing relocation data for section .text
+
+# RUN: not yaml2obj %s -DDATARAWDATAOFFSET=0x100000000 -o %t 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERROR6
+# ERROR6: maximum object size (4294967295) exceeded when writing data for section 2 (.data)
+
+--- !XCOFF
+FileHeader:
+ MagicNumber: 0x1DF
+ NumberOfSections: 2
+ OffsetToSymbolTable: [[SYMTABOFFSET=0x7A]]
+ EntriesInSymbolTable: 6
+ AuxiliaryHeaderSize: 0
+ Flags: 0x0
+Sections:
+ - Name: .text
+ Address: 0x0
+ Size: 0x8
+ FileOffsetToData: [[TEXTRAWDATAOFFSET=0x64]]
+ FileOffsetToRelocations: [[RELOCOFFSET=0x70]]
+ NumberOfRelocations: 0x1
+ Flags: [ STYP_TEXT ]
+ SectionData: 386000004BFFFFFC
+ Relocations:
+ - Address: 0x4
+ Symbol: 0x2
+ Info: 0x99
+ Type: 0x1A
+ - Name: .data
+ Address: 0x0
+ Size: 0x4
+ FileOffsetToData: [[DATARAWDATAOFFSET=0x6C]]
+ FileOffsetToRelocations: 0
+ Flags: [ STYP_DATA ]
+ SectionData: 3210ABCD
+Symbols:
+ - Name: .bar
+ Value: 0x0
+ Section: N_UNDEF
+ Type: 0x0
+ StorageClass: C_EXT
+ NumberOfAuxEntries: 1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignmentAndType: 0
+ StorageMappingClass: XMC_PR
+ SectionOrLength: 0
+ - Name: ''
+ Value: 0x0
+ Section: .text
+ Type: 0x0
+ StorageClass: C_HIDEXT
+ NumberOfAuxEntries: 1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignmentAndType: 17
+ StorageMappingClass: XMC_PR
+ SectionOrLength: 8
+ - Name: .foo
+ Value: 0x0
+ Section: .text
+ Type: 0x0
+ StorageClass: C_EXT
+ NumberOfAuxEntries: 1
+ AuxEntries:
+ - Type: AUX_CSECT
+ SymbolAlignmentAndType: 2
+ StorageMappingClass: XMC_PR
+ SectionOrLength: 2
+StringTable: {}
+...
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index 3e2567c441df..b292f70ba89d 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -365,15 +365,11 @@ int main(int argc, char **argv) {
}
// RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format, if it's built in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // new debug-info format.
if (TryUseNewDbgInfoFormat) {
- // If LLVM was built with support for this, turn the new debug-info format
- // on.
+ // Turn the new debug-info format on.
UseNewDbgInfoFormat = true;
}
-#endif
- (void)TryUseNewDbgInfoFormat;
if (TimeTrace)
timeTraceProfilerInitialize(TimeTraceGranularity, argv[0]);
diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 559e7a604898..8cdd84bcc867 100644
--- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -845,8 +845,9 @@ int main(int argc, char **argv) {
bool OffsetRequested = false;
- // Defaults to dumping all sections, unless brief mode is specified in which
- // case only the .debug_info section in dumped.
+ // Defaults to dumping only debug_info, unless: A) verbose mode is specified,
+ // in which case all sections are dumped, or B) a specific section is
+ // requested.
#define HANDLE_DWARF_SECTION(ENUM_NAME, ELF_NAME, CMDLINE_NAME, OPTION) \
if (Dump##ENUM_NAME.IsRequested) { \
DumpType |= DIDT_##ENUM_NAME; \
diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
index e5ae726546d4..2de9c76fd68c 100644
--- a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
+++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
@@ -43,6 +43,7 @@
#include "llvm/DebugInfo/GSYM/InlineInfo.h"
#include "llvm/DebugInfo/GSYM/LookupResult.h"
#include "llvm/DebugInfo/GSYM/ObjectFileTransformer.h"
+#include "llvm/DebugInfo/GSYM/OutputAggregator.h"
#include <optional>
using namespace llvm;
@@ -300,16 +301,10 @@ static std::optional<uint64_t> getImageBaseAddress(object::ObjectFile &Obj) {
return std::nullopt;
}
-static llvm::Error handleObjectFile(ObjectFile &Obj,
- const std::string &OutFile) {
+static llvm::Error handleObjectFile(ObjectFile &Obj, const std::string &OutFile,
+ OutputAggregator &Out) {
auto ThreadCount =
NumThreads > 0 ? NumThreads : std::thread::hardware_concurrency();
- auto &OS = outs();
- // Make a stream refernce that will become a /dev/null log stream if
- // Quiet is true, or normal output if Quiet is false. This can stop the
- // errors and warnings from being displayed and producing too much output
- // when they aren't desired.
- raw_ostream *LogOS = Quiet ? nullptr : &outs();
GsymCreator Gsym(Quiet);
@@ -354,17 +349,17 @@ static llvm::Error handleObjectFile(ObjectFile &Obj,
Gsym.SetValidTextRanges(TextRanges);
// Convert all DWARF to GSYM.
- if (auto Err = DT.convert(ThreadCount, LogOS))
+ if (auto Err = DT.convert(ThreadCount, Out))
return Err;
// Get the UUID and convert symbol table to GSYM.
- if (auto Err = ObjectFileTransformer::convert(Obj, LogOS, Gsym))
+ if (auto Err = ObjectFileTransformer::convert(Obj, Out, Gsym))
return Err;
// Finalize the GSYM to make it ready to save to disk. This will remove
// duplicate FunctionInfo entries where we might have found an entry from
// debug info and also a symbol table entry from the object file.
- if (auto Err = Gsym.finalize(OS))
+ if (auto Err = Gsym.finalize(Out))
return Err;
// Save the GSYM file to disk.
@@ -381,7 +376,7 @@ static llvm::Error handleObjectFile(ObjectFile &Obj,
// Verify the DWARF if requested. This will ensure all the info in the DWARF
// can be looked up in the GSYM and that all lookups get matching data.
if (Verify) {
- if (auto Err = DT.verify(OutFile, OS))
+ if (auto Err = DT.verify(OutFile, Out))
return Err;
}
@@ -389,7 +384,8 @@ static llvm::Error handleObjectFile(ObjectFile &Obj,
}
static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
- const std::string &OutFile) {
+ const std::string &OutFile,
+ OutputAggregator &Out) {
Expected<std::unique_ptr<Binary>> BinOrErr = object::createBinary(Buffer);
error(Filename, errorToErrorCode(BinOrErr.takeError()));
@@ -397,7 +393,7 @@ static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
Triple ObjTriple(Obj->makeTriple());
auto ArchName = ObjTriple.getArchName();
outs() << "Output file (" << ArchName << "): " << OutFile << "\n";
- if (auto Err = handleObjectFile(*Obj, OutFile))
+ if (auto Err = handleObjectFile(*Obj, OutFile, Out))
return Err;
} else if (auto *Fat = dyn_cast<MachOUniversalBinary>(BinOrErr->get())) {
// Iterate over all contained architectures and filter out any that were
@@ -431,7 +427,7 @@ static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
ArchOutFile.append(ArchName.str());
}
outs() << "Output file (" << ArchName << "): " << ArchOutFile << "\n";
- if (auto Err = handleObjectFile(*Obj, ArchOutFile))
+ if (auto Err = handleObjectFile(*Obj, ArchOutFile, Out))
return Err;
}
}
@@ -439,15 +435,16 @@ static llvm::Error handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
}
static llvm::Error handleFileConversionToGSYM(StringRef Filename,
- const std::string &OutFile) {
+ const std::string &OutFile,
+ OutputAggregator &Out) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
error(Filename, BuffOrErr.getError());
std::unique_ptr<MemoryBuffer> Buffer = std::move(BuffOrErr.get());
- return handleBuffer(Filename, *Buffer, OutFile);
+ return handleBuffer(Filename, *Buffer, OutFile, Out);
}
-static llvm::Error convertFileToGSYM(raw_ostream &OS) {
+static llvm::Error convertFileToGSYM(OutputAggregator &Out) {
// Expand any .dSYM bundles to the individual object files contained therein.
std::vector<std::string> Objects;
std::string OutFile = OutputFilename;
@@ -456,7 +453,7 @@ static llvm::Error convertFileToGSYM(raw_ostream &OS) {
OutFile += ".gsym";
}
- OS << "Input file: " << ConvertFilename << "\n";
+ Out << "Input file: " << ConvertFilename << "\n";
if (auto DsymObjectsOrErr =
MachOObjectFile::findDsymObjectMembers(ConvertFilename)) {
@@ -469,7 +466,7 @@ static llvm::Error convertFileToGSYM(raw_ostream &OS) {
}
for (StringRef Object : Objects)
- if (Error Err = handleFileConversionToGSYM(Object, OutFile))
+ if (Error Err = handleFileConversionToGSYM(Object, OutFile, Out))
return Err;
return Error::success();
}
@@ -507,6 +504,7 @@ int llvm_gsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
raw_ostream &OS = outs();
+ OutputAggregator Aggregation(&OS);
if (!ConvertFilename.empty()) {
// Convert DWARF to GSYM
if (!InputFilenames.empty()) {
@@ -515,8 +513,12 @@ int llvm_gsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
return 1;
}
// Call error() if we have an error and it will exit with a status of 1
- if (auto Err = convertFileToGSYM(OS))
+ if (auto Err = convertFileToGSYM(Aggregation))
error("DWARF conversion failed: ", std::move(Err));
+ // Report the errors from aggregator:
+ Aggregation.EnumerateResults([&](StringRef category, unsigned count) {
+ OS << category << " occurred " << count << " time(s)\n";
+ });
return 0;
}
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index d50e0678f461..e6c219a8cd7e 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -473,15 +473,11 @@ int main(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv, "llvm linker\n");
// RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format, if it's built in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // new debug-info format.
if (TryUseNewDbgInfoFormat) {
- // If LLVM was built with support for this, turn the new debug-info format
- // on.
+ // Turn the new debug-info format on.
UseNewDbgInfoFormat = true;
}
-#endif
- (void)TryUseNewDbgInfoFormat;
LLVMContext Context;
Context.setDiagnosticHandler(std::make_unique<LLVMLinkDiagnosticHandler>(),
diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp
index f27281438282..7943d6952b82 100644
--- a/llvm/tools/llvm-lto/llvm-lto.cpp
+++ b/llvm/tools/llvm-lto/llvm-lto.cpp
@@ -945,15 +945,11 @@ int main(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n");
// RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format, if it's built in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // new debug-info format.
if (TryUseNewDbgInfoFormat) {
- // If LLVM was built with support for this, turn the new debug-info format
- // on.
+ // Turn the new debug-info format on.
UseNewDbgInfoFormat = true;
}
-#endif
- (void)TryUseNewDbgInfoFormat;
if (OptLevel < '0' || OptLevel > '3')
error("optimization level must be between 0 and 3");
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index c212374a0ecc..d5de4f6b1a27 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -230,15 +230,11 @@ static int run(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv, "Resolution-based LTO test harness");
// RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format, if it's built in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // new debug-info format.
if (TryUseNewDbgInfoFormat) {
- // If LLVM was built with support for this, turn the new debug-info format
- // on.
+ // Turn the new debug-info format on.
UseNewDbgInfoFormat = true;
}
-#endif
- (void)TryUseNewDbgInfoFormat;
// FIXME: Workaround PR30396 which means that a symbol can appear
// more than once if it is defined in module-level assembly and
diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp
index da5998b70ea3..e3b81451fcac 100644
--- a/llvm/tools/llvm-nm/llvm-nm.cpp
+++ b/llvm/tools/llvm-nm/llvm-nm.cpp
@@ -1854,11 +1854,8 @@ static bool getSymbolNamesFromObject(SymbolicFile &Obj,
dyn_cast<const XCOFFObjectFile>(&Obj))
S.Size = XCOFFObj->getSymbolSize(Sym.getRawDataRefImpl());
- if (const WasmObjectFile *WasmObj = dyn_cast<WasmObjectFile>(&Obj)) {
- const WasmSymbol &WasmSym = WasmObj->getWasmSymbol(Sym);
- if (WasmSym.isTypeData() && !WasmSym.isUndefined())
- S.Size = WasmSym.Info.DataRef.Size;
- }
+ if (const WasmObjectFile *WasmObj = dyn_cast<WasmObjectFile>(&Obj))
+ S.Size = WasmObj->getSymbolSize(Sym);
if (PrintAddress && isa<ObjectFile>(Obj)) {
SymbolRef SymRef(Sym);
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 394eacacb57e..9a9b631e98bc 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -687,6 +687,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
Config.OutputFormat = StringSwitch<FileFormat>(OutputFormat)
.Case("binary", FileFormat::Binary)
.Case("ihex", FileFormat::IHex)
+ .Case("srec", FileFormat::SREC)
.Default(FileFormat::Unspecified);
if (Config.OutputFormat == FileFormat::Unspecified) {
if (OutputFormat.empty()) {
diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
index 730f423daa12..ad3e60472369 100644
--- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -121,6 +121,7 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr,
case FileFormat::Binary:
case FileFormat::IHex:
case FileFormat::Unspecified:
+ case FileFormat::SREC:
Expected<const ELFConfig &> ELFConfig = ConfigMgr.getELFConfig();
if (!ELFConfig)
return ELFConfig.takeError();
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index de52ebcc72fd..948a5d74e1ab 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -2947,6 +2947,10 @@ void Dumper::printSymbol(const SymbolRef &Symbol,
Symbol.getRawDataRefImpl()));
else if (O.isELF())
outs() << '\t' << format(Fmt, ELFSymbolRef(Symbol).getSize());
+ else if (O.isWasm())
+ outs() << '\t'
+ << format(Fmt, static_cast<uint64_t>(
+ cast<WasmObjectFile>(O).getSymbolSize(Symbol)));
if (O.isELF()) {
if (!SymbolVersions.empty()) {
diff --git a/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
index 1beb2d282744..479d02583518 100644
--- a/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
@@ -213,6 +213,9 @@ static std::string formatSourceLanguage(SourceLanguage Lang) {
RETURN_CASE(SourceLanguage, Rust, "rust");
RETURN_CASE(SourceLanguage, ObjC, "objc");
RETURN_CASE(SourceLanguage, ObjCpp, "objc++");
+ RETURN_CASE(SourceLanguage, AliasObj, "aliasobj");
+ RETURN_CASE(SourceLanguage, Go, "go");
+ RETURN_CASE(SourceLanguage, OldSwift, "swift");
}
return formatUnknownEnum(Lang);
}
@@ -282,6 +285,7 @@ static std::string formatMachineType(CPUType Cpu) {
RETURN_CASE(CPUType, Thumb, "thumb");
RETURN_CASE(CPUType, ARMNT, "arm nt");
RETURN_CASE(CPUType, D3D11_Shader, "d3d11 shader");
+ RETURN_CASE(CPUType, Unknown, "unknown");
}
return formatUnknownEnum(Cpu);
}
diff --git a/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp b/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp
index aaa430a9572e..a4077820eb03 100644
--- a/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/MinimalTypeDumper.cpp
@@ -125,6 +125,7 @@ static std::string formatCallingConvention(CallingConvention Convention) {
RETURN_CASE(CallingConvention, PpcCall, "ppccall");
RETURN_CASE(CallingConvention, SHCall, "shcall");
RETURN_CASE(CallingConvention, SH5Call, "sh5call");
+ RETURN_CASE(CallingConvention, Swift, "swift");
RETURN_CASE(CallingConvention, ThisCall, "thiscall");
RETURN_CASE(CallingConvention, TriCall, "tricall");
}
diff --git a/llvm/tools/llvm-readobj/COFFImportDumper.cpp b/llvm/tools/llvm-readobj/COFFImportDumper.cpp
index 656ca32f03a7..0ab2a1765565 100644
--- a/llvm/tools/llvm-readobj/COFFImportDumper.cpp
+++ b/llvm/tools/llvm-readobj/COFFImportDumper.cpp
@@ -45,6 +45,9 @@ void dumpCOFFImportFile(const COFFImportFile *File, ScopedPrinter &Writer) {
case COFF::IMPORT_NAME_UNDECORATE:
Writer.printString("Name type", "undecorate");
break;
+ case COFF::IMPORT_NAME_EXPORTAS:
+ Writer.printString("Name type", "export as");
+ break;
}
if (H->getNameType() != COFF::IMPORT_ORDINAL)
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 82bb12f95d3a..8e68f08c3fa9 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1559,68 +1559,72 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
};
// clang-format off
-#define AMDGPU_MACH_ENUM_ENTS \
- ENUM_ENT(EF_AMDGPU_MACH_NONE, "none"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_R600, "r600"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_R630, "r630"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_RS880, "rs880"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_RV670, "rv670"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_RV710, "rv710"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_RV730, "rv730"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_RV770, "rv770"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_CEDAR, "cedar"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_CYPRESS, "cypress"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_JUNIPER, "juniper"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_REDWOOD, "redwood"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_SUMO, "sumo"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_BARTS, "barts"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_CAICOS, "caicos"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_CAYMAN, "cayman"), \
- ENUM_ENT(EF_AMDGPU_MACH_R600_TURKS, "turks"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX600, "gfx600"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX601, "gfx601"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX602, "gfx602"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX700, "gfx700"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX701, "gfx701"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX702, "gfx702"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX703, "gfx703"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX704, "gfx704"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX705, "gfx705"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX801, "gfx801"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX802, "gfx802"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX803, "gfx803"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX805, "gfx805"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX810, "gfx810"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX900, "gfx900"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX902, "gfx902"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX904, "gfx904"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX906, "gfx906"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX908, "gfx908"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1013, "gfx1013"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1030, "gfx1030"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1031, "gfx1031"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1032, "gfx1032"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1033, "gfx1033"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1034, "gfx1034"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1035, "gfx1035"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1036, "gfx1036"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1100, "gfx1100"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1101, "gfx1101"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1102, "gfx1102"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1103, "gfx1103"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1150, "gfx1150"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1151, "gfx1151"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"), \
- ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201")
+#define AMDGPU_MACH_ENUM_ENTS \
+ ENUM_ENT(EF_AMDGPU_MACH_NONE, "none"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_R600, "r600"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_R630, "r630"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_RS880, "rs880"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_RV670, "rv670"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_RV710, "rv710"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_RV730, "rv730"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_RV770, "rv770"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_CEDAR, "cedar"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_CYPRESS, "cypress"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_JUNIPER, "juniper"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_REDWOOD, "redwood"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_SUMO, "sumo"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_BARTS, "barts"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_CAICOS, "caicos"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_CAYMAN, "cayman"), \
+ ENUM_ENT(EF_AMDGPU_MACH_R600_TURKS, "turks"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX600, "gfx600"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX601, "gfx601"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX602, "gfx602"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX700, "gfx700"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX701, "gfx701"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX702, "gfx702"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX703, "gfx703"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX704, "gfx704"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX705, "gfx705"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX801, "gfx801"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX802, "gfx802"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX803, "gfx803"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX805, "gfx805"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX810, "gfx810"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX900, "gfx900"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX902, "gfx902"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX904, "gfx904"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX906, "gfx906"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX908, "gfx908"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1013, "gfx1013"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1030, "gfx1030"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1031, "gfx1031"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1032, "gfx1032"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1033, "gfx1033"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1034, "gfx1034"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1035, "gfx1035"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1036, "gfx1036"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1100, "gfx1100"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1101, "gfx1101"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1102, "gfx1102"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1103, "gfx1103"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1150, "gfx1150"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1151, "gfx1151"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, "gfx9-generic"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, "gfx10.1-generic"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, "gfx10.3-generic"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, "gfx11-generic")
// clang-format on
const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion3[] = {
diff --git a/llvm/tools/llvm-reduce/llvm-reduce.cpp b/llvm/tools/llvm-reduce/llvm-reduce.cpp
index 71ce0ca5ab6a..f913771487af 100644
--- a/llvm/tools/llvm-reduce/llvm-reduce.cpp
+++ b/llvm/tools/llvm-reduce/llvm-reduce.cpp
@@ -151,15 +151,11 @@ int main(int Argc, char **Argv) {
cl::ParseCommandLineOptions(Argc, Argv, "LLVM automatic testcase reducer.\n");
// RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format, if it's built in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // new debug-info format.
if (TryUseNewDbgInfoFormat) {
- // If LLVM was built with support for this, turn the new debug-info format
- // on.
+ // Turn the new debug-info format on.
UseNewDbgInfoFormat = true;
}
-#endif
- (void)TryUseNewDbgInfoFormat;
if (Argc == 1) {
cl::PrintHelpMessage();
diff --git a/llvm/tools/obj2yaml/xcoff2yaml.cpp b/llvm/tools/obj2yaml/xcoff2yaml.cpp
index 0acbf4866223..e426b645cbef 100644
--- a/llvm/tools/obj2yaml/xcoff2yaml.cpp
+++ b/llvm/tools/obj2yaml/xcoff2yaml.cpp
@@ -209,7 +209,9 @@ void XCOFFDumper::dumpCsectAuxSym(XCOFFYAML::Symbol &Sym,
XCOFFYAML::CsectAuxEnt CsectAuxSym;
CsectAuxSym.ParameterHashIndex = AuxEntPtr.getParameterHashIndex();
CsectAuxSym.TypeChkSectNum = AuxEntPtr.getTypeChkSectNum();
- CsectAuxSym.SymbolAlignmentAndType = AuxEntPtr.getSymbolAlignmentAndType();
+ CsectAuxSym.SymbolAlignment = AuxEntPtr.getAlignmentLog2();
+ CsectAuxSym.SymbolType =
+ static_cast<XCOFF::SymbolType>(AuxEntPtr.getSymbolType());
CsectAuxSym.StorageMappingClass = AuxEntPtr.getStorageMappingClass();
if (Obj.is64Bit()) {
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index fdfb4df53273..374698083763 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -202,6 +202,19 @@ static cl::opt<std::string>
cl::desc("Path to the profile remapping file."),
cl::Hidden);
+static cl::opt<PGOOptions::ColdFuncOpt> PGOColdFuncAttr(
+ "pgo-cold-func-opt", cl::init(PGOOptions::ColdFuncOpt::Default), cl::Hidden,
+ cl::desc(
+ "Function attribute to apply to cold functions as determined by PGO"),
+ cl::values(clEnumValN(PGOOptions::ColdFuncOpt::Default, "default",
+ "Default (no attribute)"),
+ clEnumValN(PGOOptions::ColdFuncOpt::OptSize, "optsize",
+ "Mark cold functions with optsize."),
+ clEnumValN(PGOOptions::ColdFuncOpt::MinSize, "minsize",
+ "Mark cold functions with minsize."),
+ clEnumValN(PGOOptions::ColdFuncOpt::OptNone, "optnone",
+ "Mark cold functions with optnone.")));
+
static cl::opt<bool> DebugInfoForProfiling(
"debug-info-for-profiling", cl::init(false), cl::Hidden,
cl::desc("Emit special debug info to enable PGO profile generation."));
@@ -341,22 +354,24 @@ bool llvm::runPassPipeline(
switch (PGOKindFlag) {
case InstrGen:
P = PGOOptions(ProfileFile, "", "", MemoryProfileFile, FS,
- PGOOptions::IRInstr);
+ PGOOptions::IRInstr, PGOOptions::NoCSAction,
+ PGOColdFuncAttr);
break;
case InstrUse:
P = PGOOptions(ProfileFile, "", ProfileRemappingFile, MemoryProfileFile, FS,
- PGOOptions::IRUse);
+ PGOOptions::IRUse, PGOOptions::NoCSAction, PGOColdFuncAttr);
break;
case SampleUse:
P = PGOOptions(ProfileFile, "", ProfileRemappingFile, MemoryProfileFile, FS,
- PGOOptions::SampleUse);
+ PGOOptions::SampleUse, PGOOptions::NoCSAction,
+ PGOColdFuncAttr);
break;
case NoPGO:
if (DebugInfoForProfiling || PseudoProbeForProfiling ||
!MemoryProfileFile.empty())
P = PGOOptions("", "", "", MemoryProfileFile, FS, PGOOptions::NoAction,
- PGOOptions::NoCSAction, DebugInfoForProfiling,
- PseudoProbeForProfiling);
+ PGOOptions::NoCSAction, PGOColdFuncAttr,
+ DebugInfoForProfiling, PseudoProbeForProfiling);
else
P = std::nullopt;
}
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index 3f66bfc9f017..85f52941a85b 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -462,15 +462,11 @@ extern "C" int optMain(
argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n");
// RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format, if it's built in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // new debug-info format.
if (TryUseNewDbgInfoFormat) {
- // If LLVM was built with support for this, turn the new debug-info format
- // on.
+ // Turn the new debug-info format on.
UseNewDbgInfoFormat = true;
}
-#endif
- (void)TryUseNewDbgInfoFormat;
LLVMContext Context;
diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp
index e00e11e4655a..6a4780c143e5 100644
--- a/llvm/unittests/ADT/BitVectorTest.cpp
+++ b/llvm/unittests/ADT/BitVectorTest.cpp
@@ -1143,6 +1143,9 @@ TYPED_TEST(BitVectorTest, EmptyVectorGetData) {
}
TYPED_TEST(BitVectorTest, Iterators) {
+ TypeParam Singleton(1, true);
+ EXPECT_EQ(std::next(Singleton.set_bits_begin()), Singleton.set_bits_end());
+
TypeParam Filled(10, true);
EXPECT_NE(Filled.set_bits_begin(), Filled.set_bits_end());
unsigned Counter = 0;
diff --git a/llvm/unittests/ADT/IListIteratorBitsTest.cpp b/llvm/unittests/ADT/IListIteratorBitsTest.cpp
index 167b30a5e308..8ae73b1ed5f7 100644
--- a/llvm/unittests/ADT/IListIteratorBitsTest.cpp
+++ b/llvm/unittests/ADT/IListIteratorBitsTest.cpp
@@ -55,10 +55,8 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) {
simple_ilist<Node, ilist_iterator_bits<true>>::iterator I, I2;
-// Two sets of tests: if we've compiled in the iterator bits, then check that
-// HeadInclusiveBit and TailInclusiveBit are preserved on assignment and copy
-// construction, but not on other operations.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
+ // Check that HeadInclusiveBit and TailInclusiveBit are preserved on
+ // assignment and copy construction, but not on other operations.
I = L.begin();
EXPECT_FALSE(I.getHeadBit());
EXPECT_FALSE(I.getTailBit());
@@ -85,18 +83,6 @@ TEST(IListIteratorBitsTest, ConsAndAssignment) {
simple_ilist<Node, ilist_iterator_bits<true>>::iterator I3(I);
EXPECT_TRUE(I3.getHeadBit());
EXPECT_TRUE(I3.getTailBit());
-#else
- // The calls should be available, but shouldn't actually store information.
- I = L.begin();
- EXPECT_FALSE(I.getHeadBit());
- EXPECT_FALSE(I.getTailBit());
- I.setHeadBit(true);
- I.setTailBit(true);
- EXPECT_FALSE(I.getHeadBit());
- EXPECT_FALSE(I.getTailBit());
- // Suppress warnings as we don't test with this variable.
- (void)I2;
-#endif
}
class dummy {
diff --git a/llvm/unittests/CodeGen/LowLevelTypeTest.cpp b/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
index d13cfeeffe56..cb34802a5de2 100644
--- a/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
+++ b/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
@@ -259,6 +259,7 @@ TEST(LowLevelTypeTest, Pointer) {
// Test kind.
ASSERT_TRUE(Ty.isValid());
ASSERT_TRUE(Ty.isPointer());
+ ASSERT_TRUE(Ty.isPointerOrPointerVector());
ASSERT_FALSE(Ty.isScalar());
ASSERT_FALSE(Ty.isVector());
@@ -266,6 +267,8 @@ TEST(LowLevelTypeTest, Pointer) {
ASSERT_TRUE(VTy.isValid());
ASSERT_TRUE(VTy.isVector());
ASSERT_TRUE(VTy.getElementType().isPointer());
+ ASSERT_TRUE(VTy.isPointerVector());
+ ASSERT_TRUE(VTy.isPointerOrPointerVector());
EXPECT_EQ(Ty, VTy.getElementType());
EXPECT_EQ(Ty.getSizeInBits(), VTy.getScalarSizeInBits());
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFAcceleratorTableTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFAcceleratorTableTest.cpp
index 38f3e076ace2..dedcf816cf63 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFAcceleratorTableTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFAcceleratorTableTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ObjectYAML/DWARFEmitter.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
@@ -46,4 +47,256 @@ TEST(DWARFDebugNames, TooSmallForDWARF64) {
"data at offset 0x2b while reading [0x28, 0x2c)"));
}
+TEST(DWARFDebugNames, BasicTestEntries) {
+ const char *Yamldata = R"(
+--- !ELF
+ debug_str:
+ - 'NameType1'
+ - 'NameType2'
+
+ debug_names:
+ Abbreviations:
+ - Code: 0x1
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_compile_unit
+ Form: DW_FORM_data4
+ - Idx: DW_IDX_die_offset
+ Form: DW_FORM_ref4
+ Entries:
+ - Name: 0x0 # strp to NameType1
+ Code: 0x1
+ Values:
+ - 0x0 # Compile unit
+ - 0x0 # DIE Offset
+ - Name: 0xa # strp to NameType2
+ Code: 0x1
+ Values:
+ - 0x1 # Compile unit
+ - 0x1 # DIE Offset
+ - Name: 0x0 # strp to NameType1
+ Code: 0x1
+ Values:
+ - 0x2 # Compile unit
+ - 0x2 # DIE Offset
+
+)";
+
+ Expected<StringMap<std::unique_ptr<MemoryBuffer>>> Sections =
+ DWARFYAML::emitDebugSections(Yamldata,
+ /*IsLittleEndian=*/true,
+ /*Is64BitAddrSize=*/true);
+ ASSERT_THAT_EXPECTED(Sections, Succeeded());
+ auto Ctx = DWARFContext::create(*Sections, 4, /*isLittleEndian=*/true);
+ const DWARFDebugNames &DebugNames = Ctx->getDebugNames();
+ ASSERT_NE(DebugNames.begin(), DebugNames.end());
+ const DWARFDebugNames::NameIndex &NameIndex = *DebugNames.begin();
+
+ ASSERT_EQ(NameIndex.getNameCount(), 2u);
+ ASSERT_EQ(NameIndex.getBucketCount(), 0u);
+ ASSERT_EQ(NameIndex.getCUCount(), 1u);
+ ASSERT_EQ(NameIndex.getCUOffset(0), 0u);
+ ASSERT_EQ(NameIndex.getForeignTUCount(), 0u);
+ ASSERT_EQ(NameIndex.getLocalTUCount(), 0u);
+
+ // Check "NameEntries": there should be one per unique name.
+ // These are indexed starting on 1.
+ DWARFDebugNames::NameTableEntry FirstEntry = NameIndex.getNameTableEntry(1);
+ ASSERT_EQ(FirstEntry.getString(), StringRef("NameType1"));
+ DWARFDebugNames::NameTableEntry SecondEntry = NameIndex.getNameTableEntry(2);
+ ASSERT_EQ(SecondEntry.getString(), StringRef("NameType2"));
+
+ SmallVector<DWARFDebugNames::Entry> FirstNameEntries =
+ to_vector_of<DWARFDebugNames::Entry>(NameIndex.equal_range("NameType1"));
+ ASSERT_EQ(FirstNameEntries.size(), 2u);
+ ASSERT_EQ(FirstNameEntries[0].getCUIndex(), 0u);
+ ASSERT_EQ(FirstNameEntries[1].getCUIndex(), 0x2);
+ ASSERT_EQ(FirstNameEntries[0].getDIEUnitOffset(), 0x0);
+ ASSERT_EQ(FirstNameEntries[1].getDIEUnitOffset(), 0x2);
+
+ SmallVector<DWARFDebugNames::Entry> SecondNameEntries =
+ to_vector_of<DWARFDebugNames::Entry>(NameIndex.equal_range("NameType2"));
+ ASSERT_EQ(SecondNameEntries.size(), 1u);
+ ASSERT_EQ(SecondNameEntries[0].getCUIndex(), 0x1);
+ ASSERT_EQ(SecondNameEntries[0].getDIEUnitOffset(), 0x1);
+}
+
+TEST(DWARFDebugNames, ParentEntries) {
+ const char *Yamldata = R"(
+--- !ELF
+ debug_str:
+ - 'Name1'
+ - 'Name2'
+ - 'Name3'
+ - 'Name4'
+ debug_names:
+ Abbreviations:
+ - Code: 0x11
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_parent
+ Form: DW_FORM_flag_present
+ - Idx: DW_IDX_die_offset
+ Form: DW_FORM_ref4
+ - Code: 0x22
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_parent
+ Form: DW_FORM_ref4
+ - Idx: DW_IDX_die_offset
+ Form: DW_FORM_ref4
+ - Code: 0x33
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_die_offset
+ Form: DW_FORM_ref4
+ Entries:
+ - Name: 0x0 # strp to Name1
+ Code: 0x11
+ Values:
+ - 0x0 # Die offset
+ - Name: 0x6 # strp to Name2
+ Code: 0x22
+ Values:
+ - 0x0 # Parent = First entry
+ - 0x1 # Die offset
+ - Name: 0xc # strp to Name3
+ Code: 0x22
+ Values:
+ - 0x6 # Parent = Second entry
+ - 0x1 # Die offset
+ - Name: 0x12 # strp to Name4
+ Code: 0x33
+ Values:
+ - 0x1 # Die offset
+)";
+
+ Expected<StringMap<std::unique_ptr<MemoryBuffer>>> Sections =
+ DWARFYAML::emitDebugSections(Yamldata,
+ /*IsLittleEndian=*/true,
+ /*Is64BitAddrSize=*/true);
+ ASSERT_THAT_EXPECTED(Sections, Succeeded());
+ auto Ctx = DWARFContext::create(*Sections, 4, /*isLittleEndian=*/true);
+ const DWARFDebugNames &DebugNames = Ctx->getDebugNames();
+ ASSERT_NE(DebugNames.begin(), DebugNames.end());
+ const DWARFDebugNames::NameIndex &NameIndex = *DebugNames.begin();
+
+ SmallVector<DWARFDebugNames::Entry> Name1Entries =
+ to_vector_of<DWARFDebugNames::Entry>(NameIndex.equal_range("Name1"));
+ ASSERT_EQ(Name1Entries.size(), 1u);
+ Expected<std::optional<DWARFDebugNames::Entry>> Name1Parent =
+ Name1Entries[0].getParentDIEEntry();
+ ASSERT_THAT_EXPECTED(Name1Parent, Succeeded());
+ ASSERT_EQ(*Name1Parent, std::nullopt); // Name1 has no parent
+
+ SmallVector<DWARFDebugNames::Entry> Name2Entries =
+ to_vector_of<DWARFDebugNames::Entry>(NameIndex.equal_range("Name2"));
+ ASSERT_EQ(Name2Entries.size(), 1u);
+ Expected<std::optional<DWARFDebugNames::Entry>> Name2Parent =
+ Name2Entries[0].getParentDIEEntry();
+ ASSERT_THAT_EXPECTED(Name2Parent, Succeeded());
+ ASSERT_EQ((**Name2Parent).getDIEUnitOffset(), 0x0); // Name2 parent == Name1
+
+ SmallVector<DWARFDebugNames::Entry> Name3Entries =
+ to_vector_of<DWARFDebugNames::Entry>(NameIndex.equal_range("Name3"));
+ ASSERT_EQ(Name3Entries.size(), 1u);
+ Expected<std::optional<DWARFDebugNames::Entry>> Name3Parent =
+ Name3Entries[0].getParentDIEEntry();
+ ASSERT_THAT_EXPECTED(Name3Parent, Succeeded());
+ ASSERT_EQ((**Name3Parent).getDIEUnitOffset(), 0x1); // Name3 parent == Name2
+
+ SmallVector<DWARFDebugNames::Entry> Name4Entries =
+ to_vector_of<DWARFDebugNames::Entry>(NameIndex.equal_range("Name4"));
+ ASSERT_EQ(Name4Entries.size(), 1u);
+ ASSERT_FALSE(Name4Entries[0].hasParentInformation());
+}
+
+TEST(DWARFDebugNames, InvalidAbbrevCode) {
+ const char *Yamldata = R"(
+--- !ELF
+ debug_str:
+ - 'NameType1'
+
+ debug_names:
+ Abbreviations:
+ - Code: 0x1
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_compile_unit
+ Form: DW_FORM_data4
+ Entries:
+ - Name: 0x0 # strp to NameType1
+ Code: 0x123456
+ Values:
+ - 0x0 # Compile unit
+)";
+
+ Expected<StringMap<std::unique_ptr<MemoryBuffer>>> Sections =
+ DWARFYAML::emitDebugSections(Yamldata,
+ /*IsLittleEndian=*/true,
+ /*Is64BitAddrSize=*/true);
+ ASSERT_THAT_EXPECTED(
+ Sections,
+ FailedWithMessage("did not find an Abbreviation for this code"));
+}
+
+TEST(DWARFDebugNames, InvalidNumOfValues) {
+ const char *Yamldata = R"(
+--- !ELF
+ debug_str:
+ - 'NameType1'
+
+ debug_names:
+ Abbreviations:
+ - Code: 0x1
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_compile_unit
+ Form: DW_FORM_data4
+ Entries:
+ - Name: 0x0 # strp to NameType1
+ Code: 0x1
+ Values:
+ - 0x0 # Compile unit
+ - 0x0 # Compile unit
+ - 0x0 # Compile unit
+)";
+
+ Expected<StringMap<std::unique_ptr<MemoryBuffer>>> Sections =
+ DWARFYAML::emitDebugSections(Yamldata,
+ /*IsLittleEndian=*/true,
+ /*Is64BitAddrSize=*/true);
+ ASSERT_THAT_EXPECTED(
+ Sections, FailedWithMessage(
+ "mismatch between provided and required number of values"));
+}
+
+TEST(DWARFDebugNames, UnsupportedForm) {
+ const char *Yamldata = R"(
+--- !ELF
+ debug_str:
+ - 'NameType1'
+
+ debug_names:
+ Abbreviations:
+ - Code: 0x1
+ Tag: DW_TAG_namespace
+ Indices:
+ - Idx: DW_IDX_compile_unit
+ Form: DW_FORM_strx
+ Entries:
+ - Name: 0x0 # strp to NameType1
+ Code: 0x1
+ Values:
+ - 0x0 # Compile unit
+)";
+
+ Expected<StringMap<std::unique_ptr<MemoryBuffer>>> Sections =
+ DWARFYAML::emitDebugSections(Yamldata,
+ /*IsLittleEndian=*/true,
+ /*Is64BitAddrSize=*/true);
+ ASSERT_THAT_EXPECTED(
+ Sections,
+ FailedWithMessage("unsupported Form for YAML debug_names emitter"));
+}
} // end anonymous namespace
diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
index f7ee7693ac2e..621b6feac8d2 100644
--- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
+++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
@@ -18,6 +18,7 @@
#include "llvm/DebugInfo/GSYM/GsymReader.h"
#include "llvm/DebugInfo/GSYM/Header.h"
#include "llvm/DebugInfo/GSYM/InlineInfo.h"
+#include "llvm/DebugInfo/GSYM/OutputAggregator.h"
#include "llvm/DebugInfo/GSYM/StringTable.h"
#include "llvm/ObjectYAML/DWARFEmitter.h"
#include "llvm/Support/DataExtractor.h"
@@ -971,9 +972,10 @@ TEST(GSYMTest, TestGsymCreatorEncodeErrors) {
"GsymCreator wasn't finalized prior to encoding");
std::string finalizeIssues;
raw_string_ostream OS(finalizeIssues);
- llvm::Error finalizeErr = GC.finalize(OS);
+ OutputAggregator Agg(&OS);
+ llvm::Error finalizeErr = GC.finalize(Agg);
ASSERT_FALSE(bool(finalizeErr));
- finalizeErr = GC.finalize(OS);
+ finalizeErr = GC.finalize(Agg);
ASSERT_TRUE(bool(finalizeErr));
checkError("already finalized", std::move(finalizeErr));
// Verify we get an error trying to encode a GsymCreator with a UUID that is
@@ -1043,7 +1045,8 @@ TEST(GSYMTest, TestGsymCreator1ByteAddrOffsets) {
const uint32_t Func2Name = GC.insertString("bar");
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x00, 0x10, Func1Name));
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x20, 0x10, Func2Name));
- Error Err = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error Err = GC.finalize(Null);
ASSERT_FALSE(Err);
TestEncodeDecode(GC, llvm::endianness::little, GSYM_VERSION, AddrOffSize,
BaseAddr,
@@ -1065,7 +1068,8 @@ TEST(GSYMTest, TestGsymCreator2ByteAddrOffsets) {
const uint32_t Func2Name = GC.insertString("bar");
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x000, 0x100, Func1Name));
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x200, 0x100, Func2Name));
- Error Err = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error Err = GC.finalize(Null);
ASSERT_FALSE(Err);
TestEncodeDecode(GC, llvm::endianness::little, GSYM_VERSION, AddrOffSize,
BaseAddr,
@@ -1087,7 +1091,8 @@ TEST(GSYMTest, TestGsymCreator4ByteAddrOffsets) {
const uint32_t Func2Name = GC.insertString("bar");
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x000, 0x100, Func1Name));
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x20000, 0x100, Func2Name));
- Error Err = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error Err = GC.finalize(Null);
ASSERT_FALSE(Err);
TestEncodeDecode(GC, llvm::endianness::little, GSYM_VERSION, AddrOffSize,
BaseAddr,
@@ -1109,7 +1114,8 @@ TEST(GSYMTest, TestGsymCreator8ByteAddrOffsets) {
const uint32_t Func2Name = GC.insertString("bar");
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x000, 0x100, Func1Name));
GC.addFunctionInfo(FunctionInfo(BaseAddr+0x100000000, 0x100, Func2Name));
- Error Err = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error Err = GC.finalize(Null);
ASSERT_FALSE(Err);
TestEncodeDecode(GC, llvm::endianness::little, GSYM_VERSION, AddrOffSize,
BaseAddr,
@@ -1148,7 +1154,8 @@ TEST(GSYMTest, TestGsymReader) {
const auto ByteOrder = llvm::endianness::native;
GC.addFunctionInfo(FunctionInfo(Func1Addr, FuncSize, Func1Name));
GC.addFunctionInfo(FunctionInfo(Func2Addr, FuncSize, Func2Name));
- Error FinalizeErr = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error FinalizeErr = GC.finalize(Null);
ASSERT_FALSE(FinalizeErr);
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -1213,7 +1220,8 @@ TEST(GSYMTest, TestGsymLookups) {
Inline3.Ranges.insert(AddressRange(0x1016, 0x1018));
FI.Inline->Children.emplace_back(Inline3);
GC.addFunctionInfo(std::move(FI));
- Error FinalizeErr = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error FinalizeErr = GC.finalize(Null);
ASSERT_FALSE(FinalizeErr);
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -1329,11 +1337,12 @@ TEST(GSYMTest, TestDWARFFunctionWithAddresses) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -1406,11 +1415,12 @@ TEST(GSYMTest, TestDWARFFunctionWithAddressAndOffset) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -1513,11 +1523,12 @@ TEST(GSYMTest, TestDWARFStructMethodNoMangled) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -1613,6 +1624,7 @@ TEST(GSYMTest, TestDWARFTextRanges) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
// Only allow addresses between [0x1000 - 0x2000) to be linked into the
@@ -1621,8 +1633,8 @@ TEST(GSYMTest, TestDWARFTextRanges) {
TextRanges.insert(AddressRange(0x1000, 0x2000));
GC.SetValidTextRanges(TextRanges);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -1650,8 +1662,8 @@ TEST(GSYMTest, TestEmptySymbolEndAddressOfTextRanges) {
TextRanges.insert(AddressRange(0x1000, 0x2000));
GC.SetValidTextRanges(TextRanges);
GC.addFunctionInfo(FunctionInfo(0x1500, 0, GC.insertString("symbol")));
- auto &OS = llvm::nulls();
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ OutputAggregator Null(nullptr);
+ ASSERT_THAT_ERROR(GC.finalize(Null), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -1816,11 +1828,12 @@ TEST(GSYMTest, TestDWARFInlineInfo) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -2076,11 +2089,12 @@ TEST(GSYMTest, TestDWARFNoLines) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -2255,11 +2269,12 @@ TEST(GSYMTest, TestDWARFDeadStripAddr4) {
DWARFContext::create(*ErrOrSections, 4);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -2395,11 +2410,12 @@ TEST(GSYMTest, TestDWARFDeadStripAddr8) {
DWARFContext::create(*ErrOrSections, 8);
ASSERT_TRUE(DwarfContext.get() != nullptr);
auto &OS = llvm::nulls();
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -2430,7 +2446,8 @@ TEST(GSYMTest, TestGsymCreatorMultipleSymbolsWithNoSize) {
const uint32_t Func2Name = GC.insertString("bar");
GC.addFunctionInfo(FunctionInfo(BaseAddr, 0, Func1Name));
GC.addFunctionInfo(FunctionInfo(BaseAddr, 0, Func2Name));
- Error Err = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error Err = GC.finalize(Null);
ASSERT_FALSE(Err);
TestEncodeDecode(GC, llvm::endianness::little, GSYM_VERSION, AddrOffSize,
BaseAddr,
@@ -2485,7 +2502,8 @@ static void AddFunctionInfo(GsymCreator &GC, const char *FuncName,
// Finalize a GsymCreator, encode it and decode it and return the error or
// GsymReader that was successfully decoded.
static Expected<GsymReader> FinalizeEncodeAndDecode(GsymCreator &GC) {
- Error FinalizeErr = GC.finalize(llvm::nulls());
+ OutputAggregator Null(nullptr);
+ Error FinalizeErr = GC.finalize(Null);
if (FinalizeErr)
return std::move(FinalizeErr);
SmallString<1024> Str;
@@ -3033,11 +3051,12 @@ TEST(GSYMTest, TestDWARFInlineRangeScopes) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -3260,11 +3279,12 @@ TEST(GSYMTest, TestDWARFEmptyInline) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -3496,11 +3516,12 @@ TEST(GSYMTest, TestFinalizeForLineTables) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
const auto ByteOrder = llvm::endianness::native;
@@ -3775,11 +3796,12 @@ TEST(GSYMTest, TestRangeWarnings) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
OS.flush();
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -3977,11 +3999,12 @@ TEST(GSYMTest, TestEmptyRangeWarnings) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
OS.flush();
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -4129,11 +4152,12 @@ TEST(GSYMTest, TestEmptyLinkageName) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
OS.flush();
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -4290,11 +4314,12 @@ TEST(GSYMTest, TestLineTablesWithEmptyRanges) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
OS.flush();
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -4610,11 +4635,12 @@ TEST(GSYMTest, TestHandlingOfInvalidFileIndexes) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
OS.flush();
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
@@ -4825,11 +4851,12 @@ TEST(GSYMTest, TestLookupsOfOverlappingAndUnequalRanges) {
ASSERT_TRUE(DwarfContext.get() != nullptr);
std::string errors;
raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
GsymCreator GC;
DwarfTransformer DT(*DwarfContext, GC);
const uint32_t ThreadCount = 1;
- ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
- ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
OS.flush();
SmallString<512> Str;
raw_svector_ostream OutStrm(Str);
diff --git a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
index edd12ebb62e1..7ab3e40df745 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
@@ -189,8 +189,8 @@ TEST(ObjectLinkingLayerSearchGeneratorTest, AbsoluteSymbolsObjectLayer) {
return ExecutorAddr::fromPtr((void *)nullptr);
}
- Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(ArrayRef<LookupRequest> Request) override {
+ void lookupSymbolsAsync(ArrayRef<LookupRequest> Request,
+ SymbolLookupCompleteFn Complete) override {
std::vector<ExecutorSymbolDef> Result;
EXPECT_EQ(Request.size(), 1u);
for (auto &LR : Request) {
@@ -205,7 +205,7 @@ TEST(ObjectLinkingLayerSearchGeneratorTest, AbsoluteSymbolsObjectLayer) {
}
}
}
- return std::vector<tpctypes::LookupResult>{1, Result};
+ Complete(std::vector<tpctypes::LookupResult>{1, Result});
}
};
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index 827b4a9c0cc3..53b191c68838 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -27,11 +27,6 @@ using namespace llvm;
extern cl::opt<bool> UseNewDbgInfoFormat;
-// None of these tests are meaningful or do anything if we do not have the
-// experimental "head" bit compiled into ilist_iterator (aka
-// ilist_iterator_w_bits), thus there's no point compiling these tests in.
-#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
-
static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
SMDiagnostic Err;
std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
@@ -1476,6 +1471,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
// ... except for some dangling DPValues.
EXPECT_NE(Exit.getTrailingDPValues(), nullptr);
EXPECT_FALSE(Exit.getTrailingDPValues()->empty());
+ Exit.getTrailingDPValues()->eraseFromParent();
Exit.deleteTrailingDPValues();
UseNewDbgInfoFormat = false;
@@ -1534,4 +1530,3 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
}
} // End anonymous namespace.
-#endif // EXPERIMENTAL_DEBUGINFO_ITERATORS
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index e505af5d3275..34a162a5514e 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -2565,6 +2565,12 @@ TEST_F(ConstantRangeTest, binaryXor) {
EXPECT_EQ(R16_35.binaryXor(R0_99), ConstantRange(APInt(8, 0), APInt(8, 128)));
EXPECT_EQ(R0_99.binaryXor(R16_35), ConstantRange(APInt(8, 0), APInt(8, 128)));
+ // Treat xor A, B as sub nsw nuw A, B
+ ConstantRange R0_51(APInt(8, 0), APInt(8, 51));
+ ConstantRange R63(APInt(8, 63));
+ EXPECT_EQ(R0_51.binaryXor(R63), ConstantRange(APInt(8, 13), APInt(8, 64)));
+ EXPECT_EQ(R63.binaryXor(R0_51), ConstantRange(APInt(8, 13), APInt(8, 64)));
+
TestBinaryOpExhaustive(
[](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.binaryXor(CR2);
diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 314264b77a71..1d6a92c498b0 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -736,5 +736,44 @@ TEST(ConstantsTest, ComdatUserTracking) {
EXPECT_TRUE(Users.size() == 0);
}
+// Verify that the C API getters for BlockAddress work
+TEST(ConstantsTest, BlockAddressCAPITest) {
+ const char *BlockAddressIR = R"(
+ define void @test_block_address_func() {
+ entry:
+ br label %block_bb_0
+ block_bb_0:
+ ret void
+ }
+ )";
+
+ LLVMContext Context;
+ SMDiagnostic Error;
+ std::unique_ptr<Module> M =
+ parseAssemblyString(BlockAddressIR, Error, Context);
+
+ EXPECT_TRUE(M.get() != nullptr);
+
+ // Get the function
+ auto *Func = M->getFunction("test_block_address_func");
+ EXPECT_TRUE(Func != nullptr);
+
+ // Get the second basic block, since we can't use the entry one
+ const BasicBlock &BB = *(++Func->begin());
+ EXPECT_EQ(BB.getName(), "block_bb_0");
+
+ // Construct the C API values
+ LLVMValueRef BlockAddr = LLVMBlockAddress(wrap(Func), wrap(&BB));
+ EXPECT_TRUE(LLVMIsABlockAddress(BlockAddr));
+
+ // Get the Function/BasicBlock values back out
+ auto *OutFunc = unwrap(LLVMGetBlockAddressFunction(BlockAddr));
+ auto *OutBB = unwrap(LLVMGetBlockAddressBasicBlock(BlockAddr));
+
+ // Verify that they round-tripped properly
+ EXPECT_EQ(Func, OutFunc);
+ EXPECT_EQ(&BB, OutBB);
+}
+
} // end anonymous namespace
} // end namespace llvm
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp
index 885b1346cde1..883149c686b4 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -1889,4 +1889,26 @@ TEST_F(PatternMatchTest, ConstExpr) {
EXPECT_TRUE(match(V, m_ConstantExpr()));
}
+TEST_F(PatternMatchTest, PtrAdd) {
+ Type *PtrTy = PointerType::getUnqual(Ctx);
+ Type *IdxTy = Type::getInt64Ty(Ctx);
+ Constant *Null = Constant::getNullValue(PtrTy);
+ Constant *Offset = ConstantInt::get(IdxTy, 42);
+ Value *PtrAdd = IRB.CreatePtrAdd(Null, Offset);
+ Value *OtherGEP = IRB.CreateGEP(IdxTy, Null, Offset);
+ Value *PtrAddConst =
+ ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), Null, Offset);
+
+ Value *A, *B;
+ EXPECT_TRUE(match(PtrAdd, m_PtrAdd(m_Value(A), m_Value(B))));
+ EXPECT_EQ(A, Null);
+ EXPECT_EQ(B, Offset);
+
+ EXPECT_TRUE(match(PtrAddConst, m_PtrAdd(m_Value(A), m_Value(B))));
+ EXPECT_EQ(A, Null);
+ EXPECT_EQ(B, Offset);
+
+ EXPECT_FALSE(match(OtherGEP, m_PtrAdd(m_Value(A), m_Value(B))));
+}
+
} // anonymous namespace.
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index 2849781a9dc4..6f6718fbd945 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -197,8 +197,7 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
auto &Regions = InputFunctions.back().Regions;
unsigned FileID = getFileIndexForFunction(File);
Regions.push_back(CounterMappingRegion::makeDecisionRegion(
- CounterMappingRegion::MCDCParameters{Mask, NC}, FileID, LS, CS, LE,
- CE));
+ mcdc::DecisionParameters{Mask, NC}, FileID, LS, CS, LE, CE));
}
void addMCDCBranchCMR(Counter C1, Counter C2, unsigned ID, unsigned TrueID,
@@ -207,8 +206,8 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
auto &Regions = InputFunctions.back().Regions;
unsigned FileID = getFileIndexForFunction(File);
Regions.push_back(CounterMappingRegion::makeBranchRegion(
- C1, C2, CounterMappingRegion::MCDCParameters{0, 0, ID, TrueID, FalseID},
- FileID, LS, CS, LE, CE));
+ C1, C2, FileID, LS, CS, LE, CE,
+ mcdc::BranchParameters{ID, TrueID, FalseID}));
}
void addExpansionCMR(StringRef File, StringRef ExpandedFile, unsigned LS,
diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
index 5836239bc56f..5f3ce53f5d27 100644
--- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
+++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
@@ -94,6 +94,69 @@ TEST_P(RISCVInstrInfoTest, IsAddImmediate) {
}
}
+TEST_P(RISCVInstrInfoTest, IsCopyInstrImpl) {
+ const RISCVInstrInfo *TII = ST->getInstrInfo();
+ DebugLoc DL;
+
+ // ADDI.
+
+ MachineInstr *MI1 = BuildMI(*MF, DL, TII->get(RISCV::ADDI), RISCV::X1)
+ .addReg(RISCV::X2)
+ .addImm(-128)
+ .getInstr();
+ auto MI1Res = TII->isCopyInstrImpl(*MI1);
+ EXPECT_FALSE(MI1Res.has_value());
+
+ MachineInstr *MI2 = BuildMI(*MF, DL, TII->get(RISCV::ADDI), RISCV::X1)
+ .addReg(RISCV::X2)
+ .addImm(0)
+ .getInstr();
+ auto MI2Res = TII->isCopyInstrImpl(*MI2);
+ ASSERT_TRUE(MI2Res.has_value());
+ EXPECT_EQ(MI2Res->Destination->getReg(), RISCV::X1);
+ EXPECT_EQ(MI2Res->Source->getReg(), RISCV::X2);
+
+ // Partial coverage of FSGNJ_* instructions.
+
+ MachineInstr *MI3 = BuildMI(*MF, DL, TII->get(RISCV::FSGNJ_D), RISCV::F1_D)
+ .addReg(RISCV::F2_D)
+ .addReg(RISCV::F1_D)
+ .getInstr();
+ auto MI3Res = TII->isCopyInstrImpl(*MI3);
+ EXPECT_FALSE(MI3Res.has_value());
+
+ MachineInstr *MI4 = BuildMI(*MF, DL, TII->get(RISCV::FSGNJ_D), RISCV::F1_D)
+ .addReg(RISCV::F2_D)
+ .addReg(RISCV::F2_D)
+ .getInstr();
+ auto MI4Res = TII->isCopyInstrImpl(*MI4);
+ ASSERT_TRUE(MI4Res.has_value());
+ EXPECT_EQ(MI4Res->Destination->getReg(), RISCV::F1_D);
+ EXPECT_EQ(MI4Res->Source->getReg(), RISCV::F2_D);
+
+ // ADD. TODO: Should return true for add reg, x0 and add x0, reg.
+ MachineInstr *MI5 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1)
+ .addReg(RISCV::X2)
+ .addReg(RISCV::X3)
+ .getInstr();
+ auto MI5Res = TII->isCopyInstrImpl(*MI5);
+ EXPECT_FALSE(MI5Res.has_value());
+
+ MachineInstr *MI6 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1)
+ .addReg(RISCV::X0)
+ .addReg(RISCV::X2)
+ .getInstr();
+ auto MI6Res = TII->isCopyInstrImpl(*MI6);
+ EXPECT_FALSE(MI6Res.has_value());
+
+ MachineInstr *MI7 = BuildMI(*MF, DL, TII->get(RISCV::ADD), RISCV::X1)
+ .addReg(RISCV::X2)
+ .addReg(RISCV::X0)
+ .getInstr();
+ auto MI7Res = TII->isCopyInstrImpl(*MI7);
+ EXPECT_FALSE(MI7Res.has_value());
+}
+
TEST_P(RISCVInstrInfoTest, GetMemOperandsWithOffsetWidth) {
const RISCVInstrInfo *TII = ST->getInstrInfo();
const TargetRegisterInfo *TRI = ST->getRegisterInfo();
diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp
index 5f151616d7ca..6aa1d7a087eb 100644
--- a/llvm/unittests/TargetParser/Host.cpp
+++ b/llvm/unittests/TargetParser/Host.cpp
@@ -122,6 +122,9 @@ TEST(getLinuxHostCPUName, AArch64) {
EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n"
"CPU part : 0xac4"),
"ampere1a");
+ EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n"
+ "CPU part : 0xac5"),
+ "ampere1b");
// MSM8992/4 weirdness
StringRef MSM8992ProcCpuInfo = R"(
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index e7f997304eb4..c6ee39fa4160 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -976,6 +976,14 @@ TEST(TargetParserTest, ARMparseArchVersion) {
EXPECT_EQ(5u, ARM::parseArchVersion(ARMArch[i]));
}
+TEST(TargetParserTest, ARMparseArchMinorVersion) {
+ for (unsigned i = 0; i < std::size(ARMArch); i++)
+ if (((std::string)ARMArch[i]).find(".") == 5)
+ EXPECT_EQ((ARMArch[i][6] - 48u), ARM::parseArchMinorVersion(ARMArch[i]));
+ else
+ EXPECT_EQ(0u, ARM::parseArchMinorVersion(ARMArch[i]));
+}
+
TEST(TargetParserTest, getARMCPUForArch) {
// Platform specific defaults.
{
@@ -1601,6 +1609,18 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_PAUTH})),
"8.6-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
+ "ampere1b", "armv8.7-a", "crypto-neon-fp-armv8",
+ (AArch64::ExtensionBitset(
+ {AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_FP16,
+ AArch64::AEK_SIMD, AArch64::AEK_RAS, AArch64::AEK_LSE,
+ AArch64::AEK_RDM, AArch64::AEK_RCPC, AArch64::AEK_DOTPROD,
+ AArch64::AEK_SM4, AArch64::AEK_SHA3, AArch64::AEK_BF16,
+ AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_I8MM,
+ AArch64::AEK_SSBS, AArch64::AEK_SB, AArch64::AEK_RAND,
+ AArch64::AEK_MTE, AArch64::AEK_JSCVT, AArch64::AEK_FCMA,
+ AArch64::AEK_PAUTH, AArch64::AEK_CSSC})),
+ "8.7-A"),
+ ARMCPUTestParams<AArch64::ExtensionBitset>(
"neoverse-512tvb", "armv8.4-a", "crypto-neon-fp-armv8",
(AArch64::ExtensionBitset(
{AArch64::AEK_RAS, AArch64::AEK_SVE, AArch64::AEK_SSBS,
@@ -1679,7 +1699,7 @@ INSTANTIATE_TEST_SUITE_P(
ARMCPUTestParams<AArch64::ExtensionBitset>::PrintToStringParamName);
// Note: number of CPUs includes aliases.
-static constexpr unsigned NumAArch64CPUArchs = 68;
+static constexpr unsigned NumAArch64CPUArchs = 69;
TEST(TargetParserTest, testAArch64CPUArchList) {
SmallVector<StringRef, NumAArch64CPUArchs> List;
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 011d96a48d61..2c2c39a3ef54 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -140,10 +140,11 @@ class AsmMatcherInfo;
// RegisterSets can be seen in the outputted AsmMatcher tables occasionally, and
// can even affect compiler output (at least seen in diagnostics produced when
// all matches fail). So we use a type that sorts them consistently.
-typedef std::set<Record*, LessRecordByID> RegisterSet;
+typedef std::set<Record *, LessRecordByID> RegisterSet;
class AsmMatcherEmitter {
RecordKeeper &Records;
+
public:
AsmMatcherEmitter(RecordKeeper &R) : Records(R) {}
@@ -166,7 +167,7 @@ struct ClassInfo {
/// The (first) user defined class, subsequent user defined classes are
/// UserClass0+1, and so on.
- UserClass0 = 1<<16
+ UserClass0 = 1 << 16
};
/// Kind - The class kind, which is either a predefined kind, or (UserClass0 +
@@ -176,7 +177,7 @@ struct ClassInfo {
/// SuperClasses - The super classes of this class. Note that for simplicities
/// sake user operands only record their immediate super class, while register
/// operands include all superclasses.
- std::vector<ClassInfo*> SuperClasses;
+ std::vector<ClassInfo *> SuperClasses;
/// Name - The full class name, suitable for use in an enum.
std::string Name;
@@ -204,10 +205,12 @@ struct ClassInfo {
/// For register classes: the records for all the registers in this class.
RegisterSet Registers;
- /// For custom match classes: the diagnostic kind for when the predicate fails.
+ /// For custom match classes: the diagnostic kind for when the predicate
+ /// fails.
std::string DiagnosticType;
- /// For custom match classes: the diagnostic string for when the predicate fails.
+ /// For custom match classes: the diagnostic string for when the predicate
+ /// fails.
std::string DiagnosticString;
/// Is this operand optional and not always required.
@@ -224,9 +227,7 @@ public:
}
/// isUserClass() - Check if this is a user defined class.
- bool isUserClass() const {
- return Kind >= UserClass0;
- }
+ bool isUserClass() const { return Kind >= UserClass0; }
/// isRelatedTo - Check whether this class is "related" to \p RHS. Classes
/// are related if they are in the same class hierarchy.
@@ -241,11 +242,10 @@ public:
if (!isRegisterClass() || !RHS.isRegisterClass())
return false;
- RegisterSet Tmp;
- std::insert_iterator<RegisterSet> II(Tmp, Tmp.begin());
+ std::vector<Record *> Tmp;
std::set_intersection(Registers.begin(), Registers.end(),
RHS.Registers.begin(), RHS.Registers.end(),
- II, LessRecordByID());
+ std::back_inserter(Tmp), LessRecordByID());
return !Tmp.empty();
}
@@ -469,7 +469,7 @@ struct MatchableInfo {
unsigned SrcOperand2) {
ResOperand X;
X.Kind = TiedOperand;
- X.TiedOperands = { TiedOperandNum, SrcOperand1, SrcOperand2 };
+ X.TiedOperands = {TiedOperandNum, SrcOperand1, SrcOperand2};
X.MINumOperands = 1;
return X;
}
@@ -503,7 +503,7 @@ struct MatchableInfo {
Record *const TheDef;
/// DefRec - This is the definition that it came from.
- PointerUnion<const CodeGenInstruction*, const CodeGenInstAlias*> DefRec;
+ PointerUnion<const CodeGenInstruction *, const CodeGenInstAlias *> DefRec;
const CodeGenInstruction *getResultInst() const {
if (isa<const CodeGenInstruction *>(DefRec))
@@ -542,16 +542,13 @@ struct MatchableInfo {
bool UseInstAsmMatchConverter;
MatchableInfo(const CodeGenInstruction &CGI)
- : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef), DefRec(&CGI),
- UseInstAsmMatchConverter(true) {
- }
+ : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef),
+ DefRec(&CGI), UseInstAsmMatchConverter(true) {}
MatchableInfo(std::unique_ptr<const CodeGenInstAlias> Alias)
- : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef),
- DefRec(Alias.release()),
- UseInstAsmMatchConverter(
- TheDef->getValueAsBit("UseInstAsmMatchConverter")) {
- }
+ : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef),
+ DefRec(Alias.release()), UseInstAsmMatchConverter(TheDef->getValueAsBit(
+ "UseInstAsmMatchConverter")) {}
// Could remove this and the dtor if PointerUnion supported unique_ptr
// elements with a dynamic failure/assertion (like the one below) in the case
@@ -576,9 +573,8 @@ struct MatchableInfo {
void formTwoOperandAlias(StringRef Constraint);
void initialize(const AsmMatcherInfo &Info,
- SmallPtrSetImpl<Record*> &SingletonRegisters,
- AsmVariantInfo const &Variant,
- bool HasMnemonicFirst);
+ SmallPtrSetImpl<Record *> &SingletonRegisters,
+ AsmVariantInfo const &Variant, bool HasMnemonicFirst);
/// validate - Return true if this matchable is a valid thing to match against
/// and perform a bunch of validity checking.
@@ -603,9 +599,9 @@ struct MatchableInfo {
}
int findAsmOperandOriginallyNamed(StringRef N) const {
- auto I =
- find_if(AsmOperands,
- [&](const AsmOperand &Op) { return Op.OrigSrcOpName == N; });
+ auto I = find_if(AsmOperands, [&](const AsmOperand &Op) {
+ return Op.OrigSrcOpName == N;
+ });
return (I != AsmOperands.end()) ? I - AsmOperands.begin() : -1;
}
@@ -706,7 +702,7 @@ private:
struct OperandMatchEntry {
unsigned OperandMask;
- const MatchableInfo* MI;
+ const MatchableInfo *MI;
ClassInfo *CI;
static OperandMatchEntry create(const MatchableInfo *mi, ClassInfo *ci,
@@ -740,21 +736,21 @@ public:
std::vector<OperandMatchEntry> OperandMatchInfo;
/// Map of Register records to their class information.
- typedef std::map<Record*, ClassInfo*, LessRecordByID> RegisterClassesTy;
+ typedef std::map<Record *, ClassInfo *, LessRecordByID> RegisterClassesTy;
RegisterClassesTy RegisterClasses;
/// Map of Predicate records to their subtarget information.
std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures;
/// Map of AsmOperandClass records to their class information.
- std::map<Record*, ClassInfo*> AsmOperandClasses;
+ std::map<Record *, ClassInfo *> AsmOperandClasses;
/// Map of RegisterClass records to their class information.
- std::map<Record*, ClassInfo*> RegisterClassClasses;
+ std::map<Record *, ClassInfo *> RegisterClassClasses;
private:
/// Map of token to class information which has already been constructed.
- std::map<std::string, ClassInfo*> TokenClasses;
+ std::map<std::string, ClassInfo *> TokenClasses;
private:
/// getTokenClass - Lookup or create the class for the given token.
@@ -767,7 +763,7 @@ private:
/// buildRegisterClasses - Build the ClassInfo* instances for register
/// classes.
- void buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters);
+ void buildRegisterClasses(SmallPtrSetImpl<Record *> &SingletonRegisters);
/// buildOperandClasses - Build the ClassInfo* instances for user defined
/// operand classes.
@@ -779,8 +775,7 @@ private:
MatchableInfo::AsmOperand &Op);
public:
- AsmMatcherInfo(Record *AsmParser,
- CodeGenTarget &Target,
+ AsmMatcherInfo(Record *AsmParser, CodeGenTarget &Target,
RecordKeeper &Records);
/// Construct the various tables used during matching.
@@ -798,9 +793,7 @@ public:
return I == SubtargetFeatures.end() ? nullptr : &I->second;
}
- RecordKeeper &getRecords() const {
- return Records;
- }
+ RecordKeeper &getRecords() const { return Records; }
bool hasOptionalOperands() const {
return any_of(Classes,
@@ -812,7 +805,8 @@ public:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MatchableInfo::dump() const {
- errs() << TheDef->getName() << " -- " << "flattened:\"" << AsmString <<"\"\n";
+ errs() << TheDef->getName() << " -- "
+ << "flattened:\"" << AsmString << "\"\n";
errs() << " variant: " << AsmVariantID << "\n";
@@ -850,7 +844,7 @@ parseTwoOperandConstraint(StringRef S, ArrayRef<SMLoc> Loc) {
void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
// Figure out which operands are aliased and mark them as tied.
std::pair<StringRef, StringRef> Ops =
- parseTwoOperandConstraint(Constraint, TheDef->getLoc());
+ parseTwoOperandConstraint(Constraint, TheDef->getLoc());
// Find the AsmOperands that refer to the operands we're aliasing.
int SrcAsmOperand = findAsmOperandNamed(Ops.first);
@@ -858,11 +852,11 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
if (SrcAsmOperand == -1)
PrintFatalError(TheDef->getLoc(),
"unknown source two-operand alias operand '" + Ops.first +
- "'.");
+ "'.");
if (DstAsmOperand == -1)
PrintFatalError(TheDef->getLoc(),
"unknown destination two-operand alias operand '" +
- Ops.second + "'.");
+ Ops.second + "'.");
// Find the ResOperand that refers to the operand we're aliasing away
// and update it to refer to the combined operand instead.
@@ -878,7 +872,7 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
// Adjust the ResOperand references to any AsmOperands that followed
// the one we just deleted.
for (ResOperand &Op : ResOperands) {
- switch(Op.Kind) {
+ switch (Op.Kind) {
default:
// Nothing to do for operands that don't reference AsmOperands.
break;
@@ -892,10 +886,9 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
/// extractSingletonRegisterForAsmOperand - Extract singleton register,
/// if present, from specified token.
-static void
-extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op,
- const AsmMatcherInfo &Info,
- StringRef RegisterPrefix) {
+static void extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op,
+ const AsmMatcherInfo &Info,
+ StringRef RegisterPrefix) {
StringRef Tok = Op.Token;
// If this token is not an isolated token, i.e., it isn't separated from
@@ -922,13 +915,12 @@ extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op,
}
void MatchableInfo::initialize(const AsmMatcherInfo &Info,
- SmallPtrSetImpl<Record*> &SingletonRegisters,
+ SmallPtrSetImpl<Record *> &SingletonRegisters,
AsmVariantInfo const &Variant,
bool HasMnemonicFirst) {
AsmVariantID = Variant.AsmVariantNo;
- AsmString =
- CodeGenInstruction::FlattenAsmStringVariants(AsmString,
- Variant.AsmVariantNo);
+ AsmString = CodeGenInstruction::FlattenAsmStringVariants(
+ AsmString, Variant.AsmVariantNo);
tokenizeAsmString(Info, Variant);
@@ -936,7 +928,7 @@ void MatchableInfo::initialize(const AsmMatcherInfo &Info,
// simple string, not a $foo variable or a singleton register.
if (AsmOperands.empty())
PrintFatalError(TheDef->getLoc(),
- "Instruction '" + TheDef->getName() + "' has no tokens");
+ "Instruction '" + TheDef->getName() + "' has no tokens");
assert(!AsmOperands[0].Token.empty());
if (HasMnemonicFirst) {
@@ -1045,7 +1037,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info,
size_t EndPos = String.find('}', i);
assert(EndPos != StringRef::npos &&
"Missing brace in operand reference!");
- addAsmOperand(String.slice(i, EndPos+1), IsIsolatedToken);
+ addAsmOperand(String.slice(i, EndPos + 1), IsIsolatedToken);
Prev = EndPos + 1;
i = EndPos;
IsIsolatedToken = false;
@@ -1070,16 +1062,16 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const {
// isCodeGenOnly if they are pseudo instructions.
if (AsmString.find('\n') != std::string::npos)
PrintFatalError(TheDef->getLoc(),
- "multiline instruction is not valid for the asmparser, "
- "mark it isCodeGenOnly");
+ "multiline instruction is not valid for the asmparser, "
+ "mark it isCodeGenOnly");
// Remove comments from the asm string. We know that the asmstring only
// has one line.
if (!CommentDelimiter.empty() &&
StringRef(AsmString).contains(CommentDelimiter))
PrintFatalError(TheDef->getLoc(),
- "asmstring for instruction has comment character in it, "
- "mark it isCodeGenOnly");
+ "asmstring for instruction has comment character in it, "
+ "mark it isCodeGenOnly");
// Reject matchables with operand modifiers, these aren't something we can
// handle, the target should be refactored to use operands instead of
@@ -1092,17 +1084,17 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const {
for (const AsmOperand &Op : AsmOperands) {
StringRef Tok = Op.Token;
if (Tok[0] == '$' && Tok.contains(':'))
- PrintFatalError(TheDef->getLoc(),
- "matchable with operand modifier '" + Tok +
- "' not supported by asm matcher. Mark isCodeGenOnly!");
+ PrintFatalError(
+ TheDef->getLoc(),
+ "matchable with operand modifier '" + Tok +
+ "' not supported by asm matcher. Mark isCodeGenOnly!");
// Verify that any operand is only mentioned once.
// We reject aliases and ignore instructions for now.
if (!IsAlias && TheDef->getValueAsString("AsmMatchConverter").empty() &&
Tok[0] == '$' && !OperandNames.insert(std::string(Tok)).second) {
LLVM_DEBUG({
errs() << "warning: '" << TheDef->getName() << "': "
- << "ignoring instruction with tied operand '"
- << Tok << "'\n";
+ << "ignoring instruction with tied operand '" << Tok << "'\n";
});
return false;
}
@@ -1116,15 +1108,33 @@ static std::string getEnumNameForToken(StringRef Str) {
for (char C : Str) {
switch (C) {
- case '*': Res += "_STAR_"; break;
- case '%': Res += "_PCT_"; break;
- case ':': Res += "_COLON_"; break;
- case '!': Res += "_EXCLAIM_"; break;
- case '.': Res += "_DOT_"; break;
- case '<': Res += "_LT_"; break;
- case '>': Res += "_GT_"; break;
- case '-': Res += "_MINUS_"; break;
- case '#': Res += "_HASH_"; break;
+ case '*':
+ Res += "_STAR_";
+ break;
+ case '%':
+ Res += "_PCT_";
+ break;
+ case ':':
+ Res += "_COLON_";
+ break;
+ case '!':
+ Res += "_EXCLAIM_";
+ break;
+ case '.':
+ Res += "_DOT_";
+ break;
+ case '<':
+ Res += "_LT_";
+ break;
+ case '>':
+ Res += "_GT_";
+ break;
+ case '-':
+ Res += "_MINUS_";
+ break;
+ case '#':
+ Res += "_HASH_";
+ break;
default:
if (isAlnum(C))
Res += C;
@@ -1166,8 +1176,7 @@ AsmMatcherInfo::getOperandClass(const CGIOperandList::OperandInfo &OI,
return getOperandClass(Rec, SubOpIdx);
}
-ClassInfo *
-AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
+ClassInfo *AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
if (Rec->isSubClassOf("RegisterOperand")) {
// RegisterOperand may have an associated ParserMatchClass. If it does,
// use it, else just fall back to the underlying register class.
@@ -1177,7 +1186,7 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
"Record `" + Rec->getName() +
"' does not have a ParserMatchClass!\n");
- if (DefInit *DI= dyn_cast<DefInit>(R->getValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(R->getValue())) {
Record *MatchClass = DI->getDef();
if (ClassInfo *CI = AsmOperandClasses[MatchClass])
return CI;
@@ -1186,8 +1195,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
// No custom match class. Just use the register class.
Record *ClassRec = Rec->getValueAsDef("RegClass");
if (!ClassRec)
- PrintFatalError(Rec->getLoc(), "RegisterOperand `" + Rec->getName() +
- "' has no associated register class!\n");
+ PrintFatalError(Rec->getLoc(),
+ "RegisterOperand `" + Rec->getName() +
+ "' has no associated register class!\n");
if (ClassInfo *CI = RegisterClassClasses[ClassRec])
return CI;
PrintFatalError(Rec->getLoc(), "register class has no class info!");
@@ -1200,8 +1210,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
}
if (!Rec->isSubClassOf("Operand"))
- PrintFatalError(Rec->getLoc(), "Operand `" + Rec->getName() +
- "' does not derive from class Operand!\n");
+ PrintFatalError(Rec->getLoc(),
+ "Operand `" + Rec->getName() +
+ "' does not derive from class Operand!\n");
Record *MatchClass = Rec->getValueAsDef("ParserMatchClass");
if (ClassInfo *CI = AsmOperandClasses[MatchClass])
return CI;
@@ -1210,19 +1221,18 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
}
struct LessRegisterSet {
- bool operator() (const RegisterSet &LHS, const RegisterSet & RHS) const {
+ bool operator()(const RegisterSet &LHS, const RegisterSet &RHS) const {
// std::set<T> defines its own compariso "operator<", but it
// performs a lexicographical comparison by T's innate comparison
// for some reason. We don't want non-deterministic pointer
// comparisons so use this instead.
- return std::lexicographical_compare(LHS.begin(), LHS.end(),
- RHS.begin(), RHS.end(),
- LessRecordByID());
+ return std::lexicographical_compare(LHS.begin(), LHS.end(), RHS.begin(),
+ RHS.end(), LessRecordByID());
}
};
-void AsmMatcherInfo::
-buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
+void AsmMatcherInfo::buildRegisterClasses(
+ SmallPtrSetImpl<Record *> &SingletonRegisters) {
const auto &Registers = Target.getRegBank().getRegisters();
auto &RegClassList = Target.getRegBank().getRegClasses();
@@ -1244,7 +1254,7 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
// Introduce derived sets where necessary (when a register does not determine
// a unique register set class), and build the mapping of registers to the set
// they should classify to.
- std::map<Record*, RegisterSet> RegisterMap;
+ std::map<Record *, RegisterSet> RegisterMap;
for (const CodeGenRegister &CGR : Registers) {
// Compute the intersection of all sets containing this register.
RegisterSet ContainingSet;
@@ -1259,11 +1269,10 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
}
RegisterSet Tmp;
- std::swap(Tmp, ContainingSet);
- std::insert_iterator<RegisterSet> II(ContainingSet,
- ContainingSet.begin());
- std::set_intersection(Tmp.begin(), Tmp.end(), RS.begin(), RS.end(), II,
- LessRecordByID());
+ std::set_intersection(ContainingSet.begin(), ContainingSet.end(),
+ RS.begin(), RS.end(),
+ std::inserter(Tmp, Tmp.begin()), LessRecordByID());
+ ContainingSet = std::move(Tmp);
}
if (!ContainingSet.empty()) {
@@ -1273,7 +1282,7 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
}
// Construct the register classes.
- std::map<RegisterSet, ClassInfo*, LessRegisterSet> RegisterSetClasses;
+ std::map<RegisterSet, ClassInfo *, LessRegisterSet> RegisterSetClasses;
unsigned Index = 0;
for (const RegisterSet &RS : RegisterSets) {
Classes.emplace_front();
@@ -1298,9 +1307,8 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
for (const RegisterSet &RS : RegisterSets) {
ClassInfo *CI = RegisterSetClasses[RS];
for (const RegisterSet &RS2 : RegisterSets)
- if (RS != RS2 &&
- std::includes(RS2.begin(), RS2.end(), RS.begin(), RS.end(),
- LessRecordByID()))
+ if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(),
+ RS.end(), LessRecordByID()))
CI->SuperClasses.push_back(RegisterSetClasses[RS2]);
}
@@ -1354,8 +1362,8 @@ buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
}
void AsmMatcherInfo::buildOperandClasses() {
- std::vector<Record*> AsmOperands =
- Records.getAllDerivedDefinitions("AsmOperandClass");
+ std::vector<Record *> AsmOperands =
+ Records.getAllDerivedDefinitions("AsmOperandClass");
// Pre-populate AsmOperandClasses map.
for (Record *Rec : AsmOperands) {
@@ -1438,11 +1446,9 @@ void AsmMatcherInfo::buildOperandClasses() {
}
}
-AsmMatcherInfo::AsmMatcherInfo(Record *asmParser,
- CodeGenTarget &target,
+AsmMatcherInfo::AsmMatcherInfo(Record *asmParser, CodeGenTarget &target,
RecordKeeper &records)
- : Records(records), AsmParser(asmParser), Target(target) {
-}
+ : Records(records), AsmParser(asmParser), Target(target) {}
/// buildOperandMatchInfo - Build the necessary information to handle user
/// defined operand parsing methods.
@@ -1476,8 +1482,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
for (const auto &OCM : OpClassMask) {
unsigned OpMask = OCM.second;
ClassInfo *CI = OCM.first;
- OperandMatchInfo.push_back(OperandMatchEntry::create(MI.get(), CI,
- OpMask));
+ OperandMatchInfo.push_back(
+ OperandMatchEntry::create(MI.get(), CI, OpMask));
}
}
}
@@ -1499,7 +1505,7 @@ void AsmMatcherInfo::buildInfo() {
// Parse the instructions; we need to do this first so that we can gather the
// singleton register classes.
- SmallPtrSet<Record*, 16> SingletonRegisters;
+ SmallPtrSet<Record *, 16> SingletonRegisters;
unsigned VariantCount = Target.getAsmParserVariantCount();
for (unsigned VC = 0; VC != VariantCount; ++VC) {
Record *AsmVariant = Target.getAsmParserVariant(VC);
@@ -1511,8 +1517,7 @@ void AsmMatcherInfo::buildInfo() {
AsmVariant->getValueAsString("TokenizingCharacters");
Variant.SeparatorCharacters =
AsmVariant->getValueAsString("SeparatorCharacters");
- Variant.BreakCharacters =
- AsmVariant->getValueAsString("BreakCharacters");
+ Variant.BreakCharacters = AsmVariant->getValueAsString("BreakCharacters");
Variant.Name = AsmVariant->getValueAsString("Name");
Variant.AsmVariantNo = AsmVariant->getValueAsInt("Variant");
@@ -1546,8 +1551,8 @@ void AsmMatcherInfo::buildInfo() {
// Parse all of the InstAlias definitions and stick them in the list of
// matchables.
- std::vector<Record*> AllInstAliases =
- Records.getAllDerivedDefinitions("InstAlias");
+ std::vector<Record *> AllInstAliases =
+ Records.getAllDerivedDefinitions("InstAlias");
for (Record *InstAlias : AllInstAliases) {
auto Alias = std::make_unique<CodeGenInstAlias>(InstAlias, Target);
@@ -1654,14 +1659,14 @@ void AsmMatcherInfo::buildInfo() {
// Process token alias definitions and set up the associated superclass
// information.
- std::vector<Record*> AllTokenAliases =
- Records.getAllDerivedDefinitions("TokenAlias");
+ std::vector<Record *> AllTokenAliases =
+ Records.getAllDerivedDefinitions("TokenAlias");
for (Record *Rec : AllTokenAliases) {
ClassInfo *FromClass = getTokenClass(Rec->getValueAsString("FromToken"));
ClassInfo *ToClass = getTokenClass(Rec->getValueAsString("ToToken"));
if (FromClass == ToClass)
PrintFatalError(Rec->getLoc(),
- "error: Destination value identical to source value.");
+ "error: Destination value identical to source value.");
FromClass->SuperClasses.push_back(ToClass);
}
@@ -1681,10 +1686,9 @@ void AsmMatcherInfo::buildInfo() {
/// buildInstructionOperandReference - The specified operand is a reference to a
/// named operand such as $src. Resolve the Class and OperandInfo pointers.
-void AsmMatcherInfo::
-buildInstructionOperandReference(MatchableInfo *II,
- StringRef OperandName,
- unsigned AsmOpIdx) {
+void AsmMatcherInfo::buildInstructionOperandReference(MatchableInfo *II,
+ StringRef OperandName,
+ unsigned AsmOpIdx) {
const CodeGenInstruction &CGI = *cast<const CodeGenInstruction *>(II->DefRec);
const CGIOperandList &Operands = CGI.Operands;
MatchableInfo::AsmOperand *Op = &II->AsmOperands[AsmOpIdx];
@@ -1708,7 +1712,8 @@ buildInstructionOperandReference(MatchableInfo *II,
for (unsigned SI = 1, SE = Operands[Idx].MINumOperands; SI != SE; ++SI) {
MatchableInfo::AsmOperand NewAsmOp(/*IsIsolatedToken=*/true, Token);
NewAsmOp.SubOpIdx = SI;
- II->AsmOperands.insert(II->AsmOperands.begin()+AsmOpIdx+SI, NewAsmOp);
+ II->AsmOperands.insert(II->AsmOperands.begin() + AsmOpIdx + SI,
+ NewAsmOp);
}
// Replace Op with first suboperand.
Op = &II->AsmOperands[AsmOpIdx]; // update the pointer in case it moved
@@ -1760,8 +1765,8 @@ void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II,
// Use the match class from the Alias definition, not the
// destination instruction, as we may have an immediate that's
// being munged by the match class.
- Op.Class = getOperandClass(CGA.ResultOperands[i].getRecord(),
- Op.SubOpIdx);
+ Op.Class =
+ getOperandClass(CGA.ResultOperands[i].getRecord(), Op.SubOpIdx);
Op.SrcOpName = OperandName;
Op.OrigSrcOpName = OperandName;
return;
@@ -1812,8 +1817,8 @@ void MatchableInfo::buildInstructionResultOperands() {
// Add a separate ResOperand for each suboperand.
for (unsigned AI = 0; AI < NumOperands; ++AI) {
- assert(AsmOperands[SrcOperand+AI].SubOpIdx == (int)AI &&
- AsmOperands[SrcOperand+AI].SrcOpName == OpInfo.Name &&
+ assert(AsmOperands[SrcOperand + AI].SubOpIdx == (int)AI &&
+ AsmOperands[SrcOperand + AI].SrcOpName == OpInfo.Name &&
"unexpected AsmOperands for suboperands");
ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand + AI, 1));
}
@@ -1886,8 +1891,9 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) {
// Handle all the suboperands for this operand.
const std::string &OpName = OpInfo->Name;
- for ( ; AliasOpNo < LastOpNo &&
- CGA.ResultInstOperandIndex[AliasOpNo].first == i; ++AliasOpNo) {
+ for (; AliasOpNo < LastOpNo &&
+ CGA.ResultInstOperandIndex[AliasOpNo].first == i;
+ ++AliasOpNo) {
int SubIdx = CGA.ResultInstOperandIndex[AliasOpNo].second;
// Find out what operand from the asmparser that this MCInst operand
@@ -1897,17 +1903,18 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) {
StringRef Name = CGA.ResultOperands[AliasOpNo].getName();
int SrcOperand = findAsmOperand(Name, SubIdx);
if (SrcOperand == -1)
- PrintFatalError(TheDef->getLoc(), "Instruction '" +
- TheDef->getName() + "' has operand '" + OpName +
- "' that doesn't appear in asm string!");
+ PrintFatalError(TheDef->getLoc(),
+ "Instruction '" + TheDef->getName() +
+ "' has operand '" + OpName +
+ "' that doesn't appear in asm string!");
// Add it to the operand references. If it is added a second time, the
// record won't be updated and it will fail later on.
OperandRefs.try_emplace(Name, SrcOperand);
unsigned NumOperands = (SubIdx == -1 ? OpInfo->MINumOperands : 1);
- ResOperands.push_back(ResOperand::getRenderedOp(SrcOperand,
- NumOperands));
+ ResOperands.push_back(
+ ResOperand::getRenderedOp(SrcOperand, NumOperands));
break;
}
case CodeGenInstAlias::ResultOperand::K_Imm: {
@@ -1952,7 +1959,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
raw_ostream &OS) {
SmallSetVector<CachedHashString, 16> OperandConversionKinds;
SmallSetVector<CachedHashString, 16> InstructionConversionKinds;
- std::vector<std::vector<uint8_t> > ConversionTable;
+ std::vector<std::vector<uint8_t>> ConversionTable;
size_t MaxRowLength = 2; // minimum is custom converter plus terminator.
// TargetOperandClass - This is the target's operand class, like X86Operand.
@@ -2009,7 +2016,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
CvtOS << " break;\n";
CvtOS << " case CVT_Tied: {\n";
CvtOS << " assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n";
- CvtOS << " std::begin(TiedAsmOperandTable)) &&\n";
+ CvtOS
+ << " std::begin(TiedAsmOperandTable)) &&\n";
CvtOS << " \"Tied operand not found\");\n";
CvtOS << " unsigned TiedResOpnd = TiedAsmOperandTable[OpIdx][0];\n";
CvtOS << " if (TiedResOpnd != (uint8_t)-1)\n";
@@ -2048,7 +2056,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
// Map of e.g. <0, 2, 3> -> "Tie_0_2_3" enum label.
std::map<std::tuple<uint8_t, uint8_t, uint8_t>, std::string>
- TiedOperandsEnumMap;
+ TiedOperandsEnumMap;
for (auto &II : Infos) {
// Check if we have a custom match function.
@@ -2073,8 +2081,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
ConversionTable.back().push_back(CVT_Done);
// Add the handler to the conversion driver function.
- CvtOS << " case CVT_"
- << getEnumNameForToken(AsmMatchConverter) << ":\n"
+ CvtOS << " case CVT_" << getEnumNameForToken(AsmMatchConverter)
+ << ":\n"
<< " " << AsmMatchConverter << "(Inst, Operands);\n"
<< " break;\n";
@@ -2088,7 +2096,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
std::vector<uint8_t> ConversionRow;
// Compute the convert enum and the case body.
- MaxRowLength = std::max(MaxRowLength, II->ResOperands.size()*2 + 1 );
+ MaxRowLength = std::max(MaxRowLength, II->ResOperands.size() * 2 + 1);
for (unsigned i = 0, e = II->ResOperands.size(); i != e; ++i) {
const MatchableInfo::ResOperand &OpInfo = II->ResOperands[i];
@@ -2098,7 +2106,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
case MatchableInfo::ResOperand::RenderAsmOperand: {
// This comes from something we parsed.
const MatchableInfo::AsmOperand &Op =
- II->AsmOperands[OpInfo.AsmOperandNum];
+ II->AsmOperands[OpInfo.AsmOperandNum];
// Registers are always converted the same, don't duplicate the
// conversion function based on them.
@@ -2111,8 +2119,9 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
// Add the conversion kind, if necessary, and get the associated ID
// the index of its entry in the vector).
- std::string Name = "CVT_" + (Op.Class->isRegisterClass() ? "Reg" :
- Op.Class->RenderMethod);
+ std::string Name =
+ "CVT_" +
+ (Op.Class->isRegisterClass() ? "Reg" : Op.Class->RenderMethod);
if (Op.Class->IsOptional) {
// For optional operands we must also care about DefaultMethod
assert(HasOptionalOperands);
@@ -2121,8 +2130,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
Name = getEnumNameForToken(Name);
bool IsNewConverter = false;
- unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
- IsNewConverter);
+ unsigned ID =
+ getConverterOperandID(Name, OperandConversionKinds, IsNewConverter);
// Add the operand entry to the instruction kind conversion row.
ConversionRow.push_back(ID);
@@ -2171,10 +2180,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
// operand from the earlier one.We can only tie single MCOperand values.
assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand");
uint8_t TiedOp = OpInfo.TiedOperands.ResOpnd;
- uint8_t SrcOp1 =
- OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst;
- uint8_t SrcOp2 =
- OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst;
+ uint8_t SrcOp1 = OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst;
+ uint8_t SrcOp2 = OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst;
assert((i > TiedOp || TiedOp == (uint8_t)-1) &&
"Tied operand precedes its target!");
auto TiedTupleName = std::string("Tie") + utostr(TiedOp) + '_' +
@@ -2198,8 +2205,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
std::string Name = "CVT_" + Ty;
bool IsNewConverter = false;
- unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
- IsNewConverter);
+ unsigned ID =
+ getConverterOperandID(Name, OperandConversionKinds, IsNewConverter);
// Add the operand entry to the instruction kind conversion row.
ConversionRow.push_back(ID);
ConversionRow.push_back(0);
@@ -2230,8 +2237,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
Signature += "__" + Name;
Name = "CVT_" + Name;
bool IsNewConverter = false;
- unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
- IsNewConverter);
+ unsigned ID =
+ getConverterOperandID(Name, OperandConversionKinds, IsNewConverter);
// Add the operand entry to the instruction kind conversion row.
ConversionRow.push_back(ID);
ConversionRow.push_back(0);
@@ -2289,9 +2296,8 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
OS << "static const uint8_t TiedAsmOperandTable[][3] = {\n";
for (auto &KV : TiedOperandsEnumMap) {
- OS << " /* " << KV.second << " */ { "
- << utostr(std::get<0>(KV.first)) << ", "
- << utostr(std::get<1>(KV.first)) << ", "
+ OS << " /* " << KV.second << " */ { " << utostr(std::get<0>(KV.first))
+ << ", " << utostr(std::get<1>(KV.first)) << ", "
<< utostr(std::get<2>(KV.first)) << " },\n";
}
OS << "};\n\n";
@@ -2402,7 +2408,8 @@ static void emitMatchClassEnumeration(CodeGenTarget &Target,
/// emitMatchClassDiagStrings - Emit a function to get the diagnostic text to be
/// used when an assembly operand does not match the expected operand class.
-static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream &OS) {
+static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info,
+ raw_ostream &OS) {
// If the target does not use DiagnosticString for any operands, don't emit
// an unused function.
if (llvm::all_of(Info.Classes, [](const ClassInfo &CI) {
@@ -2415,12 +2422,12 @@ static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream &
<< "MatchResultTy MatchResult) {\n";
OS << " switch (MatchResult) {\n";
- for (const auto &CI: Info.Classes) {
+ for (const auto &CI : Info.Classes) {
if (!CI.DiagnosticString.empty()) {
assert(!CI.DiagnosticType.empty() &&
"DiagnosticString set without DiagnosticType");
- OS << " case " << Info.Target.getName()
- << "AsmParser::Match_" << CI.DiagnosticType << ":\n";
+ OS << " case " << Info.Target.getName() << "AsmParser::Match_"
+ << CI.DiagnosticType << ":\n";
OS << " return \"" << CI.DiagnosticString << "\";\n";
}
}
@@ -2441,7 +2448,7 @@ static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) {
OS << " return MCTargetAsmParser::Match_InvalidOperand;\n";
} else {
OS << " switch (RegisterClass) {\n";
- for (const auto &CI: Info.Classes) {
+ for (const auto &CI : Info.Classes) {
if (CI.isRegisterClass() && !CI.DiagnosticType.empty()) {
OS << " case " << CI.Name << ":\n";
OS << " return " << Info.Target.getName() << "AsmParser::Match_"
@@ -2458,8 +2465,7 @@ static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) {
}
/// emitValidateOperandClass - Emit the function to validate an operand class.
-static void emitValidateOperandClass(AsmMatcherInfo &Info,
- raw_ostream &OS) {
+static void emitValidateOperandClass(AsmMatcherInfo &Info, raw_ostream &OS) {
OS << "static unsigned validateOperandClass(MCParsedAsmOperand &GOp, "
<< "MatchClassKind Kind) {\n";
OS << " " << Info.Target.getName() << "Operand &Operand = ("
@@ -2495,8 +2501,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
OS << " return " << Info.Target.getName() << "AsmParser::Match_"
<< CI.DiagnosticType << ";\n";
OS << " break;\n";
- }
- else
+ } else
OS << " break;\n";
OS << " }\n";
}
@@ -2508,8 +2513,8 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
OS << " switch (Operand.getReg()) {\n";
OS << " default: OpKind = InvalidMatchClass; break;\n";
for (const auto &RC : Info.RegisterClasses)
- OS << " case " << RC.first->getValueAsString("Namespace") << "::"
- << RC.first->getName() << ": OpKind = " << RC.second->Name
+ OS << " case " << RC.first->getValueAsString("Namespace")
+ << "::" << RC.first->getName() << ": OpKind = " << RC.second->Name
<< "; break;\n";
OS << " }\n";
OS << " return isSubclass(OpKind, Kind) ? "
@@ -2676,7 +2681,8 @@ static void emitOperandDiagnosticTypes(AsmMatcherInfo &Info, raw_ostream &OS) {
Types.insert(OpClassEntry.second->DiagnosticType);
}
- if (Types.empty()) return;
+ if (Types.empty())
+ return;
// Now emit the enum entries.
for (StringRef Type : Types)
@@ -2709,7 +2715,7 @@ static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) {
static std::string GetAliasRequiredFeatures(Record *R,
const AsmMatcherInfo &Info) {
- std::vector<Record*> ReqFeatures = R->getValueAsListOfDefs("Predicates");
+ std::vector<Record *> ReqFeatures = R->getValueAsListOfDefs("Predicates");
std::string Result;
if (ReqFeatures.empty())
@@ -2719,8 +2725,9 @@ static std::string GetAliasRequiredFeatures(Record *R,
const SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]);
if (!F)
- PrintFatalError(R->getLoc(), "Predicate '" + ReqFeatures[i]->getName() +
- "' is not marked as an AssemblerPredicate!");
+ PrintFatalError(R->getLoc(),
+ "Predicate '" + ReqFeatures[i]->getName() +
+ "' is not marked as an AssemblerPredicate!");
if (i)
Result += " && ";
@@ -2731,21 +2738,21 @@ static std::string GetAliasRequiredFeatures(Record *R,
return Result;
}
-static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info,
- std::vector<Record*> &Aliases,
- unsigned Indent = 0,
- StringRef AsmParserVariantName = StringRef()){
+static void
+emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info,
+ std::vector<Record *> &Aliases, unsigned Indent = 0,
+ StringRef AsmParserVariantName = StringRef()) {
// Keep track of all the aliases from a mnemonic. Use an std::map so that the
// iteration order of the map is stable.
- std::map<std::string, std::vector<Record*> > AliasesFromMnemonic;
+ std::map<std::string, std::vector<Record *>> AliasesFromMnemonic;
for (Record *R : Aliases) {
// FIXME: Allow AssemblerVariantName to be a comma separated list.
StringRef AsmVariantName = R->getValueAsString("AsmVariantName");
if (AsmVariantName != AsmParserVariantName)
continue;
- AliasesFromMnemonic[R->getValueAsString("FromMnemonic").lower()]
- .push_back(R);
+ AliasesFromMnemonic[R->getValueAsString("FromMnemonic").lower()].push_back(
+ R);
}
if (AliasesFromMnemonic.empty())
return;
@@ -2754,7 +2761,7 @@ static void emitMnemonicAliasVariant(raw_ostream &OS,const AsmMatcherInfo &Info,
// by the string remapper.
std::vector<StringMatcher::StringPair> Cases;
for (const auto &AliasEntry : AliasesFromMnemonic) {
- const std::vector<Record*> &ToVec = AliasEntry.second;
+ const std::vector<Record *> &ToVec = AliasEntry.second;
// Loop through each alias and emit code that handles each case. If there
// are two instructions without predicates, emit an error. If there is one,
@@ -2818,12 +2825,13 @@ static bool emitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info,
if (!MatchPrefix.empty())
return false;
- std::vector<Record*> Aliases =
- Info.getRecords().getAllDerivedDefinitions("MnemonicAlias");
- if (Aliases.empty()) return false;
+ std::vector<Record *> Aliases =
+ Info.getRecords().getAllDerivedDefinitions("MnemonicAlias");
+ if (Aliases.empty())
+ return false;
OS << "static void applyMnemonicAliases(StringRef &Mnemonic, "
- "const FeatureBitset &Features, unsigned VariantID) {\n";
+ "const FeatureBitset &Features, unsigned VariantID) {\n";
OS << " switch (VariantID) {\n";
unsigned VariantCount = Target.getAsmParserVariantCount();
for (unsigned VC = 0; VC != VariantCount; ++VC) {
@@ -2859,17 +2867,15 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
// Emit the static custom operand parsing table;
OS << "namespace {\n";
OS << " struct OperandMatchEntry {\n";
- OS << " " << getMinimalTypeForRange(MaxMnemonicIndex)
- << " Mnemonic;\n";
- OS << " " << getMinimalTypeForRange(MaxMask)
- << " OperandMask;\n";
+ OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) << " Mnemonic;\n";
+ OS << " " << getMinimalTypeForRange(MaxMask) << " OperandMask;\n";
OS << " "
<< getMinimalTypeForRange(
std::distance(Info.Classes.begin(), Info.Classes.end()) +
2 /* Include 'InvalidMatchClass' and 'OptionalMatchClass' */)
<< " Class;\n";
OS << " " << getMinimalTypeForRange(MaxFeaturesIndex)
- << " RequiredFeaturesIdx;\n\n";
+ << " RequiredFeaturesIdx;\n\n";
OS << " StringRef getMnemonic() const {\n";
OS << " return StringRef(MnemonicTable + Mnemonic + 1,\n";
OS << " MnemonicTable[Mnemonic]);\n";
@@ -2903,13 +2909,13 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
// Store a pascal-style length byte in the mnemonic.
std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.lower();
- OS << StringTable.GetOrAddStringOffset(LenMnemonic, false)
- << " /* " << II.Mnemonic << " */, ";
+ OS << StringTable.GetOrAddStringOffset(LenMnemonic, false) << " /* "
+ << II.Mnemonic << " */, ";
OS << OMI.OperandMask;
OS << " /* ";
ListSeparator LS;
- for (int i = 0, e = 31; i !=e; ++i)
+ for (int i = 0, e = 31; i != e; ++i)
if (OMI.OperandMask & (1 << i))
OS << LS << i;
OS << " */, ";
@@ -2958,7 +2964,8 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
// Emit code to get the available features.
OS << " // Get the current feature set.\n";
- OS << " const FeatureBitset &AvailableFeatures = getAvailableFeatures();\n\n";
+ OS << " const FeatureBitset &AvailableFeatures = "
+ "getAvailableFeatures();\n\n";
OS << " // Get the next operand index.\n";
OS << " unsigned NextOpNum = Operands.size()"
@@ -3064,7 +3071,7 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target,
<< "MnemonicSpellCheck(StringRef S, const FeatureBitset &FBS,"
<< " unsigned VariantID) {\n";
if (!VariantCount)
- OS << " return \"\";";
+ OS << " return \"\";";
else {
OS << " const unsigned MaxEditDist = 2;\n";
OS << " std::vector<StringRef> Candidates;\n";
@@ -3112,10 +3119,8 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target,
OS << "\n";
}
-static void emitMnemonicChecker(raw_ostream &OS,
- CodeGenTarget &Target,
- unsigned VariantCount,
- bool HasMnemonicFirst,
+static void emitMnemonicChecker(raw_ostream &OS, CodeGenTarget &Target,
+ unsigned VariantCount, bool HasMnemonicFirst,
bool HasMnemonicAliases) {
OS << "static bool " << Target.getName()
<< "CheckMnemonic(StringRef Mnemonic,\n";
@@ -3125,7 +3130,7 @@ static void emitMnemonicChecker(raw_ostream &OS,
<< "unsigned VariantID) {\n";
if (!VariantCount) {
- OS << " return false;\n";
+ OS << " return false;\n";
} else {
if (HasMnemonicAliases) {
OS << " // Process all MnemonicAliases to remap the mnemonic.\n";
@@ -3232,9 +3237,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
#endif
DEBUG_WITH_TYPE("instruction_info", {
- for (const auto &MI : Info.Matchables)
- MI->dump();
- });
+ for (const auto &MI : Info.Matchables)
+ MI->dump();
+ });
// Check for ambiguous matchables.
DEBUG_WITH_TYPE("ambiguous_instrs", {
@@ -3256,8 +3261,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
}
}
if (NumAmbiguous)
- errs() << "warning: " << NumAmbiguous
- << " ambiguous matchables!\n";
+ errs() << "warning: " << NumAmbiguous << " ambiguous matchables!\n";
});
// Compute the information on the custom operand parsing.
@@ -3275,12 +3279,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "#undef GET_ASSEMBLER_HEADER\n";
OS << " // This should be included into the middle of the declaration of\n";
OS << " // your subclasses implementation of MCTargetAsmParser.\n";
- OS << " FeatureBitset ComputeAvailableFeatures(const FeatureBitset &FB) const;\n";
+ OS << " FeatureBitset ComputeAvailableFeatures(const FeatureBitset &FB) "
+ "const;\n";
if (HasOptionalOperands) {
OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, "
<< "unsigned Opcode,\n"
<< " const OperandVector &Operands,\n"
- << " const SmallBitVector &OptionalOperandsMask);\n";
+ << " const SmallBitVector "
+ "&OptionalOperandsMask);\n";
} else {
OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, "
<< "unsigned Opcode,\n"
@@ -3291,7 +3297,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " unsigned MatchInstructionImpl(const OperandVector &Operands,\n"
<< " MCInst &Inst,\n";
if (ReportMultipleNearMisses)
- OS << " SmallVectorImpl<NearMissInfo> *NearMisses,\n";
+ OS << " SmallVectorImpl<NearMissInfo> "
+ "*NearMisses,\n";
else
OS << " uint64_t &ErrorInfo,\n"
<< " FeatureBitset &MissingFeatures,\n";
@@ -3304,11 +3311,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
<< " bool matchingInlineAsm,\n"
<< " unsigned VariantID = 0) {\n"
<< " FeatureBitset MissingFeatures;\n"
- << " return MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,\n"
+ << " return MatchInstructionImpl(Operands, Inst, ErrorInfo, "
+ "MissingFeatures,\n"
<< " matchingInlineAsm, VariantID);\n"
<< " }\n\n";
-
if (!Info.OperandMatchInfo.empty()) {
OS << " ParseStatus MatchOperandParserImpl(\n";
OS << " OperandVector &Operands,\n";
@@ -3362,9 +3369,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// Generate the convertToMCInst function to convert operands into an MCInst.
// Also, generate the convertToMapAndConstraints function for MS-style inline
// assembly. The latter doesn't actually generate a MCInst.
- unsigned NumConverters = emitConvertFuncs(Target, ClassName, Info.Matchables,
- HasMnemonicFirst,
- HasOptionalOperands, OS);
+ unsigned NumConverters =
+ emitConvertFuncs(Target, ClassName, Info.Matchables, HasMnemonicFirst,
+ HasOptionalOperands, OS);
// Emit the enumeration for classes which participate in matching.
emitMatchClassEnumeration(Target, Info.Classes, OS);
@@ -3406,8 +3413,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// Store a pascal-style length byte in the mnemonic.
std::string LenMnemonic = char(MI->Mnemonic.size()) + MI->Mnemonic.lower();
- MaxMnemonicIndex = std::max(MaxMnemonicIndex,
- StringTable.GetOrAddStringOffset(LenMnemonic, false));
+ MaxMnemonicIndex = std::max(
+ MaxMnemonicIndex, StringTable.GetOrAddStringOffset(LenMnemonic, false));
}
OS << "static const char MnemonicTable[] =\n";
@@ -3476,13 +3483,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// following the mnemonic.
OS << "namespace {\n";
OS << " struct MatchEntry {\n";
- OS << " " << getMinimalTypeForRange(MaxMnemonicIndex)
- << " Mnemonic;\n";
+ OS << " " << getMinimalTypeForRange(MaxMnemonicIndex) << " Mnemonic;\n";
OS << " uint16_t Opcode;\n";
- OS << " " << getMinimalTypeForRange(NumConverters)
- << " ConvertFn;\n";
+ OS << " " << getMinimalTypeForRange(NumConverters) << " ConvertFn;\n";
OS << " " << getMinimalTypeForRange(FeatureBitsets.size())
- << " RequiredFeaturesIdx;\n";
+ << " RequiredFeaturesIdx;\n";
OS << " "
<< getMinimalTypeForRange(
std::distance(Info.Classes.begin(), Info.Classes.end()) +
@@ -3524,9 +3529,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
std::string LenMnemonic =
char(MI->Mnemonic.size()) + MI->Mnemonic.lower();
OS << " { " << StringTable.GetOrAddStringOffset(LenMnemonic, false)
- << " /* " << MI->Mnemonic << " */, "
- << Target.getInstNamespace() << "::"
- << MI->getResultInst()->TheDef->getName() << ", "
+ << " /* " << MI->Mnemonic << " */, " << Target.getInstNamespace()
+ << "::" << MI->getResultInst()->TheDef->getName() << ", "
<< MI->ConversionFnKind << ", ";
// Write the required features mask.
@@ -3563,17 +3567,17 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
if (!ReportMultipleNearMisses) {
OS << " // Eliminate obvious mismatches.\n";
- OS << " if (Operands.size() > "
- << (MaxNumOperands + HasMnemonicFirst) << ") {\n";
- OS << " ErrorInfo = "
- << (MaxNumOperands + HasMnemonicFirst) << ";\n";
+ OS << " if (Operands.size() > " << (MaxNumOperands + HasMnemonicFirst)
+ << ") {\n";
+ OS << " ErrorInfo = " << (MaxNumOperands + HasMnemonicFirst) << ";\n";
OS << " return Match_InvalidOperand;\n";
OS << " }\n\n";
}
// Emit code to get the available features.
OS << " // Get the current feature set.\n";
- OS << " const FeatureBitset &AvailableFeatures = getAvailableFeatures();\n\n";
+ OS << " const FeatureBitset &AvailableFeatures = "
+ "getAvailableFeatures();\n\n";
OS << " // Get the instruction mnemonic, which is the first token.\n";
if (HasMnemonicFirst) {
@@ -3632,7 +3636,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
"std::equal_range(Start, End, Mnemonic.lower(), LessOpcode());\n\n";
}
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"AsmMatcher: found \" <<\n"
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"AsmMatcher: found \" "
+ "<<\n"
<< " std::distance(MnemonicRange.first, MnemonicRange.second) <<\n"
<< " \" encodings with mnemonic '\" << Mnemonic << \"'\\n\");\n\n";
@@ -3647,15 +3652,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
"FeatureBitsets[it->RequiredFeaturesIdx];\n";
OS << " bool HasRequiredFeatures =\n";
OS << " (AvailableFeatures & RequiredFeatures) == RequiredFeatures;\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Trying to match opcode \"\n";
- OS << " << MII.getName(it->Opcode) << \"\\n\");\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Trying to match "
+ "opcode \"\n";
+ OS << " << MII.getName(it->Opcode) "
+ "<< \"\\n\");\n";
if (ReportMultipleNearMisses) {
- OS << " // Some state to record ways in which this instruction did not match.\n";
+ OS << " // Some state to record ways in which this instruction did not "
+ "match.\n";
OS << " NearMissInfo OperandNearMiss = NearMissInfo::getSuccess();\n";
OS << " NearMissInfo FeaturesNearMiss = NearMissInfo::getSuccess();\n";
- OS << " NearMissInfo EarlyPredicateNearMiss = NearMissInfo::getSuccess();\n";
- OS << " NearMissInfo LatePredicateNearMiss = NearMissInfo::getSuccess();\n";
+ OS << " NearMissInfo EarlyPredicateNearMiss = "
+ "NearMissInfo::getSuccess();\n";
+ OS << " NearMissInfo LatePredicateNearMiss = "
+ "NearMissInfo::getSuccess();\n";
OS << " bool MultipleInvalidOperands = false;\n";
}
@@ -3676,30 +3686,39 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " auto Formal = "
<< "static_cast<MatchClassKind>(it->Classes[FormalIdx]);\n";
OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n";
- OS << " dbgs() << \" Matching formal operand class \" << getMatchClassName(Formal)\n";
- OS << " << \" against actual operand at index \" << ActualIdx);\n";
+ OS << " dbgs() << \" Matching formal operand class \" "
+ "<< getMatchClassName(Formal)\n";
+ OS << " << \" against actual operand at index \" "
+ "<< ActualIdx);\n";
OS << " if (ActualIdx < Operands.size())\n";
OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \" (\";\n";
- OS << " Operands[ActualIdx]->print(dbgs()); dbgs() << \"): \");\n";
+ OS << " Operands[ActualIdx]->print(dbgs()); dbgs() << "
+ "\"): \");\n";
OS << " else\n";
OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \": \");\n";
OS << " if (ActualIdx >= Operands.size()) {\n";
OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"actual operand "
"index out of range\\n\");\n";
if (ReportMultipleNearMisses) {
- OS << " bool ThisOperandValid = (Formal == " <<"InvalidMatchClass) || "
- "isSubclass(Formal, OptionalMatchClass);\n";
+ OS << " bool ThisOperandValid = (Formal == "
+ << "InvalidMatchClass) || "
+ "isSubclass(Formal, OptionalMatchClass);\n";
OS << " if (!ThisOperandValid) {\n";
OS << " if (!OperandNearMiss) {\n";
OS << " // Record info about match failure for later use.\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"recording too-few-operands near miss\\n\");\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"recording "
+ "too-few-operands near miss\\n\");\n";
OS << " OperandNearMiss =\n";
- OS << " NearMissInfo::getTooFewOperands(Formal, it->Opcode);\n";
- OS << " } else if (OperandNearMiss.getKind() != NearMissInfo::NearMissTooFewOperands) {\n";
- OS << " // If more than one operand is invalid, give up on this match entry.\n";
+ OS << " NearMissInfo::getTooFewOperands(Formal, "
+ "it->Opcode);\n";
+ OS << " } else if (OperandNearMiss.getKind() != "
+ "NearMissInfo::NearMissTooFewOperands) {\n";
+ OS << " // If more than one operand is invalid, give up on this "
+ "match entry.\n";
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
- OS << " dbgs() << \"second invalid operand, giving up on this opcode\\n\");\n";
+ OS << " dbgs() << \"second invalid operand, giving up on "
+ "this opcode\\n\");\n";
OS << " MultipleInvalidOperands = true;\n";
OS << " break;\n";
OS << " }\n";
@@ -3731,17 +3750,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " unsigned Diag = validateOperandClass(Actual, Formal);\n";
OS << " if (Diag == Match_Success) {\n";
OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n";
- OS << " dbgs() << \"match success using generic matcher\\n\");\n";
+ OS << " dbgs() << \"match success using generic "
+ "matcher\\n\");\n";
OS << " ++ActualIdx;\n";
OS << " continue;\n";
OS << " }\n";
OS << " // If the generic handler indicates an invalid operand\n";
OS << " // failure, check for a special case.\n";
OS << " if (Diag != Match_Success) {\n";
- OS << " unsigned TargetDiag = validateTargetOperandClass(Actual, Formal);\n";
+ OS << " unsigned TargetDiag = validateTargetOperandClass(Actual, "
+ "Formal);\n";
OS << " if (TargetDiag == Match_Success) {\n";
OS << " DEBUG_WITH_TYPE(\"asm-matcher\",\n";
- OS << " dbgs() << \"match success using target matcher\\n\");\n";
+ OS << " dbgs() << \"match success using target "
+ "matcher\\n\");\n";
OS << " ++ActualIdx;\n";
OS << " continue;\n";
OS << " }\n";
@@ -3758,38 +3780,46 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
if (HasOptionalOperands) {
OS << " OptionalOperandsMask.set(FormalIdx);\n";
}
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"ignoring optional operand\\n\");\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"ignoring "
+ "optional operand\\n\");\n";
OS << " continue;\n";
OS << " }\n";
if (ReportMultipleNearMisses) {
OS << " if (!OperandNearMiss) {\n";
- OS << " // If this is the first invalid operand we have seen, record some\n";
+ OS << " // If this is the first invalid operand we have seen, "
+ "record some\n";
OS << " // information about it.\n";
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
OS << " dbgs()\n";
- OS << " << \"operand match failed, recording near-miss with diag code \"\n";
+ OS << " << \"operand match failed, recording near-miss with "
+ "diag code \"\n";
OS << " << Diag << \"\\n\");\n";
OS << " OperandNearMiss =\n";
- OS << " NearMissInfo::getMissedOperand(Diag, Formal, it->Opcode, ActualIdx);\n";
+ OS << " NearMissInfo::getMissedOperand(Diag, Formal, "
+ "it->Opcode, ActualIdx);\n";
OS << " ++ActualIdx;\n";
OS << " } else {\n";
- OS << " // If more than one operand is invalid, give up on this match entry.\n";
+ OS << " // If more than one operand is invalid, give up on this "
+ "match entry.\n";
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
- OS << " dbgs() << \"second operand mismatch, skipping this opcode\\n\");\n";
+ OS << " dbgs() << \"second operand mismatch, skipping this "
+ "opcode\\n\");\n";
OS << " MultipleInvalidOperands = true;\n";
OS << " break;\n";
OS << " }\n";
OS << " }\n\n";
} else {
- OS << " // If this operand is broken for all of the instances of this\n";
+ OS << " // If this operand is broken for all of the instances of "
+ "this\n";
OS << " // mnemonic, keep track of it so we can report loc info.\n";
OS << " // If we already had a match that only failed due to a\n";
OS << " // target predicate, that diagnostic is preferred.\n";
OS << " if (!HadMatchOtherThanPredicate &&\n";
- OS << " (it == MnemonicRange.first || ErrorInfo <= ActualIdx)) {\n";
+ OS << " (it == MnemonicRange.first || ErrorInfo <= ActualIdx)) "
+ "{\n";
OS << " if (HasRequiredFeatures && (ErrorInfo != ActualIdx || Diag "
"!= Match_InvalidOperand))\n";
OS << " RetCode = Diag;\n";
@@ -3805,8 +3835,10 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " if (MultipleInvalidOperands) {\n";
else
OS << " if (!OperandsValid) {\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n";
- OS << " \"operand mismatches, ignoring \"\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: "
+ "multiple \"\n";
+ OS << " \"operand mismatches, "
+ "ignoring \"\n";
OS << " \"this opcode\\n\");\n";
OS << " continue;\n";
OS << " }\n";
@@ -3817,13 +3849,16 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " HadMatchOtherThanFeatures = true;\n";
OS << " FeatureBitset NewMissingFeatures = RequiredFeatures & "
"~AvailableFeatures;\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Missing target features:\";\n";
- OS << " for (unsigned I = 0, E = NewMissingFeatures.size(); I != E; ++I)\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Missing target "
+ "features:\";\n";
+ OS << " for (unsigned I = 0, E = "
+ "NewMissingFeatures.size(); I != E; ++I)\n";
OS << " if (NewMissingFeatures[I])\n";
OS << " dbgs() << ' ' << I;\n";
OS << " dbgs() << \"\\n\");\n";
if (ReportMultipleNearMisses) {
- OS << " FeaturesNearMiss = NearMissInfo::getMissedFeature(NewMissingFeatures);\n";
+ OS << " FeaturesNearMiss = "
+ "NearMissInfo::getMissedFeature(NewMissingFeatures);\n";
} else {
OS << " if (NewMissingFeatures.count() <=\n"
" MissingFeatures.count())\n";
@@ -3848,10 +3883,12 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
<< " Inst.clear();\n";
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
- OS << " dbgs() << \"Early target match predicate failed with diag code \"\n";
+ OS << " dbgs() << \"Early target match predicate failed with diag "
+ "code \"\n";
OS << " << MatchResult << \"\\n\");\n";
if (ReportMultipleNearMisses) {
- OS << " EarlyPredicateNearMiss = NearMissInfo::getMissedPredicate(MatchResult);\n";
+ OS << " EarlyPredicateNearMiss = "
+ "NearMissInfo::getMissedPredicate(MatchResult);\n";
} else {
OS << " RetCode = MatchResult;\n"
<< " HadMatchOtherThanPredicate = true;\n"
@@ -3860,20 +3897,27 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " }\n\n";
if (ReportMultipleNearMisses) {
- OS << " // If we did not successfully match the operands, then we can't convert to\n";
+ OS << " // If we did not successfully match the operands, then we can't "
+ "convert to\n";
OS << " // an MCInst, so bail out on this instruction variant now.\n";
OS << " if (OperandNearMiss) {\n";
- OS << " // If the operand mismatch was the only problem, reprrt it as a near-miss.\n";
- OS << " if (NearMisses && !FeaturesNearMiss && !EarlyPredicateNearMiss) {\n";
+ OS << " // If the operand mismatch was the only problem, reprrt it as "
+ "a near-miss.\n";
+ OS << " if (NearMisses && !FeaturesNearMiss && "
+ "!EarlyPredicateNearMiss) {\n";
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
OS << " dbgs()\n";
- OS << " << \"Opcode result: one mismatched operand, adding near-miss\\n\");\n";
+ OS << " << \"Opcode result: one mismatched operand, adding "
+ "near-miss\\n\");\n";
OS << " NearMisses->push_back(OperandNearMiss);\n";
OS << " } else {\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n";
- OS << " \"types of mismatch, so not \"\n";
- OS << " \"reporting near-miss\\n\");\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: "
+ "multiple \"\n";
+ OS << " \"types of "
+ "mismatch, so not \"\n";
+ OS << " \"reporting "
+ "near-miss\\n\");\n";
OS << " }\n";
OS << " continue;\n";
OS << " }\n\n";
@@ -3905,11 +3949,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
<< " if ((MatchResult = checkTargetMatchPredicate(Inst)) !="
<< " Match_Success) {\n"
<< " DEBUG_WITH_TYPE(\"asm-matcher\",\n"
- << " dbgs() << \"Target match predicate failed with diag code \"\n"
+ << " dbgs() << \"Target match predicate failed with "
+ "diag code \"\n"
<< " << MatchResult << \"\\n\");\n"
<< " Inst.clear();\n";
if (ReportMultipleNearMisses) {
- OS << " LatePredicateNearMiss = NearMissInfo::getMissedPredicate(MatchResult);\n";
+ OS << " LatePredicateNearMiss = "
+ "NearMissInfo::getMissedPredicate(MatchResult);\n";
} else {
OS << " RetCode = MatchResult;\n"
<< " HadMatchOtherThanPredicate = true;\n"
@@ -3923,10 +3969,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " (int)(bool)EarlyPredicateNearMiss +\n";
OS << " (int)(bool)LatePredicateNearMiss);\n";
OS << " if (NumNearMisses == 1) {\n";
- OS << " // We had exactly one type of near-miss, so add that to the list.\n";
- OS << " assert(!OperandNearMiss && \"OperandNearMiss was handled earlier\");\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: found one type of \"\n";
- OS << " \"mismatch, so reporting a \"\n";
+ OS << " // We had exactly one type of near-miss, so add that to the "
+ "list.\n";
+ OS << " assert(!OperandNearMiss && \"OperandNearMiss was handled "
+ "earlier\");\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: "
+ "found one type of \"\n";
+ OS << " \"mismatch, so "
+ "reporting a \"\n";
OS << " \"near-miss\\n\");\n";
OS << " if (NearMisses && FeaturesNearMiss)\n";
OS << " NearMisses->push_back(FeaturesNearMiss);\n";
@@ -3937,10 +3987,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "\n";
OS << " continue;\n";
OS << " } else if (NumNearMisses > 1) {\n";
- OS << " // This instruction missed in more than one way, so ignore it.\n";
- OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: multiple \"\n";
- OS << " \"types of mismatch, so not \"\n";
- OS << " \"reporting near-miss\\n\");\n";
+ OS << " // This instruction missed in more than one way, so ignore "
+ "it.\n";
+ OS << " DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"Opcode result: "
+ "multiple \"\n";
+ OS << " \"types of mismatch, "
+ "so not \"\n";
+ OS << " \"reporting "
+ "near-miss\\n\");\n";
OS << " continue;\n";
OS << " }\n";
}
@@ -3952,7 +4006,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
if (HasDeprecation) {
OS << " std::string Info;\n";
- OS << " if (!getParser().getTargetParser().getTargetOptions().MCNoDeprecatedWarn &&\n";
+ OS << " if "
+ "(!getParser().getTargetParser().getTargetOptions()."
+ "MCNoDeprecatedWarn &&\n";
OS << " MII.getDeprecatedInfo(Inst, getSTI(), Info)) {\n";
OS << " SMLoc Loc = ((" << Target.getName()
<< "Operand &)*Operands[0]).getStartLoc();\n";
@@ -3969,7 +4025,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " DEBUG_WITH_TYPE(\n";
OS << " \"asm-matcher\",\n";
- OS << " dbgs() << \"Opcode result: complete match, selecting this opcode\\n\");\n";
+ OS << " dbgs() << \"Opcode result: complete match, selecting this "
+ "opcode\\n\");\n";
OS << " return Match_Success;\n";
OS << " }\n\n";
@@ -4002,8 +4059,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "\n#ifdef GET_MNEMONIC_CHECKER\n";
OS << "#undef GET_MNEMONIC_CHECKER\n\n";
- emitMnemonicChecker(OS, Target, VariantCount,
- HasMnemonicFirst, HasMnemonicAliases);
+ emitMnemonicChecker(OS, Target, VariantCount, HasMnemonicFirst,
+ HasMnemonicAliases);
OS << "#endif // GET_MNEMONIC_CHECKER\n\n";
}
diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp
index e0cd5fad3254..c05991fdbb16 100644
--- a/llvm/utils/TableGen/AsmWriterEmitter.cpp
+++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp
@@ -64,6 +64,7 @@ public:
AsmWriterEmitter(RecordKeeper &R);
void run(raw_ostream &o);
+
private:
void EmitGetMnemonic(
raw_ostream &o,
@@ -84,9 +85,9 @@ private:
} // end anonymous namespace
-static void PrintCases(std::vector<std::pair<std::string,
- AsmWriterOperand>> &OpsToPrint, raw_ostream &O,
- bool PassSubtarget) {
+static void
+PrintCases(std::vector<std::pair<std::string, AsmWriterOperand>> &OpsToPrint,
+ raw_ostream &O, bool PassSubtarget) {
O << " case " << OpsToPrint.back().first << ":";
AsmWriterOperand TheOp = OpsToPrint.back().second;
OpsToPrint.pop_back();
@@ -94,9 +95,9 @@ static void PrintCases(std::vector<std::pair<std::string,
// Check to see if any other operands are identical in this list, and if so,
// emit a case label for them.
for (unsigned i = OpsToPrint.size(); i != 0; --i)
- if (OpsToPrint[i-1].second == TheOp) {
- O << "\n case " << OpsToPrint[i-1].first << ":";
- OpsToPrint.erase(OpsToPrint.begin()+i-1);
+ if (OpsToPrint[i - 1].second == TheOp) {
+ O << "\n case " << OpsToPrint[i - 1].first << ":";
+ OpsToPrint.erase(OpsToPrint.begin() + i - 1);
}
// Finally, emit the code.
@@ -106,33 +107,33 @@ static void PrintCases(std::vector<std::pair<std::string,
/// EmitInstructions - Emit the last instruction in the vector and any other
/// instructions that are suitably similar to it.
-static void EmitInstructions(std::vector<AsmWriterInst> &Insts,
- raw_ostream &O, bool PassSubtarget) {
+static void EmitInstructions(std::vector<AsmWriterInst> &Insts, raw_ostream &O,
+ bool PassSubtarget) {
AsmWriterInst FirstInst = Insts.back();
Insts.pop_back();
std::vector<AsmWriterInst> SimilarInsts;
unsigned DifferingOperand = ~0;
for (unsigned i = Insts.size(); i != 0; --i) {
- unsigned DiffOp = Insts[i-1].MatchesAllButOneOp(FirstInst);
+ unsigned DiffOp = Insts[i - 1].MatchesAllButOneOp(FirstInst);
if (DiffOp != ~1U) {
- if (DifferingOperand == ~0U) // First match!
+ if (DifferingOperand == ~0U) // First match!
DifferingOperand = DiffOp;
// If this differs in the same operand as the rest of the instructions in
// this class, move it to the SimilarInsts list.
if (DifferingOperand == DiffOp || DiffOp == ~0U) {
- SimilarInsts.push_back(Insts[i-1]);
- Insts.erase(Insts.begin()+i-1);
+ SimilarInsts.push_back(Insts[i - 1]);
+ Insts.erase(Insts.begin() + i - 1);
}
}
}
- O << " case " << FirstInst.CGI->Namespace << "::"
- << FirstInst.CGI->TheDef->getName() << ":\n";
+ O << " case " << FirstInst.CGI->Namespace
+ << "::" << FirstInst.CGI->TheDef->getName() << ":\n";
for (const AsmWriterInst &AWI : SimilarInsts)
- O << " case " << AWI.CGI->Namespace << "::"
- << AWI.CGI->TheDef->getName() << ":\n";
+ O << " case " << AWI.CGI->Namespace << "::" << AWI.CGI->TheDef->getName()
+ << ":\n";
for (unsigned i = 0, e = FirstInst.Operands.size(); i != e; ++i) {
if (i != DifferingOperand) {
// If the operand is the same for all instructions, just print it.
@@ -143,14 +144,15 @@ static void EmitInstructions(std::vector<AsmWriterInst> &Insts,
O << " switch (MI->getOpcode()) {\n";
O << " default: llvm_unreachable(\"Unexpected opcode.\");\n";
std::vector<std::pair<std::string, AsmWriterOperand>> OpsToPrint;
- OpsToPrint.push_back(std::make_pair(FirstInst.CGI->Namespace.str() + "::" +
- FirstInst.CGI->TheDef->getName().str(),
- FirstInst.Operands[i]));
+ OpsToPrint.push_back(
+ std::make_pair(FirstInst.CGI->Namespace.str() +
+ "::" + FirstInst.CGI->TheDef->getName().str(),
+ FirstInst.Operands[i]));
for (const AsmWriterInst &AWI : SimilarInsts) {
- OpsToPrint.push_back(std::make_pair(AWI.CGI->Namespace.str()+"::" +
- AWI.CGI->TheDef->getName().str(),
- AWI.Operands[i]));
+ OpsToPrint.push_back(std::make_pair(
+ AWI.CGI->Namespace.str() + "::" + AWI.CGI->TheDef->getName().str(),
+ AWI.Operands[i]));
}
std::reverse(OpsToPrint.begin(), OpsToPrint.end());
while (!OpsToPrint.empty())
@@ -162,11 +164,10 @@ static void EmitInstructions(std::vector<AsmWriterInst> &Insts,
O << " break;\n";
}
-void AsmWriterEmitter::
-FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
- std::vector<std::vector<unsigned>> &InstIdxs,
- std::vector<unsigned> &InstOpsUsed,
- bool PassSubtarget) const {
+void AsmWriterEmitter::FindUniqueOperandCommands(
+ std::vector<std::string> &UniqueOperandCommands,
+ std::vector<std::vector<unsigned>> &InstIdxs,
+ std::vector<unsigned> &InstOpsUsed, bool PassSubtarget) const {
// This vector parallels UniqueOperandCommands, keeping track of which
// instructions each case are used for. It is a comma separated string of
// enums.
@@ -177,9 +178,10 @@ FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
for (size_t i = 0, e = Instructions.size(); i != e; ++i) {
const AsmWriterInst &Inst = Instructions[i];
if (Inst.Operands.empty())
- continue; // Instruction already done.
+ continue; // Instruction already done.
- std::string Command = " "+Inst.Operands[0].getCode(PassSubtarget)+"\n";
+ std::string Command =
+ " " + Inst.Operands[0].getCode(PassSubtarget) + "\n";
// Check to see if we already have 'Command' in UniqueOperandCommands.
// If not, add it.
@@ -203,12 +205,12 @@ FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
// For each entry of UniqueOperandCommands, there is a set of instructions
// that uses it. If the next command of all instructions in the set are
// identical, fold it into the command.
- for (size_t CommandIdx = 0, e = UniqueOperandCommands.size();
- CommandIdx != e; ++CommandIdx) {
+ for (size_t CommandIdx = 0, e = UniqueOperandCommands.size(); CommandIdx != e;
+ ++CommandIdx) {
const auto &Idxs = InstIdxs[CommandIdx];
- for (unsigned Op = 1; ; ++Op) {
+ for (unsigned Op = 1;; ++Op) {
// Find the first instruction in the set.
const AsmWriterInst &FirstInst = Instructions[Idxs.front()];
// If this instruction has no more operands, we isn't anything to merge
@@ -227,8 +229,8 @@ FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
// Okay, everything in this command set has the same next operand. Add it
// to UniqueOperandCommands and remember that it was consumed.
- std::string Command = " " +
- FirstInst.Operands[Op].getCode(PassSubtarget) + "\n";
+ std::string Command =
+ " " + FirstInst.Operands[Op].getCode(PassSubtarget) + "\n";
UniqueOperandCommands[CommandIdx] += Command;
InstOpsUsed[CommandIdx]++;
@@ -239,35 +241,58 @@ FindUniqueOperandCommands(std::vector<std::string> &UniqueOperandCommands,
for (unsigned i = 0, e = InstrsForCase.size(); i != e; ++i) {
std::string Instrs = InstrsForCase[i];
if (Instrs.size() > 70) {
- Instrs.erase(Instrs.begin()+70, Instrs.end());
+ Instrs.erase(Instrs.begin() + 70, Instrs.end());
Instrs += "...";
}
if (!Instrs.empty())
- UniqueOperandCommands[i] = " // " + Instrs + "\n" +
- UniqueOperandCommands[i];
+ UniqueOperandCommands[i] =
+ " // " + Instrs + "\n" + UniqueOperandCommands[i];
}
}
static void UnescapeString(std::string &Str) {
for (unsigned i = 0; i != Str.size(); ++i) {
- if (Str[i] == '\\' && i != Str.size()-1) {
- switch (Str[i+1]) {
- default: continue; // Don't execute the code after the switch.
- case 'a': Str[i] = '\a'; break;
- case 'b': Str[i] = '\b'; break;
- case 'e': Str[i] = 27; break;
- case 'f': Str[i] = '\f'; break;
- case 'n': Str[i] = '\n'; break;
- case 'r': Str[i] = '\r'; break;
- case 't': Str[i] = '\t'; break;
- case 'v': Str[i] = '\v'; break;
- case '"': Str[i] = '\"'; break;
- case '\'': Str[i] = '\''; break;
- case '\\': Str[i] = '\\'; break;
+ if (Str[i] == '\\' && i != Str.size() - 1) {
+ switch (Str[i + 1]) {
+ default:
+ continue; // Don't execute the code after the switch.
+ case 'a':
+ Str[i] = '\a';
+ break;
+ case 'b':
+ Str[i] = '\b';
+ break;
+ case 'e':
+ Str[i] = 27;
+ break;
+ case 'f':
+ Str[i] = '\f';
+ break;
+ case 'n':
+ Str[i] = '\n';
+ break;
+ case 'r':
+ Str[i] = '\r';
+ break;
+ case 't':
+ Str[i] = '\t';
+ break;
+ case 'v':
+ Str[i] = '\v';
+ break;
+ case '"':
+ Str[i] = '\"';
+ break;
+ case '\'':
+ Str[i] = '\'';
+ break;
+ case '\\':
+ Str[i] = '\\';
+ break;
}
// Nuke the second character.
- Str.erase(Str.begin()+i+1);
+ Str.erase(Str.begin() + i + 1);
}
}
}
@@ -281,14 +306,19 @@ static void UnescapeString(std::string &Str) {
/// causes non-standard escape character warnings.
static void UnescapeAliasString(std::string &Str) {
for (unsigned i = 0; i != Str.size(); ++i) {
- if (Str[i] == '\\' && i != Str.size()-1) {
- switch (Str[i+1]) {
- default: continue; // Don't execute the code after the switch.
- case '{': Str[i] = '{'; break;
- case '}': Str[i] = '}'; break;
+ if (Str[i] == '\\' && i != Str.size() - 1) {
+ switch (Str[i + 1]) {
+ default:
+ continue; // Don't execute the code after the switch.
+ case '{':
+ Str[i] = '{';
+ break;
+ case '}':
+ Str[i] = '}';
+ break;
}
// Nuke the second character.
- Str.erase(Str.begin()+i+1);
+ Str.erase(Str.begin() + i + 1);
}
}
}
@@ -318,8 +348,7 @@ void AsmWriterEmitter::EmitGetMnemonic(
// Add all strings to the string table upfront so it can generate an optimized
// representation.
for (AsmWriterInst &AWI : Instructions) {
- if (AWI.Operands[0].OperandType ==
- AsmWriterOperand::isLiteralTextOperand &&
+ if (AWI.Operands[0].OperandType == AsmWriterOperand::isLiteralTextOperand &&
!AWI.Operands[0].Str.empty()) {
std::string Str = AWI.Operands[0].Str;
UnescapeString(Str);
@@ -347,7 +376,7 @@ void AsmWriterEmitter::EmitGetMnemonic(
}
// Bias offset by one since we want 0 as a sentinel.
- OpcodeInfo[AWI.CGIIndex] = Idx+1;
+ OpcodeInfo[AWI.CGIIndex] = Idx + 1;
}
// Figure out how many bits we used for the string index.
@@ -365,7 +394,8 @@ void AsmWriterEmitter::EmitGetMnemonic(
NumInstOpsHandled, PassSubtarget);
// If we ran out of operands to print, we're done.
- if (UniqueOperandCommands.empty()) break;
+ if (UniqueOperandCommands.empty())
+ break;
// Compute the number of bits we need to represent these cases, this is
// ceil(log2(numentries)).
@@ -383,14 +413,14 @@ void AsmWriterEmitter::EmitGetMnemonic(
unsigned NumOps = NumInstOpsHandled[i];
for (unsigned Idx : InstIdxs[i]) {
OpcodeInfo[Instructions[Idx].CGIIndex] |=
- (uint64_t)i << (OpcodeInfoBits-BitsLeft);
+ (uint64_t)i << (OpcodeInfoBits - BitsLeft);
// Remove the info about this operand from the instruction.
AsmWriterInst &Inst = Instructions[Idx];
if (!Inst.Operands.empty()) {
assert(NumOps <= Inst.Operands.size() &&
"Can't remove this many ops!");
Inst.Operands.erase(Inst.Operands.begin(),
- Inst.Operands.begin()+NumOps);
+ Inst.Operands.begin() + NumOps);
}
}
}
@@ -487,7 +517,7 @@ void AsmWriterEmitter::EmitPrintInstruction(
<< " assert(Bits != 0 && \"Cannot print this instruction.\");\n";
// Output the table driven operand information.
- BitsLeft = OpcodeInfoBits-AsmStrBits;
+ BitsLeft = OpcodeInfoBits - AsmStrBits;
for (unsigned i = 0, e = TableDrivenOperandPrinters.size(); i != e; ++i) {
std::vector<std::string> &Commands = TableDrivenOperandPrinters[i];
@@ -497,25 +527,21 @@ void AsmWriterEmitter::EmitPrintInstruction(
assert(NumBits <= BitsLeft && "consistency error");
// Emit code to extract this field from Bits.
- O << "\n // Fragment " << i << " encoded into " << NumBits
- << " bits for " << Commands.size() << " unique commands.\n";
+ O << "\n // Fragment " << i << " encoded into " << NumBits << " bits for "
+ << Commands.size() << " unique commands.\n";
if (Commands.size() == 2) {
// Emit two possibilitys with if/else.
- O << " if ((Bits >> "
- << (OpcodeInfoBits-BitsLeft) << ") & "
- << ((1 << NumBits)-1) << ") {\n"
- << Commands[1]
- << " } else {\n"
- << Commands[0]
- << " }\n\n";
+ O << " if ((Bits >> " << (OpcodeInfoBits - BitsLeft) << ") & "
+ << ((1 << NumBits) - 1) << ") {\n"
+ << Commands[1] << " } else {\n"
+ << Commands[0] << " }\n\n";
} else if (Commands.size() == 1) {
// Emit a single possibility.
O << Commands[0] << "\n\n";
} else {
- O << " switch ((Bits >> "
- << (OpcodeInfoBits-BitsLeft) << ") & "
- << ((1 << NumBits)-1) << ") {\n"
+ O << " switch ((Bits >> " << (OpcodeInfoBits - BitsLeft) << ") & "
+ << ((1 << NumBits) - 1) << ") {\n"
<< " default: llvm_unreachable(\"Invalid command number.\");\n";
// Print out all the cases.
@@ -537,7 +563,6 @@ void AsmWriterEmitter::EmitPrintInstruction(
// elements in the vector.
std::reverse(Instructions.begin(), Instructions.end());
-
// Now that we've emitted all of the operand info that fit into 64 bits, emit
// information for those instructions that are left. This is a less dense
// encoding, but we expect the main 64-bit table to handle the majority of
@@ -572,22 +597,21 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
AsmName = std::string(Reg.getName());
} else {
// Make sure the register has an alternate name for this index.
- std::vector<Record*> AltNameList =
- Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices");
+ std::vector<Record *> AltNameList =
+ Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices");
unsigned Idx = 0, e;
for (e = AltNameList.size();
- Idx < e && (AltNameList[Idx]->getName() != AltName);
- ++Idx)
+ Idx < e && (AltNameList[Idx]->getName() != AltName); ++Idx)
;
// If the register has an alternate name for this index, use it.
// Otherwise, leave it empty as an error flag.
if (Idx < e) {
std::vector<StringRef> AltNames =
- Reg.TheDef->getValueAsListOfStrings("AltNames");
+ Reg.TheDef->getValueAsListOfStrings("AltNames");
if (AltNames.size() <= Idx)
PrintFatalError(Reg.TheDef->getLoc(),
"Register definition missing alt name for '" +
- AltName + "'.");
+ AltName + "'.");
AsmName = std::string(AltNames[Idx]);
}
}
@@ -613,15 +637,17 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
Record *AsmWriter = Target.getAsmWriter();
StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
const auto &Registers = Target.getRegBank().getRegisters();
- const std::vector<Record*> &AltNameIndices = Target.getRegAltNameIndices();
+ const std::vector<Record *> &AltNameIndices = Target.getRegAltNameIndices();
bool hasAltNames = AltNameIndices.size() > 1;
StringRef Namespace = Registers.front().TheDef->getValueAsString("Namespace");
- O <<
- "\n\n/// getRegisterName - This method is automatically generated by tblgen\n"
- "/// from the register set description. This returns the assembler name\n"
- "/// for the specified register.\n"
- "const char *" << Target.getName() << ClassName << "::";
+ O << "\n\n/// getRegisterName - This method is automatically generated by "
+ "tblgen\n"
+ "/// from the register set description. This returns the assembler "
+ "name\n"
+ "/// for the specified register.\n"
+ "const char *"
+ << Target.getName() << ClassName << "::";
if (hasAltNames)
O << "\ngetRegisterName(MCRegister Reg, unsigned AltIdx) {\n";
else
@@ -695,8 +721,7 @@ public:
void addOperand(StringRef Op, int OpIdx, int PrintMethodIdx = -1) {
assert(OpIdx >= 0 && OpIdx < 0xFE && "Idx out of range");
- assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF &&
- "Idx out of range");
+ assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF && "Idx out of range");
OpMap[Op] = std::make_pair(OpIdx, PrintMethodIdx);
}
@@ -791,7 +816,7 @@ namespace {
struct AliasPriorityComparator {
typedef std::pair<CodeGenInstAlias, int> ValueType;
bool operator()(const ValueType &LHS, const ValueType &RHS) const {
- if (LHS.second == RHS.second) {
+ if (LHS.second == RHS.second) {
// We don't actually care about the order, but for consistency it
// shouldn't depend on pointer comparisons.
return LessRecordByID()(LHS.first.TheDef, RHS.first.TheDef);
@@ -819,8 +844,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
unsigned Variant = AsmWriter->getValueAsInt("Variant");
bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget");
- std::vector<Record*> AllInstAliases =
- Records.getAllDerivedDefinitions("InstAlias");
+ std::vector<Record *> AllInstAliases =
+ Records.getAllDerivedDefinitions("InstAlias");
// Create a map from the qualified name to a list of potential matches.
typedef std::set<std::pair<CodeGenInstAlias, int>, AliasPriorityComparator>
@@ -843,8 +868,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
std::vector<std::pair<std::string, bool>> PrintMethods;
// A list of MCOperandPredicates for all operands in use, and the reverse map
- std::vector<const Record*> MCOpPredicates;
- DenseMap<const Record*, unsigned> MCOpPredicateMap;
+ std::vector<const Record *> MCOpPredicates;
+ DenseMap<const Record *, unsigned> MCOpPredicateMap;
for (auto &Aliases : AliasMap) {
// Collection of instruction alias rules. May contain ambiguous rules.
@@ -854,8 +879,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
const CodeGenInstAlias &CGA = Alias.first;
unsigned LastOpNo = CGA.ResultInstOperandIndex.size();
std::string FlatInstAsmString =
- CodeGenInstruction::FlattenAsmStringVariants(CGA.ResultInst->AsmString,
- Variant);
+ CodeGenInstruction::FlattenAsmStringVariants(
+ CGA.ResultInst->AsmString, Variant);
unsigned NumResultOps = CountNumOperands(FlatInstAsmString, Variant);
std::string FlatAliasAsmString =
@@ -881,8 +906,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
unsigned OpNum = Operands.getSubOperandNumber(MIOpNum).first;
if (Operands[OpNum].MINumOperands == 1 &&
Operands[OpNum].getTiedRegister() != -1) {
- // Tied operands of different RegisterClass should be explicit within
- // an instruction's syntax and so cannot be skipped.
+ // Tied operands of different RegisterClass should be explicit
+ // within an instruction's syntax and so cannot be skipped.
int TiedOpNum = Operands[OpNum].getTiedRegister();
if (Operands[OpNum].Rec->getName() ==
Operands[TiedOpNum].Rec->getName()) {
@@ -1083,7 +1108,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
if (It == IAPrinterMap.end())
continue;
std::vector<IAPrinter> &IAPs = It->second;
- std::vector<IAPrinter*> UniqueIAPs;
+ std::vector<IAPrinter *> UniqueIAPs;
// Remove any ambiguous alias rules.
for (auto &LHS : IAPs) {
@@ -1099,7 +1124,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
UniqueIAPs.push_back(&LHS);
}
- if (UniqueIAPs.empty()) continue;
+ if (UniqueIAPs.empty())
+ continue;
unsigned PatternStart = PatternCount;
@@ -1193,7 +1219,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
if (MCOpPredicates.empty())
O.indent(2) << " nullptr,\n";
else
- O.indent(2) << " &" << Target.getName() << ClassName << "ValidateMCOperand,\n";
+ O.indent(2) << " &" << Target.getName() << ClassName
+ << "ValidateMCOperand,\n";
O.indent(2) << "};\n";
O.indent(2) << "const char *AsmString = matchAliasPatterns(MI, "
@@ -1262,21 +1289,22 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
<< " break;\n";
}
O << " }\n";
- }
+ }
O << "}\n\n";
if (!MCOpPredicates.empty()) {
O << "static bool " << Target.getName() << ClassName
<< "ValidateMCOperand(const MCOperand &MCOp,\n"
<< " const MCSubtargetInfo &STI,\n"
- << " unsigned PredicateIndex) {\n"
+ << " unsigned PredicateIndex) {\n"
<< " switch (PredicateIndex) {\n"
<< " default:\n"
<< " llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n"
<< " break;\n";
for (unsigned i = 0; i < MCOpPredicates.size(); ++i) {
- StringRef MCOpPred = MCOpPredicates[i]->getValueAsString("MCOperandPredicate");
+ StringRef MCOpPred =
+ MCOpPredicates[i]->getValueAsString("MCOperandPredicate");
O << " case " << i + 1 << ": {\n"
<< MCOpPred.data() << "\n"
<< " }\n";
diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp
index c9558593e142..1fa609e90005 100644
--- a/llvm/utils/TableGen/AsmWriterInst.cpp
+++ b/llvm/utils/TableGen/AsmWriterInst.cpp
@@ -57,54 +57,55 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex,
std::string::size_type LastEmitted = 0;
while (LastEmitted != AsmString.size()) {
std::string::size_type DollarPos =
- AsmString.find_first_of("$\\", LastEmitted);
- if (DollarPos == std::string::npos) DollarPos = AsmString.size();
+ AsmString.find_first_of("$\\", LastEmitted);
+ if (DollarPos == std::string::npos)
+ DollarPos = AsmString.size();
// Emit a constant string fragment.
if (DollarPos != LastEmitted) {
for (; LastEmitted != DollarPos; ++LastEmitted)
switch (AsmString[LastEmitted]) {
- case '\n':
- AddLiteralString("\\n");
- break;
- case '\t':
- AddLiteralString("\\t");
- break;
- case '"':
- AddLiteralString("\\\"");
- break;
- case '\\':
- AddLiteralString("\\\\");
- break;
- default:
- AddLiteralString(std::string(1, AsmString[LastEmitted]));
- break;
+ case '\n':
+ AddLiteralString("\\n");
+ break;
+ case '\t':
+ AddLiteralString("\\t");
+ break;
+ case '"':
+ AddLiteralString("\\\"");
+ break;
+ case '\\':
+ AddLiteralString("\\\\");
+ break;
+ default:
+ AddLiteralString(std::string(1, AsmString[LastEmitted]));
+ break;
}
} else if (AsmString[DollarPos] == '\\') {
- if (DollarPos+1 != AsmString.size()) {
- if (AsmString[DollarPos+1] == 'n') {
+ if (DollarPos + 1 != AsmString.size()) {
+ if (AsmString[DollarPos + 1] == 'n') {
AddLiteralString("\\n");
- } else if (AsmString[DollarPos+1] == 't') {
+ } else if (AsmString[DollarPos + 1] == 't') {
AddLiteralString("\\t");
- } else if (std::string("${|}\\").find(AsmString[DollarPos+1])
- != std::string::npos) {
- AddLiteralString(std::string(1, AsmString[DollarPos+1]));
+ } else if (std::string("${|}\\").find(AsmString[DollarPos + 1]) !=
+ std::string::npos) {
+ AddLiteralString(std::string(1, AsmString[DollarPos + 1]));
} else {
PrintFatalError(
CGI.TheDef->getLoc(),
"Non-supported escaped character found in instruction '" +
CGI.TheDef->getName() + "'!");
}
- LastEmitted = DollarPos+2;
+ LastEmitted = DollarPos + 2;
continue;
}
- } else if (DollarPos+1 != AsmString.size() &&
- AsmString[DollarPos+1] == '$') {
- AddLiteralString("$"); // "$$" -> $
- LastEmitted = DollarPos+2;
+ } else if (DollarPos + 1 != AsmString.size() &&
+ AsmString[DollarPos + 1] == '$') {
+ AddLiteralString("$"); // "$$" -> $
+ LastEmitted = DollarPos + 2;
} else {
// Get the name of the variable.
- std::string::size_type VarEnd = DollarPos+1;
+ std::string::size_type VarEnd = DollarPos + 1;
// handle ${foo}bar as $foo by detecting whether the character following
// the dollar sign is a curly brace. If so, advance VarEnd and DollarPos
@@ -118,7 +119,8 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex,
while (VarEnd < AsmString.size() && isIdentChar(AsmString[VarEnd]))
++VarEnd;
- StringRef VarName(AsmString.data()+DollarPos+1, VarEnd-DollarPos-1);
+ StringRef VarName(AsmString.data() + DollarPos + 1,
+ VarEnd - DollarPos - 1);
// Modifier - Support ${foo:modifier} syntax, where "modifier" is passed
// into printOperand. Also support ${:feature}, which is passed into
@@ -190,13 +192,14 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex,
/// specified instruction except for one differing operand, return the differing
/// operand number. If more than one operand mismatches, return ~1, otherwise
/// if the instructions are identical return ~0.
-unsigned AsmWriterInst::MatchesAllButOneOp(const AsmWriterInst &Other)const{
- if (Operands.size() != Other.Operands.size()) return ~1;
+unsigned AsmWriterInst::MatchesAllButOneOp(const AsmWriterInst &Other) const {
+ if (Operands.size() != Other.Operands.size())
+ return ~1;
unsigned MismatchOperand = ~0U;
for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
if (Operands[i] != Other.Operands[i]) {
- if (MismatchOperand != ~0U) // Already have one mismatch?
+ if (MismatchOperand != ~0U) // Already have one mismatch?
return ~1U;
MismatchOperand = i;
}
diff --git a/llvm/utils/TableGen/AsmWriterInst.h b/llvm/utils/TableGen/AsmWriterInst.h
index 9c93e82b611b..f0ebf799d910 100644
--- a/llvm/utils/TableGen/AsmWriterInst.h
+++ b/llvm/utils/TableGen/AsmWriterInst.h
@@ -20,88 +20,88 @@
#include <vector>
namespace llvm {
- class CodeGenInstruction;
-
- struct AsmWriterOperand {
- enum OpType {
- // Output this text surrounded by quotes to the asm.
- isLiteralTextOperand,
- // This is the name of a routine to call to print the operand.
- isMachineInstrOperand,
- // Output this text verbatim to the asm writer. It is code that
- // will output some text to the asm.
- isLiteralStatementOperand
- } OperandType;
-
- /// MiOpNo - For isMachineInstrOperand, this is the operand number of the
- /// machine instruction.
- unsigned MIOpNo = 0;
-
- /// Str - For isLiteralTextOperand, this IS the literal text. For
- /// isMachineInstrOperand, this is the PrinterMethodName for the operand..
- /// For isLiteralStatementOperand, this is the code to insert verbatim
- /// into the asm writer.
- std::string Str;
-
- /// MiModifier - For isMachineInstrOperand, this is the modifier string for
- /// an operand, specified with syntax like ${opname:modifier}.
- std::string MiModifier;
-
- bool PCRel = false;
-
- // To make VS STL happy
- AsmWriterOperand(OpType op = isLiteralTextOperand):OperandType(op) {}
-
- AsmWriterOperand(const std::string &LitStr,
- OpType op = isLiteralTextOperand)
- : OperandType(op), Str(LitStr) {}
-
- AsmWriterOperand(const std::string &Printer, unsigned _MIOpNo,
- const std::string &Modifier,
- OpType op = isMachineInstrOperand, bool PCRel = false)
- : OperandType(op), MIOpNo(_MIOpNo), Str(Printer), MiModifier(Modifier),
- PCRel(PCRel) {}
-
- bool operator!=(const AsmWriterOperand &Other) const {
- if (OperandType != Other.OperandType || Str != Other.Str) return true;
- if (OperandType == isMachineInstrOperand)
- return MIOpNo != Other.MIOpNo || MiModifier != Other.MiModifier ||
- PCRel != Other.PCRel;
- return false;
- }
- bool operator==(const AsmWriterOperand &Other) const {
- return !operator!=(Other);
- }
-
- /// getCode - Return the code that prints this operand.
- std::string getCode(bool PassSubtarget) const;
- };
-
- class AsmWriterInst {
- public:
- std::vector<AsmWriterOperand> Operands;
- const CodeGenInstruction *CGI;
- unsigned CGIIndex;
-
- AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex,
- unsigned Variant);
-
- /// MatchesAllButOneOp - If this instruction is exactly identical to the
- /// specified instruction except for one differing operand, return the
- /// differing operand number. Otherwise return ~0.
- unsigned MatchesAllButOneOp(const AsmWriterInst &Other) const;
-
- private:
- void AddLiteralString(const std::string &Str) {
- // If the last operand was already a literal text string, append this to
- // it, otherwise add a new operand.
- if (!Operands.empty() &&
- Operands.back().OperandType == AsmWriterOperand::isLiteralTextOperand)
- Operands.back().Str.append(Str);
- else
- Operands.push_back(AsmWriterOperand(Str));
- }
- };
-}
+class CodeGenInstruction;
+
+struct AsmWriterOperand {
+ enum OpType {
+ // Output this text surrounded by quotes to the asm.
+ isLiteralTextOperand,
+ // This is the name of a routine to call to print the operand.
+ isMachineInstrOperand,
+ // Output this text verbatim to the asm writer. It is code that
+ // will output some text to the asm.
+ isLiteralStatementOperand
+ } OperandType;
+
+ /// MiOpNo - For isMachineInstrOperand, this is the operand number of the
+ /// machine instruction.
+ unsigned MIOpNo = 0;
+
+ /// Str - For isLiteralTextOperand, this IS the literal text. For
+ /// isMachineInstrOperand, this is the PrinterMethodName for the operand..
+ /// For isLiteralStatementOperand, this is the code to insert verbatim
+ /// into the asm writer.
+ std::string Str;
+
+ /// MiModifier - For isMachineInstrOperand, this is the modifier string for
+ /// an operand, specified with syntax like ${opname:modifier}.
+ std::string MiModifier;
+
+ bool PCRel = false;
+
+ // To make VS STL happy
+ AsmWriterOperand(OpType op = isLiteralTextOperand) : OperandType(op) {}
+
+ AsmWriterOperand(const std::string &LitStr, OpType op = isLiteralTextOperand)
+ : OperandType(op), Str(LitStr) {}
+
+ AsmWriterOperand(const std::string &Printer, unsigned _MIOpNo,
+ const std::string &Modifier,
+ OpType op = isMachineInstrOperand, bool PCRel = false)
+ : OperandType(op), MIOpNo(_MIOpNo), Str(Printer), MiModifier(Modifier),
+ PCRel(PCRel) {}
+
+ bool operator!=(const AsmWriterOperand &Other) const {
+ if (OperandType != Other.OperandType || Str != Other.Str)
+ return true;
+ if (OperandType == isMachineInstrOperand)
+ return MIOpNo != Other.MIOpNo || MiModifier != Other.MiModifier ||
+ PCRel != Other.PCRel;
+ return false;
+ }
+ bool operator==(const AsmWriterOperand &Other) const {
+ return !operator!=(Other);
+ }
+
+ /// getCode - Return the code that prints this operand.
+ std::string getCode(bool PassSubtarget) const;
+};
+
+class AsmWriterInst {
+public:
+ std::vector<AsmWriterOperand> Operands;
+ const CodeGenInstruction *CGI;
+ unsigned CGIIndex;
+
+ AsmWriterInst(const CodeGenInstruction &CGI, unsigned CGIIndex,
+ unsigned Variant);
+
+ /// MatchesAllButOneOp - If this instruction is exactly identical to the
+ /// specified instruction except for one differing operand, return the
+ /// differing operand number. Otherwise return ~0.
+ unsigned MatchesAllButOneOp(const AsmWriterInst &Other) const;
+
+private:
+ void AddLiteralString(const std::string &Str) {
+ // If the last operand was already a literal text string, append this to
+ // it, otherwise add a new operand.
+ if (!Operands.empty() &&
+ Operands.back().OperandType == AsmWriterOperand::isLiteralTextOperand)
+ Operands.back().Str.append(Str);
+ else
+ Operands.push_back(AsmWriterOperand(Str));
+ }
+};
+} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/CTagsEmitter.cpp b/llvm/utils/TableGen/CTagsEmitter.cpp
index b8e27d057d95..bda18936dc77 100644
--- a/llvm/utils/TableGen/CTagsEmitter.cpp
+++ b/llvm/utils/TableGen/CTagsEmitter.cpp
@@ -1,4 +1,4 @@
-//===- CTagsEmitter.cpp - Generate ctags-compatible index ------------------===//
+//===- CTagsEmitter.cpp - Generate ctags-compatible index -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,6 +30,7 @@ private:
StringRef Id;
StringRef BufferIdentifier;
unsigned Line;
+
public:
Tag(StringRef Name, const SMLoc Location) : Id(Name) {
const MemoryBuffer *CurMB =
@@ -39,7 +40,8 @@ public:
Line = LineAndColumn.first;
}
int operator<(const Tag &B) const {
- return std::make_tuple(Id, BufferIdentifier, Line) < std::make_tuple(B.Id, B.BufferIdentifier, B.Line);
+ return std::make_tuple(Id, BufferIdentifier, Line) <
+ std::make_tuple(B.Id, B.BufferIdentifier, B.Line);
}
void emit(raw_ostream &OS) const {
OS << Id << "\t" << BufferIdentifier << "\t" << Line << "\n";
@@ -49,6 +51,7 @@ public:
class CTagsEmitter {
private:
RecordKeeper &Records;
+
public:
CTagsEmitter(RecordKeeper &R) : Records(R) {}
diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp
index 02e7000130cd..3c3a2874ce80 100644
--- a/llvm/utils/TableGen/CallingConvEmitter.cpp
+++ b/llvm/utils/TableGen/CallingConvEmitter.cpp
@@ -117,23 +117,24 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) {
O << "\n";
EmitAction(Action, 2, O);
}
-
+
O << "\n return true; // CC didn't match.\n";
O << "}\n";
}
-void CallingConvEmitter::EmitAction(Record *Action,
- unsigned Indent, raw_ostream &O) {
+void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent,
+ raw_ostream &O) {
std::string IndentStr = std::string(Indent, ' ');
if (Action->isSubClassOf("CCPredicateAction")) {
O << IndentStr << "if (";
-
+
if (Action->isSubClassOf("CCIfType")) {
ListInit *VTs = Action->getValueAsListInit("VTs");
for (unsigned i = 0, e = VTs->size(); i != e; ++i) {
Record *VT = VTs->getElementAsRecord(i);
- if (i != 0) O << " ||\n " << IndentStr;
+ if (i != 0)
+ O << " ||\n " << IndentStr;
O << "LocVT == " << getEnumName(getValueType(VT));
}
@@ -143,9 +144,9 @@ void CallingConvEmitter::EmitAction(Record *Action,
errs() << *Action;
PrintFatalError(Action->getLoc(), "Unknown CCPredicateAction!");
}
-
+
O << ") {\n";
- EmitAction(Action->getValueAsDef("SubAction"), Indent+2, O);
+ EmitAction(Action->getValueAsDef("SubAction"), Indent + 2, O);
O << IndentStr << "}\n";
} else {
if (Action->isSubClassOf("CCDelegateTo")) {
@@ -241,8 +242,8 @@ void CallingConvEmitter::EmitAction(Record *Action,
O << "\n" << IndentStr << "};\n";
O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList"
- << RegListNumber << ", " << "RegList" << ShadowRegListNumber
- << ")) {\n";
+ << RegListNumber << ", "
+ << "RegList" << ShadowRegListNumber << ")) {\n";
}
O << IndentStr << " State.addLoc(CCValAssign::getReg(ValNo, ValVT, "
<< "Reg, LocVT, LocInfo));\n";
@@ -257,7 +258,8 @@ void CallingConvEmitter::EmitAction(Record *Action,
if (Size)
O << Size << ", ";
else
- O << "\n" << IndentStr
+ O << "\n"
+ << IndentStr
<< " State.getMachineFunction().getDataLayout()."
"getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext())),"
" ";
@@ -269,8 +271,8 @@ void CallingConvEmitter::EmitAction(Record *Action,
<< " State.getMachineFunction().getDataLayout()."
"getABITypeAlign(EVT(LocVT).getTypeForEVT(State.getContext()"
"))";
- O << ");\n" << IndentStr
- << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset"
+ O << ");\n"
+ << IndentStr << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset"
<< Counter << ", LocVT, LocInfo));\n";
O << IndentStr << "return false;\n";
} else if (Action->isSubClassOf("CCAssignToStackWithShadow")) {
@@ -281,7 +283,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
unsigned ShadowRegListNumber = ++Counter;
O << IndentStr << "static const MCPhysReg ShadowRegList"
- << ShadowRegListNumber << "[] = {\n";
+ << ShadowRegListNumber << "[] = {\n";
O << IndentStr << " ";
ListSeparator LS;
for (unsigned i = 0, e = ShadowRegList->size(); i != e; ++i)
@@ -297,7 +299,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
} else if (Action->isSubClassOf("CCPromoteToType")) {
Record *DestTy = Action->getValueAsDef("DestTy");
MVT::SimpleValueType DestVT = getValueType(DestTy);
- O << IndentStr << "LocVT = " << getEnumName(DestVT) <<";\n";
+ O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n";
if (MVT(DestVT).isFloatingPoint()) {
O << IndentStr << "LocInfo = CCValAssign::FPExt;\n";
} else {
@@ -326,15 +328,18 @@ void CallingConvEmitter::EmitAction(Record *Action,
}
} else if (Action->isSubClassOf("CCBitConvertToType")) {
Record *DestTy = Action->getValueAsDef("DestTy");
- O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n";
+ O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy))
+ << ";\n";
O << IndentStr << "LocInfo = CCValAssign::BCvt;\n";
} else if (Action->isSubClassOf("CCTruncToType")) {
Record *DestTy = Action->getValueAsDef("DestTy");
- O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n";
+ O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy))
+ << ";\n";
O << IndentStr << "LocInfo = CCValAssign::Trunc;\n";
} else if (Action->isSubClassOf("CCPassIndirect")) {
Record *DestTy = Action->getValueAsDef("DestTy");
- O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n";
+ O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy))
+ << ";\n";
O << IndentStr << "LocInfo = CCValAssign::Indirect;\n";
} else if (Action->isSubClassOf("CCPassByVal")) {
int Size = Action->getValueAsInt("Size");
@@ -343,8 +348,8 @@ void CallingConvEmitter::EmitAction(Record *Action,
<< Size << ", Align(" << Align << "), ArgFlags);\n";
O << IndentStr << "return false;\n";
} else if (Action->isSubClassOf("CCCustom")) {
- O << IndentStr
- << "if (" << Action->getValueAsString("FuncName") << "(ValNo, ValVT, "
+ O << IndentStr << "if (" << Action->getValueAsString("FuncName")
+ << "(ValNo, ValVT, "
<< "LocVT, LocInfo, ArgFlags, State))\n";
O << IndentStr << " return false;\n";
} else {
@@ -376,9 +381,8 @@ void CallingConvEmitter::EmitArgRegisterLists(raw_ostream &O) {
std::set<std::string> &InnerRegisters = InnerEntry.second;
if (InnerRegisters.find(CCName) != InnerRegisters.end()) {
- AssignedRegsMap[InnerCCName].insert(
- AssignedRegsMap[CCName].begin(),
- AssignedRegsMap[CCName].end());
+ AssignedRegsMap[InnerCCName].insert(AssignedRegsMap[CCName].begin(),
+ AssignedRegsMap[CCName].end());
InnerRegisters.erase(CCName);
}
}
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index 48ed319bf06f..d7020d1d78b0 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -78,8 +78,8 @@ private:
// If the VarBitInit at position 'bit' matches the specified variable then
// return the variable bit position. Otherwise return -1.
-int CodeEmitterGen::getVariableBit(const std::string &VarName,
- BitsInit *BI, int bit) {
+int CodeEmitterGen::getVariableBit(const std::string &VarName, BitsInit *BI,
+ int bit) {
if (VarBitInit *VBI = dyn_cast<VarBitInit>(BI->getBit(bit))) {
if (VarInit *VI = dyn_cast<VarInit>(VBI->getBitVar()))
if (VI->getName() == VarName)
@@ -101,16 +101,16 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI,
CodeGenInstruction &CGI = Target.getInstruction(R);
// Determine if VarName actually contributes to the Inst encoding.
- int bit = BI->getNumBits()-1;
+ int bit = BI->getNumBits() - 1;
// Scan for a bit that this contributed to.
- for (; bit >= 0; ) {
+ for (; bit >= 0;) {
if (getVariableBit(VarName, BI, bit) != -1)
break;
-
+
--bit;
}
-
+
// If we found no bits, ignore this value, otherwise emit the call to get the
// operand encoding.
if (bit < 0)
@@ -127,12 +127,14 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI,
// Get the machine operand number for the indicated operand.
OpIdx = CGI.Operands[OpIdx].MIOperandNo;
} else {
- PrintError(R, Twine("No operand named ") + VarName + " in record " + R->getName());
+ PrintError(R, Twine("No operand named ") + VarName + " in record " +
+ R->getName());
return false;
}
if (CGI.Operands.isFlatOperandNotEmitted(OpIdx)) {
- PrintError(R, "Operand " + VarName + " used but also marked as not emitted!");
+ PrintError(R,
+ "Operand " + VarName + " used but also marked as not emitted!");
return false;
}
@@ -156,10 +158,12 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI,
Case += ", Fixups, STI);\n";
} else {
if (UseAPInt) {
- Case += " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")";
+ Case +=
+ " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")";
Case += ", op, Fixups, STI";
} else {
- Case += " op = getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")";
+ Case += " op = getMachineOpValue(MI, MI.getOperand(" +
+ utostr(OpIdx) + ")";
Case += ", Fixups, STI";
}
Case += ");\n";
@@ -193,9 +197,9 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI,
}
unsigned BitOffset = -1;
- for (; bit >= 0; ) {
+ for (; bit >= 0;) {
int varBit = getVariableBit(VarName, BI, bit);
-
+
// If this bit isn't from a variable, skip it.
if (varBit == -1) {
--bit;
@@ -209,7 +213,8 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI,
int N = 1;
for (--bit; bit >= 0;) {
varBit = getVariableBit(VarName, BI, bit);
- if (varBit == -1 || varBit != (beginVarBit - N)) break;
+ if (varBit == -1 || varBit != (beginVarBit - N))
+ break;
++N;
--bit;
}
@@ -368,7 +373,9 @@ void CodeEmitterGen::emitInstructionBaseValues(
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo")) {
- o << " "; emitInstBits(o, APInt(BitWidth, 0)); o << ",\n";
+ o << " ";
+ emitInstBits(o, APInt(BitWidth, 0));
+ o << ",\n";
continue;
}
@@ -419,13 +426,13 @@ void CodeEmitterGen::run(raw_ostream &o) {
emitSourceFileHeader("Machine Code Emitter", o);
CodeGenTarget Target(Records);
- std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction");
+ std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Instruction");
// For little-endian instruction bit encodings, reverse the bit order
Target.reverseBitsForLittleEndianEncoding();
- ArrayRef<const CodeGenInstruction*> NumberedInstructions =
- Target.getInstructionsByEnumValue();
+ ArrayRef<const CodeGenInstruction *> NumberedInstructions =
+ Target.getInstructionsByEnumValue();
if (any_of(NumberedInstructions, [](const CodeGenInstruction *CGI) {
Record *R = CGI->TheDef;
diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
index f88e25ea1d16..a9046e09a629 100644
--- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -38,18 +38,10 @@ using namespace llvm;
static inline bool isIntegerOrPtr(MVT VT) {
return VT.isInteger() || VT == MVT::iPTR;
}
-static inline bool isFloatingPoint(MVT VT) {
- return VT.isFloatingPoint();
-}
-static inline bool isVector(MVT VT) {
- return VT.isVector();
-}
-static inline bool isScalar(MVT VT) {
- return !VT.isVector();
-}
-static inline bool isScalarInteger(MVT VT) {
- return VT.isScalarInteger();
-}
+static inline bool isFloatingPoint(MVT VT) { return VT.isFloatingPoint(); }
+static inline bool isVector(MVT VT) { return VT.isVector(); }
+static inline bool isScalar(MVT VT) { return !VT.isVector(); }
+static inline bool isScalarInteger(MVT VT) { return VT.isScalarInteger(); }
template <typename Predicate>
static bool berase_if(MachineValueTypeSet &S, Predicate P) {
@@ -173,8 +165,7 @@ bool TypeSetByHwMode::constrain(const TypeSetByHwMode &VTS) {
return Changed;
}
-template <typename Predicate>
-bool TypeSetByHwMode::constrain(Predicate P) {
+template <typename Predicate> bool TypeSetByHwMode::constrain(Predicate P) {
bool Changed = false;
for (auto &I : *this)
Changed |= berase_if(I.second, [&P](MVT VT) { return !P(VT); });
@@ -257,20 +248,18 @@ bool TypeSetByHwMode::operator==(const TypeSetByHwMode &VTS) const {
}
namespace llvm {
- raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T) {
- T.writeToStream(OS);
- return OS;
- }
- raw_ostream &operator<<(raw_ostream &OS, const TypeSetByHwMode &T) {
- T.writeToStream(OS);
- return OS;
- }
+raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T) {
+ T.writeToStream(OS);
+ return OS;
+}
+raw_ostream &operator<<(raw_ostream &OS, const TypeSetByHwMode &T) {
+ T.writeToStream(OS);
+ return OS;
}
+} // namespace llvm
LLVM_DUMP_METHOD
-void TypeSetByHwMode::dump() const {
- dbgs() << *this << '\n';
-}
+void TypeSetByHwMode::dump() const { dbgs() << *this << '\n'; }
bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) {
bool OutP = Out.count(MVT::iPTR), InP = In.count(MVT::iPTR);
@@ -335,7 +324,7 @@ bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) {
// OutP == true
SetType InOnly = subtract(In, Out);
unsigned SizeOut = Out.size();
- berase_if(Out, CompIn); // This will remove at least the iPTR.
+ berase_if(Out, CompIn); // This will remove at least the iPTR.
unsigned NumI = llvm::count_if(InOnly, isScalarInteger);
if (NumI == 0) {
// iPTR deleted from Out.
@@ -568,29 +557,29 @@ bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small, TypeSetByHwMode &Big,
// smaller-or-equal than MinS.
auto MinS = min_if(S.begin(), S.end(), isScalar, LT);
if (MinS != S.end())
- Changed |= berase_if(B, std::bind(SameKindLE,
- std::placeholders::_1, *MinS));
+ Changed |=
+ berase_if(B, std::bind(SameKindLE, std::placeholders::_1, *MinS));
// MaxS = max scalar in Big, remove all scalars from Small that are
// larger than MaxS.
auto MaxS = max_if(B.begin(), B.end(), isScalar, LT);
if (MaxS != B.end())
- Changed |= berase_if(S, std::bind(SameKindLE,
- *MaxS, std::placeholders::_1));
+ Changed |=
+ berase_if(S, std::bind(SameKindLE, *MaxS, std::placeholders::_1));
// MinV = min vector in Small, remove all vectors from Big that are
// smaller-or-equal than MinV.
auto MinV = min_if(S.begin(), S.end(), isVector, LT);
if (MinV != S.end())
- Changed |= berase_if(B, std::bind(SameKindLE,
- std::placeholders::_1, *MinV));
+ Changed |=
+ berase_if(B, std::bind(SameKindLE, std::placeholders::_1, *MinV));
// MaxV = max vector in Big, remove all vectors from Small that are
// larger than MaxV.
auto MaxV = max_if(B.begin(), B.end(), isVector, LT);
if (MaxV != B.end())
- Changed |= berase_if(S, std::bind(SameKindLE,
- *MaxV, std::placeholders::_1));
+ Changed |=
+ berase_if(S, std::bind(SameKindLE, *MaxV, std::placeholders::_1));
}
return Changed;
@@ -618,8 +607,8 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec,
TypeSetByHwMode::SetType &V = Vec.get(M);
TypeSetByHwMode::SetType &E = Elem.get(M);
- Changed |= berase_if(V, isScalar); // Scalar = !vector
- Changed |= berase_if(E, isVector); // Vector = !scalar
+ Changed |= berase_if(V, isScalar); // Scalar = !vector
+ Changed |= berase_if(E, isVector); // Vector = !scalar
assert(!V.empty() && !E.empty());
MachineValueTypeSet VT, ST;
@@ -632,8 +621,8 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec,
// Remove from V all (vector) types whose element type is not in S.
Changed |= berase_if(V, [&ST](MVT T) -> bool {
- return !ST.count(T.getVectorElementType());
- });
+ return !ST.count(T.getVectorElementType());
+ });
// Remove from E all (scalar) types, for which there is no corresponding
// type in V.
Changed |= berase_if(E, [&VT](MVT T) -> bool { return !VT.count(T); });
@@ -887,7 +876,6 @@ TypeInfer::ValidateOnExit::~ValidateOnExit() {
}
}
-
//===----------------------------------------------------------------------===//
// ScopedName Implementation
//===----------------------------------------------------------------------===//
@@ -896,10 +884,7 @@ bool ScopedName::operator==(const ScopedName &o) const {
return Scope == o.Scope && Identifier == o.Identifier;
}
-bool ScopedName::operator!=(const ScopedName &o) const {
- return !(*this == o);
-}
-
+bool ScopedName::operator!=(const ScopedName &o) const { return !(*this == o); }
//===----------------------------------------------------------------------===//
// TreePredicateFn Implementation
@@ -1011,8 +996,9 @@ std::string TreePredicateFn::getPredCode() const {
PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(),
"IsAtomicOrderingAcquireRelease requires IsAtomic");
if (isAtomicOrderingSequentiallyConsistent())
- PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(),
- "IsAtomicOrderingSequentiallyConsistent requires IsAtomic");
+ PrintFatalError(
+ getOrigPatFragRecord()->getRecord()->getLoc(),
+ "IsAtomicOrderingSequentiallyConsistent requires IsAtomic");
if (isAtomicOrderingAcquireOrStronger())
PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(),
"IsAtomicOrderingAcquireOrStronger requires IsAtomic");
@@ -1027,7 +1013,7 @@ std::string TreePredicateFn::getPredCode() const {
if (isLoad() || isStore() || isAtomic()) {
if (ListInit *AddressSpaces = getAddressSpaces()) {
Code += "unsigned AddrSpace = cast<MemSDNode>(N)->getAddressSpace();\n"
- " if (";
+ " if (";
ListSeparator LS(" && ");
for (Init *Val : AddressSpaces->getValues()) {
@@ -1077,18 +1063,22 @@ std::string TreePredicateFn::getPredCode() const {
"AtomicOrdering::SequentiallyConsistent) return false;\n";
if (isAtomic() && isAtomicOrderingAcquireOrStronger())
- Code += "if (!isAcquireOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
- "return false;\n";
+ Code +=
+ "if (!isAcquireOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
+ "return false;\n";
if (isAtomic() && isAtomicOrderingWeakerThanAcquire())
- Code += "if (isAcquireOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
- "return false;\n";
+ Code +=
+ "if (isAcquireOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
+ "return false;\n";
if (isAtomic() && isAtomicOrderingReleaseOrStronger())
- Code += "if (!isReleaseOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
- "return false;\n";
+ Code +=
+ "if (!isReleaseOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
+ "return false;\n";
if (isAtomic() && isAtomicOrderingWeakerThanRelease())
- Code += "if (isReleaseOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
- "return false;\n";
+ Code +=
+ "if (isReleaseOrStronger(cast<AtomicSDNode>(N)->getMergedOrdering())) "
+ "return false;\n";
// TODO: Handle atomic sextload/zextload normally when ATOMIC_LOAD is removed.
if (isAtomic() && (isZeroExtLoad() || isSignExtLoad()))
@@ -1239,16 +1229,20 @@ bool TreePredicateFn::isAtomicOrderingSequentiallyConsistent() const {
true);
}
bool TreePredicateFn::isAtomicOrderingAcquireOrStronger() const {
- return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", true);
+ return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger",
+ true);
}
bool TreePredicateFn::isAtomicOrderingWeakerThanAcquire() const {
- return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger", false);
+ return isPredefinedPredicateEqualTo("IsAtomicOrderingAcquireOrStronger",
+ false);
}
bool TreePredicateFn::isAtomicOrderingReleaseOrStronger() const {
- return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", true);
+ return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger",
+ true);
}
bool TreePredicateFn::isAtomicOrderingWeakerThanRelease() const {
- return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger", false);
+ return isPredefinedPredicateEqualTo("IsAtomicOrderingReleaseOrStronger",
+ false);
}
Record *TreePredicateFn::getMemoryVT() const {
Record *R = getOrigPatFragRecord()->getRecord();
@@ -1412,10 +1406,10 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const {
// PatternToMatch implementation
//
-static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) {
- if (!P->isLeaf())
+static bool isImmAllOnesAllZerosMatch(const TreePatternNode &P) {
+ if (!P.isLeaf())
return false;
- DefInit *DI = dyn_cast<DefInit>(P->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(P.getLeafValue());
if (!DI)
return false;
@@ -1426,15 +1420,15 @@ static bool isImmAllOnesAllZerosMatch(const TreePatternNode *P) {
/// getPatternSize - Return the 'size' of this pattern. We want to match large
/// patterns before small ones. This is used to determine the size of a
/// pattern.
-static unsigned getPatternSize(const TreePatternNode *P,
+static unsigned getPatternSize(const TreePatternNode &P,
const CodeGenDAGPatterns &CGP) {
- unsigned Size = 3; // The node itself.
+ unsigned Size = 3; // The node itself.
// If the root node is a ConstantSDNode, increases its size.
// e.g. (set R32:$dst, 0).
- if (P->isLeaf() && isa<IntInit>(P->getLeafValue()))
+ if (P.isLeaf() && isa<IntInit>(P.getLeafValue()))
Size += 2;
- if (const ComplexPattern *AM = P->getComplexPatternInfo(CGP)) {
+ if (const ComplexPattern *AM = P.getComplexPatternInfo(CGP)) {
Size += AM->getComplexity();
// We don't want to count any children twice, so return early.
return Size;
@@ -1442,14 +1436,14 @@ static unsigned getPatternSize(const TreePatternNode *P,
// If this node has some predicate function that must match, it adds to the
// complexity of this node.
- if (!P->getPredicateCalls().empty())
+ if (!P.getPredicateCalls().empty())
++Size;
// Count children in the count if they are also nodes.
- for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i) {
- const TreePatternNode *Child = P->getChild(i);
- if (!Child->isLeaf() && Child->getNumTypes()) {
- const TypeSetByHwMode &T0 = Child->getExtType(0);
+ for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i) {
+ const TreePatternNode &Child = P.getChild(i);
+ if (!Child.isLeaf() && Child.getNumTypes()) {
+ const TypeSetByHwMode &T0 = Child.getExtType(0);
// At this point, all variable type sets should be simple, i.e. only
// have a default mode.
if (T0.getMachineValueType() != MVT::Other) {
@@ -1457,14 +1451,14 @@ static unsigned getPatternSize(const TreePatternNode *P,
continue;
}
}
- if (Child->isLeaf()) {
- if (isa<IntInit>(Child->getLeafValue()))
- Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2).
- else if (Child->getComplexPatternInfo(CGP))
+ if (Child.isLeaf()) {
+ if (isa<IntInit>(Child.getLeafValue()))
+ Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2).
+ else if (Child.getComplexPatternInfo(CGP))
Size += getPatternSize(Child, CGP);
else if (isImmAllOnesAllZerosMatch(Child))
Size += 4; // Matches a build_vector(+3) and a predicate (+1).
- else if (!Child->getPredicateCalls().empty())
+ else if (!Child.getPredicateCalls().empty())
++Size;
}
}
@@ -1474,8 +1468,7 @@ static unsigned getPatternSize(const TreePatternNode *P,
/// Compute the complexity metric for the input pattern. This roughly
/// corresponds to the number of nodes that are covered.
-int PatternToMatch::
-getPatternComplexity(const CodeGenDAGPatterns &CGP) const {
+int PatternToMatch::getPatternComplexity(const CodeGenDAGPatterns &CGP) const {
return getPatternSize(getSrcPattern(), CGP) + getAddedComplexity();
}
@@ -1550,18 +1543,17 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) {
} else if (R->isSubClassOf("SDTCisVTSmallerThanOp")) {
ConstraintType = SDTCisVTSmallerThanOp;
x.SDTCisVTSmallerThanOp_Info.OtherOperandNum =
- R->getValueAsInt("OtherOperandNum");
+ R->getValueAsInt("OtherOperandNum");
} else if (R->isSubClassOf("SDTCisOpSmallerThanOp")) {
ConstraintType = SDTCisOpSmallerThanOp;
x.SDTCisOpSmallerThanOp_Info.BigOperandNum =
- R->getValueAsInt("BigOperandNum");
+ R->getValueAsInt("BigOperandNum");
} else if (R->isSubClassOf("SDTCisEltOfVec")) {
ConstraintType = SDTCisEltOfVec;
x.SDTCisEltOfVec_Info.OtherOperandNum = R->getValueAsInt("OtherOpNum");
} else if (R->isSubClassOf("SDTCisSubVecOfVec")) {
ConstraintType = SDTCisSubVecOfVec;
- x.SDTCisSubVecOfVec_Info.OtherOperandNum =
- R->getValueAsInt("OtherOpNum");
+ x.SDTCisSubVecOfVec_Info.OtherOperandNum = R->getValueAsInt("OtherOpNum");
} else if (R->isSubClassOf("SDTCVecEltisVT")) {
ConstraintType = SDTCVecEltisVT;
VVT = getValueTypeByHwMode(R->getValueAsDef("VT"), CGH);
@@ -1577,11 +1569,11 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) {
} else if (R->isSubClassOf("SDTCisSameNumEltsAs")) {
ConstraintType = SDTCisSameNumEltsAs;
x.SDTCisSameNumEltsAs_Info.OtherOperandNum =
- R->getValueAsInt("OtherOperandNum");
+ R->getValueAsInt("OtherOperandNum");
} else if (R->isSubClassOf("SDTCisSameSizeAs")) {
ConstraintType = SDTCisSameSizeAs;
x.SDTCisSameSizeAs_Info.OtherOperandNum =
- R->getValueAsInt("OtherOperandNum");
+ R->getValueAsInt("OtherOperandNum");
} else {
PrintFatalError(R->getLoc(),
"Unrecognized SDTypeConstraint '" + R->getName() + "'!\n");
@@ -1590,7 +1582,7 @@ SDTypeConstraint::SDTypeConstraint(Record *R, const CodeGenHwModes &CGH) {
/// getOperandNum - Return the node corresponding to operand #OpNo in tree
/// N, and the result number in ResNo.
-static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N,
+static TreePatternNode &getOperandNum(unsigned OpNo, TreePatternNode &N,
const SDNodeInfo &NodeInfo,
unsigned &ResNo) {
unsigned NumResults = NodeInfo.getNumResults();
@@ -1601,126 +1593,120 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N,
OpNo -= NumResults;
- if (OpNo >= N->getNumChildren()) {
+ if (OpNo >= N.getNumChildren()) {
std::string S;
raw_string_ostream OS(S);
- OS << "Invalid operand number in type constraint "
- << (OpNo+NumResults) << " ";
- N->print(OS);
+ OS << "Invalid operand number in type constraint " << (OpNo + NumResults)
+ << " ";
+ N.print(OS);
PrintFatalError(S);
}
- return N->getChild(OpNo);
+ return N.getChild(OpNo);
}
/// ApplyTypeConstraint - Given a node in a pattern, apply this type
/// constraint to the nodes operands. This returns true if it makes a
/// change, false otherwise. If a type contradiction is found, flag an error.
-bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
+bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode &N,
const SDNodeInfo &NodeInfo,
TreePattern &TP) const {
if (TP.hasError())
return false;
unsigned ResNo = 0; // The result number being referenced.
- TreePatternNode *NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo);
+ TreePatternNode &NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo);
TypeInfer &TI = TP.getInfer();
switch (ConstraintType) {
case SDTCisVT:
// Operand must be a particular type.
- return NodeToApply->UpdateNodeType(ResNo, VVT, TP);
+ return NodeToApply.UpdateNodeType(ResNo, VVT, TP);
case SDTCisPtrTy:
// Operand must be same as target pointer type.
- return NodeToApply->UpdateNodeType(ResNo, MVT::iPTR, TP);
+ return NodeToApply.UpdateNodeType(ResNo, MVT::iPTR, TP);
case SDTCisInt:
// Require it to be one of the legal integer VTs.
- return TI.EnforceInteger(NodeToApply->getExtType(ResNo));
+ return TI.EnforceInteger(NodeToApply.getExtType(ResNo));
case SDTCisFP:
// Require it to be one of the legal fp VTs.
- return TI.EnforceFloatingPoint(NodeToApply->getExtType(ResNo));
+ return TI.EnforceFloatingPoint(NodeToApply.getExtType(ResNo));
case SDTCisVec:
// Require it to be one of the legal vector VTs.
- return TI.EnforceVector(NodeToApply->getExtType(ResNo));
+ return TI.EnforceVector(NodeToApply.getExtType(ResNo));
case SDTCisSameAs: {
unsigned OResNo = 0;
- TreePatternNode *OtherNode =
- getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo);
- return (int)NodeToApply->UpdateNodeType(ResNo,
- OtherNode->getExtType(OResNo), TP) |
- (int)OtherNode->UpdateNodeType(OResNo,
- NodeToApply->getExtType(ResNo), TP);
+ TreePatternNode &OtherNode =
+ getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo);
+ return (int)NodeToApply.UpdateNodeType(ResNo, OtherNode.getExtType(OResNo),
+ TP) |
+ (int)OtherNode.UpdateNodeType(OResNo, NodeToApply.getExtType(ResNo),
+ TP);
}
case SDTCisVTSmallerThanOp: {
// The NodeToApply must be a leaf node that is a VT. OtherOperandNum must
// have an integer type that is smaller than the VT.
- if (!NodeToApply->isLeaf() ||
- !isa<DefInit>(NodeToApply->getLeafValue()) ||
- !cast<DefInit>(NodeToApply->getLeafValue())->getDef()
- ->isSubClassOf("ValueType")) {
- TP.error(N->getOperator()->getName() + " expects a VT operand!");
+ if (!NodeToApply.isLeaf() || !isa<DefInit>(NodeToApply.getLeafValue()) ||
+ !cast<DefInit>(NodeToApply.getLeafValue())
+ ->getDef()
+ ->isSubClassOf("ValueType")) {
+ TP.error(N.getOperator()->getName() + " expects a VT operand!");
return false;
}
- DefInit *DI = cast<DefInit>(NodeToApply->getLeafValue());
+ DefInit *DI = cast<DefInit>(NodeToApply.getLeafValue());
const CodeGenTarget &T = TP.getDAGPatterns().getTargetInfo();
auto VVT = getValueTypeByHwMode(DI->getDef(), T.getHwModes());
TypeSetByHwMode TypeListTmp(VVT);
unsigned OResNo = 0;
- TreePatternNode *OtherNode =
- getOperandNum(x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo,
- OResNo);
+ TreePatternNode &OtherNode = getOperandNum(
+ x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo, OResNo);
- return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo),
+ return TI.EnforceSmallerThan(TypeListTmp, OtherNode.getExtType(OResNo),
/*SmallIsVT*/ true);
}
case SDTCisOpSmallerThanOp: {
unsigned BResNo = 0;
- TreePatternNode *BigOperand =
- getOperandNum(x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo,
- BResNo);
- return TI.EnforceSmallerThan(NodeToApply->getExtType(ResNo),
- BigOperand->getExtType(BResNo));
+ TreePatternNode &BigOperand = getOperandNum(
+ x.SDTCisOpSmallerThanOp_Info.BigOperandNum, N, NodeInfo, BResNo);
+ return TI.EnforceSmallerThan(NodeToApply.getExtType(ResNo),
+ BigOperand.getExtType(BResNo));
}
case SDTCisEltOfVec: {
unsigned VResNo = 0;
- TreePatternNode *VecOperand =
- getOperandNum(x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo,
- VResNo);
+ TreePatternNode &VecOperand = getOperandNum(
+ x.SDTCisEltOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo);
// Filter vector types out of VecOperand that don't have the right element
// type.
- return TI.EnforceVectorEltTypeIs(VecOperand->getExtType(VResNo),
- NodeToApply->getExtType(ResNo));
+ return TI.EnforceVectorEltTypeIs(VecOperand.getExtType(VResNo),
+ NodeToApply.getExtType(ResNo));
}
case SDTCisSubVecOfVec: {
unsigned VResNo = 0;
- TreePatternNode *BigVecOperand =
- getOperandNum(x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo,
- VResNo);
+ TreePatternNode &BigVecOperand = getOperandNum(
+ x.SDTCisSubVecOfVec_Info.OtherOperandNum, N, NodeInfo, VResNo);
// Filter vector types out of BigVecOperand that don't have the
// right subvector type.
- return TI.EnforceVectorSubVectorTypeIs(BigVecOperand->getExtType(VResNo),
- NodeToApply->getExtType(ResNo));
+ return TI.EnforceVectorSubVectorTypeIs(BigVecOperand.getExtType(VResNo),
+ NodeToApply.getExtType(ResNo));
}
case SDTCVecEltisVT: {
- return TI.EnforceVectorEltTypeIs(NodeToApply->getExtType(ResNo), VVT);
+ return TI.EnforceVectorEltTypeIs(NodeToApply.getExtType(ResNo), VVT);
}
case SDTCisSameNumEltsAs: {
unsigned OResNo = 0;
- TreePatternNode *OtherNode =
- getOperandNum(x.SDTCisSameNumEltsAs_Info.OtherOperandNum,
- N, NodeInfo, OResNo);
- return TI.EnforceSameNumElts(OtherNode->getExtType(OResNo),
- NodeToApply->getExtType(ResNo));
+ TreePatternNode &OtherNode = getOperandNum(
+ x.SDTCisSameNumEltsAs_Info.OtherOperandNum, N, NodeInfo, OResNo);
+ return TI.EnforceSameNumElts(OtherNode.getExtType(OResNo),
+ NodeToApply.getExtType(ResNo));
}
case SDTCisSameSizeAs: {
unsigned OResNo = 0;
- TreePatternNode *OtherNode =
- getOperandNum(x.SDTCisSameSizeAs_Info.OtherOperandNum,
- N, NodeInfo, OResNo);
- return TI.EnforceSameSize(OtherNode->getExtType(OResNo),
- NodeToApply->getExtType(ResNo));
+ TreePatternNode &OtherNode = getOperandNum(
+ x.SDTCisSameSizeAs_Info.OtherOperandNum, N, NodeInfo, OResNo);
+ return TI.EnforceSameSize(OtherNode.getExtType(OResNo),
+ NodeToApply.getExtType(ResNo));
}
}
llvm_unreachable("Invalid ConstraintType!");
@@ -1729,8 +1715,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
// Update the node type to match an instruction operand or result as specified
// in the ins or outs lists on the instruction definition. Return true if the
// type was actually changed.
-bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo,
- Record *Operand,
+bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo, Record *Operand,
TreePattern &TP) {
// The 'unknown' operand indicates that types should be inferred from the
// context.
@@ -1766,7 +1751,7 @@ bool TreePatternNode::ContainsUnresolvedType(TreePattern &TP) const {
if (!TP.getInfer().isConcrete(Types[i], true))
return true;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- if (getChild(i)->ContainsUnresolvedType(TP))
+ if (getChild(i).ContainsUnresolvedType(TP))
return true;
return false;
}
@@ -1808,7 +1793,7 @@ bool TreePatternNode::setDefaultMode(unsigned Mode) {
// SDNodeInfo implementation
//
SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) {
- EnumName = R->getValueAsString("Opcode");
+ EnumName = R->getValueAsString("Opcode");
SDClassName = R->getValueAsString("SDClass");
Record *TypeProfile = R->getValueAsDef("TypeProfile");
NumResults = TypeProfile->getValueAsInt("NumResults");
@@ -1818,8 +1803,8 @@ SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) {
Properties = parseSDPatternOperatorProperties(R);
// Parse the type constraints.
- std::vector<Record*> ConstraintList =
- TypeProfile->getValueAsListOfDefs("Constraints");
+ std::vector<Record *> ConstraintList =
+ TypeProfile->getValueAsListOfDefs("Constraints");
for (Record *R : ConstraintList)
TypeConstraints.emplace_back(R, CGH);
}
@@ -1835,11 +1820,12 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const {
for (const SDTypeConstraint &Constraint : TypeConstraints) {
// Make sure that this applies to the correct node result.
- if (Constraint.OperandNo >= NumResults) // FIXME: need value #
+ if (Constraint.OperandNo >= NumResults) // FIXME: need value #
continue;
switch (Constraint.ConstraintType) {
- default: break;
+ default:
+ break;
case SDTypeConstraint::SDTCisVT:
if (Constraint.VVT.isSimple())
return Constraint.VVT.getSimple().SimpleTy;
@@ -1856,9 +1842,8 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const {
//
static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
- if (Operator->getName() == "set" ||
- Operator->getName() == "implicit")
- return 0; // All return nothing.
+ if (Operator->getName() == "set" || Operator->getName() == "implicit")
+ return 0; // All return nothing.
if (Operator->isSubClassOf("Intrinsic"))
return CDP.getIntrinsic(Operator).IS.RetTys.size();
@@ -1908,16 +1893,17 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
}
// Add on one implicit def if it has a resolvable type.
- if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) !=MVT::Other)
+ if (InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo()) !=
+ MVT::Other)
++NumDefsToAdd;
return NumDefsToAdd;
}
if (Operator->isSubClassOf("SDNodeXForm"))
- return 1; // FIXME: Generalize SDNodeXForm
+ return 1; // FIXME: Generalize SDNodeXForm
if (Operator->isSubClassOf("ValueType"))
- return 1; // A type-cast of one result.
+ return 1; // A type-cast of one result.
if (Operator->isSubClassOf("ComplexPattern"))
return 1;
@@ -1943,7 +1929,7 @@ void TreePatternNode::print(raw_ostream &OS) const {
ListSeparator LS;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i) {
OS << LS;
- getChild(i)->print(OS);
+ getChild(i).print(OS);
}
}
OS << ")";
@@ -1963,9 +1949,7 @@ void TreePatternNode::print(raw_ostream &OS) const {
for (const ScopedName &Name : NamesAsPredicateArg)
OS << ":$pred:" << Name.getScope() << ":" << Name.getIdentifier();
}
-void TreePatternNode::dump() const {
- print(errs());
-}
+void TreePatternNode::dump() const { print(errs()); }
/// isIsomorphicTo - Return true if this node is recursively
/// isomorphic to the specified node. For this comparison, the node's
@@ -1974,36 +1958,37 @@ void TreePatternNode::dump() const {
/// the assigned name is present in the dependent variable set, then
/// the assigned name is considered significant and the node is
/// isomorphic if the names match.
-bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N,
+bool TreePatternNode::isIsomorphicTo(const TreePatternNode &N,
const MultipleUseVarSet &DepVars) const {
- if (N == this) return true;
- if (N->isLeaf() != isLeaf())
+ if (&N == this)
+ return true;
+ if (N.isLeaf() != isLeaf())
return false;
// Check operator of non-leaves early since it can be cheaper than checking
// types.
if (!isLeaf())
- if (N->getOperator() != getOperator() ||
- N->getNumChildren() != getNumChildren())
+ if (N.getOperator() != getOperator() ||
+ N.getNumChildren() != getNumChildren())
return false;
- if (getExtTypes() != N->getExtTypes() ||
- getPredicateCalls() != N->getPredicateCalls() ||
- getTransformFn() != N->getTransformFn())
+ if (getExtTypes() != N.getExtTypes() ||
+ getPredicateCalls() != N.getPredicateCalls() ||
+ getTransformFn() != N.getTransformFn())
return false;
if (isLeaf()) {
if (DefInit *DI = dyn_cast<DefInit>(getLeafValue())) {
- if (DefInit *NDI = dyn_cast<DefInit>(N->getLeafValue())) {
+ if (DefInit *NDI = dyn_cast<DefInit>(N.getLeafValue())) {
return ((DI->getDef() == NDI->getDef()) &&
- (!DepVars.contains(getName()) || getName() == N->getName()));
+ (!DepVars.contains(getName()) || getName() == N.getName()));
}
}
- return getLeafValue() == N->getLeafValue();
+ return getLeafValue() == N.getLeafValue();
}
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- if (!getChild(i)->isIsomorphicTo(N->getChild(i), DepVars))
+ if (!getChild(i).isIsomorphicTo(N.getChild(i), DepVars))
return false;
return true;
}
@@ -2018,7 +2003,7 @@ TreePatternNodePtr TreePatternNode::clone() const {
std::vector<TreePatternNodePtr> CChildren;
CChildren.reserve(Children.size());
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- CChildren.push_back(getChild(i)->clone());
+ CChildren.push_back(getChild(i).clone());
New = makeIntrusiveRefCnt<TreePatternNode>(
getOperator(), std::move(CChildren), getNumTypes());
}
@@ -2035,41 +2020,42 @@ TreePatternNodePtr TreePatternNode::clone() const {
void TreePatternNode::RemoveAllTypes() {
// Reset to unknown type.
std::fill(Types.begin(), Types.end(), TypeSetByHwMode());
- if (isLeaf()) return;
+ if (isLeaf())
+ return;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- getChild(i)->RemoveAllTypes();
+ getChild(i).RemoveAllTypes();
}
-
/// SubstituteFormalArguments - Replace the formal arguments in this tree
/// with actual values specified by ArgMap.
void TreePatternNode::SubstituteFormalArguments(
std::map<std::string, TreePatternNodePtr> &ArgMap) {
- if (isLeaf()) return;
+ if (isLeaf())
+ return;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i) {
- TreePatternNode *Child = getChild(i);
- if (Child->isLeaf()) {
- Init *Val = Child->getLeafValue();
+ TreePatternNode &Child = getChild(i);
+ if (Child.isLeaf()) {
+ Init *Val = Child.getLeafValue();
// Note that, when substituting into an output pattern, Val might be an
// UnsetInit.
- if (isa<UnsetInit>(Val) || (isa<DefInit>(Val) &&
- cast<DefInit>(Val)->getDef()->getName() == "node")) {
+ if (isa<UnsetInit>(Val) ||
+ (isa<DefInit>(Val) &&
+ cast<DefInit>(Val)->getDef()->getName() == "node")) {
// We found a use of a formal argument, replace it with its value.
- TreePatternNodePtr NewChild = ArgMap[Child->getName()];
+ TreePatternNodePtr NewChild = ArgMap[Child.getName()];
assert(NewChild && "Couldn't find formal argument!");
- assert((Child->getPredicateCalls().empty() ||
- NewChild->getPredicateCalls() == Child->getPredicateCalls()) &&
+ assert((Child.getPredicateCalls().empty() ||
+ NewChild->getPredicateCalls() == Child.getPredicateCalls()) &&
"Non-empty child predicate clobbered!");
setChild(i, std::move(NewChild));
}
} else {
- getChild(i)->SubstituteFormalArguments(ArgMap);
+ getChild(i).SubstituteFormalArguments(ArgMap);
}
}
}
-
/// InlinePatternFragments - If this pattern refers to any pattern
/// fragments, return the set of inlined versions (this can be more than
/// one if a PatFrags record has multiple alternatives).
@@ -2205,7 +2191,7 @@ void TreePatternNode::InlinePatternFragments(
for (const TreePredicateCall &Pred : getPredicateCalls())
FragTree->addPredicateCall(Pred);
- // The fragment we inlined could have recursive inlining that is needed. See
+ // The fragment we inlined could have recursive inlining that is needed. See
// if there are any pattern fragments in it and inline them as needed.
FragTree->InlinePatternFragments(TP, OutAlternatives);
}
@@ -2224,8 +2210,7 @@ void TreePatternNode::InlinePatternFragments(
/// GPR:$src operand above.
///
static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo,
- bool NotRegisters,
- bool Unnamed,
+ bool NotRegisters, bool Unnamed,
TreePattern &TP) {
CodeGenDAGPatterns &CDP = TP.getDAGPatterns();
@@ -2331,17 +2316,16 @@ static TypeSetByHwMode getImplicitType(Record *R, unsigned ResNo,
return TypeSetByHwMode(MVT::Other);
}
-
/// getIntrinsicInfo - If this node corresponds to an intrinsic, return the
/// CodeGenIntrinsic information for it, otherwise return a null pointer.
-const CodeGenIntrinsic *TreePatternNode::
-getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const {
+const CodeGenIntrinsic *
+TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const {
if (getOperator() != CDP.get_intrinsic_void_sdnode() &&
getOperator() != CDP.get_intrinsic_w_chain_sdnode() &&
getOperator() != CDP.get_intrinsic_wo_chain_sdnode())
return nullptr;
- unsigned IID = cast<IntInit>(getChild(0)->getLeafValue())->getValue();
+ unsigned IID = cast<IntInit>(getChild(0).getLeafValue())->getValue();
return &CDP.getIntrinsicInfo(IID);
}
@@ -2406,9 +2390,6 @@ bool TreePatternNode::NodeHasProperty(SDNP Property,
return CGP.getSDNodeInfo(getOperator()).hasProperty(Property);
}
-
-
-
/// TreeHasProperty - Return true if any node in this tree has the specified
/// property.
bool TreePatternNode::TreeHasProperty(SDNP Property,
@@ -2416,44 +2397,41 @@ bool TreePatternNode::TreeHasProperty(SDNP Property,
if (NodeHasProperty(Property, CGP))
return true;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- if (getChild(i)->TreeHasProperty(Property, CGP))
+ if (getChild(i).TreeHasProperty(Property, CGP))
return true;
return false;
}
/// isCommutativeIntrinsic - Return true if the node corresponds to a
/// commutative intrinsic.
-bool
-TreePatternNode::isCommutativeIntrinsic(const CodeGenDAGPatterns &CDP) const {
+bool TreePatternNode::isCommutativeIntrinsic(
+ const CodeGenDAGPatterns &CDP) const {
if (const CodeGenIntrinsic *Int = getIntrinsicInfo(CDP))
return Int->isCommutative;
return false;
}
-static bool isOperandClass(const TreePatternNode *N, StringRef Class) {
- if (!N->isLeaf())
- return N->getOperator()->isSubClassOf(Class);
+static bool isOperandClass(const TreePatternNode &N, StringRef Class) {
+ if (!N.isLeaf())
+ return N.getOperator()->isSubClassOf(Class);
- DefInit *DI = dyn_cast<DefInit>(N->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(N.getLeafValue());
if (DI && DI->getDef()->isSubClassOf(Class))
return true;
return false;
}
-static void emitTooManyOperandsError(TreePattern &TP,
- StringRef InstName,
- unsigned Expected,
- unsigned Actual) {
+static void emitTooManyOperandsError(TreePattern &TP, StringRef InstName,
+ unsigned Expected, unsigned Actual) {
TP.error("Instruction '" + InstName + "' was provided " + Twine(Actual) +
" operands but expected only " + Twine(Expected) + "!");
}
-static void emitTooFewOperandsError(TreePattern &TP,
- StringRef InstName,
+static void emitTooFewOperandsError(TreePattern &TP, StringRef InstName,
unsigned Actual) {
- TP.error("Instruction '" + InstName +
- "' expects more than the provided " + Twine(Actual) + " operands!");
+ TP.error("Instruction '" + InstName + "' expects more than the provided " +
+ Twine(Actual) + " operands!");
}
/// ApplyTypeConstraints - Apply all of the type constraints relevant to
@@ -2469,9 +2447,9 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// If it's a regclass or something else known, include the type.
bool MadeChange = false;
for (unsigned i = 0, e = Types.size(); i != e; ++i)
- MadeChange |= UpdateNodeType(i, getImplicitType(DI->getDef(), i,
- NotRegisters,
- !hasName(), TP), TP);
+ MadeChange |= UpdateNodeType(
+ i, getImplicitType(DI->getDef(), i, NotRegisters, !hasName(), TP),
+ TP);
return MadeChange;
}
@@ -2528,15 +2506,15 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
}
// Apply type info to the intrinsic ID.
- MadeChange |= getChild(0)->UpdateNodeType(0, MVT::iPTR, TP);
+ MadeChange |= getChild(0).UpdateNodeType(0, MVT::iPTR, TP);
- for (unsigned i = 0, e = getNumChildren()-1; i != e; ++i) {
- MadeChange |= getChild(i+1)->ApplyTypeConstraints(TP, NotRegisters);
+ for (unsigned i = 0, e = getNumChildren() - 1; i != e; ++i) {
+ MadeChange |= getChild(i + 1).ApplyTypeConstraints(TP, NotRegisters);
MVT::SimpleValueType OpVT =
getValueType(Int->IS.ParamTys[i]->getValueAsDef("VT"));
- assert(getChild(i + 1)->getNumTypes() == 1 && "Unhandled case");
- MadeChange |= getChild(i + 1)->UpdateNodeType(0, OpVT, TP);
+ assert(getChild(i + 1).getNumTypes() == 1 && "Unhandled case");
+ MadeChange |= getChild(i + 1).UpdateNodeType(0, OpVT, TP);
}
return MadeChange;
}
@@ -2554,22 +2532,22 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
bool MadeChange = false;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters);
- MadeChange |= NI.ApplyTypeConstraints(this, TP);
+ MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters);
+ MadeChange |= NI.ApplyTypeConstraints(*this, TP);
return MadeChange;
}
if (getOperator()->isSubClassOf("Instruction")) {
const DAGInstruction &Inst = CDP.getInstruction(getOperator());
CodeGenInstruction &InstInfo =
- CDP.getTargetInfo().getInstruction(getOperator());
+ CDP.getTargetInfo().getInstruction(getOperator());
bool MadeChange = false;
// Apply the result types to the node, these come from the things in the
// (outs) list of the instruction.
- unsigned NumResultsToAdd = std::min(InstInfo.Operands.NumDefs,
- Inst.getNumResults());
+ unsigned NumResultsToAdd =
+ std::min(InstInfo.Operands.NumDefs, Inst.getNumResults());
for (unsigned ResNo = 0; ResNo != NumResultsToAdd; ++ResNo)
MadeChange |= UpdateNodeTypeFromInst(ResNo, Inst.getResult(ResNo), TP);
@@ -2581,7 +2559,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// FIXME: Generalize to multiple possible types and multiple possible
// ImplicitDefs.
MVT::SimpleValueType VT =
- InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo());
+ InstInfo.HasOneImplicitDefWithKnownVT(CDP.getTargetInfo());
if (VT != MVT::Other)
MadeChange |= UpdateNodeType(ResNo, VT, TP);
@@ -2590,9 +2568,9 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// If this is an INSERT_SUBREG, constrain the source and destination VTs to
// be the same.
if (getOperator()->getName() == "INSERT_SUBREG") {
- assert(getChild(0)->getNumTypes() == 1 && "FIXME: Unhandled");
- MadeChange |= UpdateNodeType(0, getChild(0)->getExtType(0), TP);
- MadeChange |= getChild(0)->UpdateNodeType(0, getExtType(0), TP);
+ assert(getChild(0).getNumTypes() == 1 && "FIXME: Unhandled");
+ MadeChange |= UpdateNodeType(0, getChild(0).getExtType(0), TP);
+ MadeChange |= getChild(0).UpdateNodeType(0, getExtType(0), TP);
} else if (getOperator()->getName() == "REG_SEQUENCE") {
// We need to do extra, custom typechecking for REG_SEQUENCE since it is
// variadic.
@@ -2614,7 +2592,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
}
for (unsigned I = 1; I < NChild; I += 2) {
- TreePatternNode *SubIdxChild = getChild(I + 1);
+ TreePatternNode &SubIdxChild = getChild(I + 1);
if (!isOperandClass(SubIdxChild, "SubRegIndex")) {
TP.error("REG_SEQUENCE requires a SubRegIndex for operand " +
Twine(I + 1) + "!");
@@ -2636,7 +2614,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// the pattern was intended to override A or skip it.
unsigned NonOverridableOperands = NumFixedOperands;
while (NonOverridableOperands > NumResults &&
- CDP.operandHasDefault(InstInfo.Operands[NonOverridableOperands-1].Rec))
+ CDP.operandHasDefault(
+ InstInfo.Operands[NonOverridableOperands - 1].Rec))
--NonOverridableOperands;
unsigned ChildNo = 0;
@@ -2658,8 +2637,8 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
return false;
}
- TreePatternNode *Child = getChild(ChildNo++);
- unsigned ChildResNo = 0; // Instructions always use res #0 of their op.
+ TreePatternNode *Child = &getChild(ChildNo++);
+ unsigned ChildResNo = 0; // Instructions always use res #0 of their op.
// If the operand has sub-operands, they may be provided by distinct
// child patterns, so attempt to match each sub-operand separately.
@@ -2672,8 +2651,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
if (Child->getNumMIResults(CDP) < NumArgs) {
// Match first sub-operand against the child we already have.
Record *SubRec = cast<DefInit>(MIOpInfo->getArg(0))->getDef();
- MadeChange |=
- Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP);
+ MadeChange |= Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP);
// And the remaining sub-operands against subsequent children.
for (unsigned Arg = 1; Arg < NumArgs; ++Arg) {
@@ -2682,11 +2660,11 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
getNumChildren());
return false;
}
- Child = getChild(ChildNo++);
+ Child = &getChild(ChildNo++);
SubRec = cast<DefInit>(MIOpInfo->getArg(Arg))->getDef();
MadeChange |=
- Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP);
+ Child->UpdateNodeTypeFromInst(ChildResNo, SubRec, TP);
}
continue;
}
@@ -2699,13 +2677,13 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
}
if (!InstInfo.Operands.isVariadic && ChildNo != getNumChildren()) {
- emitTooManyOperandsError(TP, getOperator()->getName(),
- ChildNo, getNumChildren());
+ emitTooManyOperandsError(TP, getOperator()->getName(), ChildNo,
+ getNumChildren());
return false;
}
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters);
+ MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters);
return MadeChange;
}
@@ -2729,7 +2707,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
}
for (unsigned i = 0; i < getNumChildren(); ++i)
- MadeChange |= getChild(i)->ApplyTypeConstraints(TP, NotRegisters);
+ MadeChange |= getChild(i).ApplyTypeConstraints(TP, NotRegisters);
return MadeChange;
}
@@ -2743,23 +2721,22 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
return false;
}
- bool MadeChange = getChild(0)->ApplyTypeConstraints(TP, NotRegisters);
+ bool MadeChange = getChild(0).ApplyTypeConstraints(TP, NotRegisters);
return MadeChange;
}
/// OnlyOnRHSOfCommutative - Return true if this value is only allowed on the
/// RHS of a commutative operation, not the on LHS.
-static bool OnlyOnRHSOfCommutative(TreePatternNode *N) {
- if (!N->isLeaf() && N->getOperator()->getName() == "imm")
+static bool OnlyOnRHSOfCommutative(TreePatternNode &N) {
+ if (!N.isLeaf() && N.getOperator()->getName() == "imm")
return true;
- if (N->isLeaf() && isa<IntInit>(N->getLeafValue()))
+ if (N.isLeaf() && isa<IntInit>(N.getLeafValue()))
return true;
if (isImmAllOnesAllZerosMatch(N))
return true;
return false;
}
-
/// canPatternMatch - If it is impossible for this pattern to match on this
/// target, fill in Reason and return false. Otherwise, return true. This is
/// used as a sanity check for .td files (to prevent people from writing stuff
@@ -2767,10 +2744,11 @@ static bool OnlyOnRHSOfCommutative(TreePatternNode *N) {
/// generating stuff that is useless.
bool TreePatternNode::canPatternMatch(std::string &Reason,
const CodeGenDAGPatterns &CDP) {
- if (isLeaf()) return true;
+ if (isLeaf())
+ return true;
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
- if (!getChild(i)->canPatternMatch(Reason, CDP))
+ if (!getChild(i).canPatternMatch(Reason, CDP))
return false;
// If this is an intrinsic, handle cases that would make it not match. For
@@ -2790,11 +2768,12 @@ bool TreePatternNode::canPatternMatch(std::string &Reason,
if (NodeInfo.hasProperty(SDNPCommutative) || isCommIntrinsic) {
// Scan all of the operands of the node and make sure that only the last one
// is a constant node, unless the RHS also is.
- if (!OnlyOnRHSOfCommutative(getChild(getNumChildren()-1))) {
+ if (!OnlyOnRHSOfCommutative(getChild(getNumChildren() - 1))) {
unsigned Skip = isCommIntrinsic ? 1 : 0; // First operand is intrinsic id.
- for (unsigned i = Skip, e = getNumChildren()-1; i != e; ++i)
+ for (unsigned i = Skip, e = getNumChildren() - 1; i != e; ++i)
if (OnlyOnRHSOfCommutative(getChild(i))) {
- Reason="Immediate value must be on the RHS of commutative operators!";
+ Reason =
+ "Immediate value must be on the RHS of commutative operators!";
return false;
}
}
@@ -2808,17 +2787,17 @@ bool TreePatternNode::canPatternMatch(std::string &Reason,
//
TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
- CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp),
- isInputPattern(isInput), HasError(false),
- Infer(*this) {
+ CodeGenDAGPatterns &cdp)
+ : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
+ Infer(*this) {
for (Init *I : RawPat->getValues())
Trees.push_back(ParseTreePattern(I, ""));
}
TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
- CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp),
- isInputPattern(isInput), HasError(false),
- Infer(*this) {
+ CodeGenDAGPatterns &cdp)
+ : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
+ Infer(*this) {
Trees.push_back(ParseTreePattern(Pat, ""));
}
@@ -2839,15 +2818,15 @@ void TreePattern::error(const Twine &Msg) {
void TreePattern::ComputeNamedNodes() {
for (TreePatternNodePtr &Tree : Trees)
- ComputeNamedNodes(Tree.get());
+ ComputeNamedNodes(*Tree);
}
-void TreePattern::ComputeNamedNodes(TreePatternNode *N) {
- if (!N->getName().empty())
- NamedNodes[N->getName()].push_back(N);
+void TreePattern::ComputeNamedNodes(TreePatternNode &N) {
+ if (!N.getName().empty())
+ NamedNodes[N.getName()].push_back(&N);
- for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
- ComputeNamedNodes(N->getChild(i));
+ for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i)
+ ComputeNamedNodes(N.getChild(i));
}
TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
@@ -2861,9 +2840,9 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
/// (foo GPR, imm) -> (foo GPR, (imm))
if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrags"))
return ParseTreePattern(
- DagInit::get(DI, nullptr,
- std::vector<std::pair<Init*, StringInit*> >()),
- OpName);
+ DagInit::get(DI, nullptr,
+ std::vector<std::pair<Init *, StringInit *>>()),
+ OpName);
// Input argument?
TreePatternNodePtr Res = makeIntrusiveRefCnt<TreePatternNode>(DI, 1);
@@ -2943,8 +2922,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
!Operator->isSubClassOf("SDNodeXForm") &&
!Operator->isSubClassOf("Intrinsic") &&
!Operator->isSubClassOf("ComplexPattern") &&
- Operator->getName() != "set" &&
- Operator->getName() != "implicit")
+ Operator->getName() != "set" && Operator->getName() != "implicit")
error("Unrecognized node '" + Operator->getName() + "'!");
// Check to see if this is something that is illegal in an input pattern.
@@ -2956,20 +2934,16 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
if (Operator->isSubClassOf("Intrinsic"))
error("Cannot use '" + Operator->getName() + "' in an output pattern!");
- if (Operator->isSubClassOf("SDNode") &&
- Operator->getName() != "imm" &&
- Operator->getName() != "timm" &&
- Operator->getName() != "fpimm" &&
+ if (Operator->isSubClassOf("SDNode") && Operator->getName() != "imm" &&
+ Operator->getName() != "timm" && Operator->getName() != "fpimm" &&
Operator->getName() != "tglobaltlsaddr" &&
Operator->getName() != "tconstpool" &&
Operator->getName() != "tjumptable" &&
Operator->getName() != "tframeindex" &&
Operator->getName() != "texternalsym" &&
Operator->getName() != "tblockaddress" &&
- Operator->getName() != "tglobaladdr" &&
- Operator->getName() != "bb" &&
- Operator->getName() != "vt" &&
- Operator->getName() != "mcsym")
+ Operator->getName() != "tglobaladdr" && Operator->getName() != "bb" &&
+ Operator->getName() != "vt" && Operator->getName() != "mcsym")
error("Cannot use '" + Operator->getName() + "' in an output pattern!");
}
@@ -2979,8 +2953,8 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i)
Children.push_back(ParseTreePattern(Dag->getArg(i), Dag->getArgNameStr(i)));
- // Get the actual number of results before Operator is converted to an intrinsic
- // node (which is hard-coded to have either zero or one result).
+ // Get the actual number of results before Operator is converted to an
+ // intrinsic node (which is hard-coded to have either zero or one result).
unsigned NumResults = GetNumNodeResults(Operator, CDP);
// If the operator is an intrinsic, then this is just syntactic sugar for
@@ -2988,7 +2962,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit,
// convert the intrinsic name to a number.
if (Operator->isSubClassOf("Intrinsic")) {
const CodeGenIntrinsic &Int = getDAGPatterns().getIntrinsic(Operator);
- unsigned IID = getDAGPatterns().getIntrinsicID(Operator)+1;
+ unsigned IID = getDAGPatterns().getIntrinsicID(Operator) + 1;
// If this intrinsic returns void, it must have side-effects and thus a
// chain.
@@ -3057,7 +3031,7 @@ static bool SimplifyTree(TreePatternNodePtr &N) {
if (N->getOperator()->getName() == "bitconvert" &&
N->getExtType(0).isValueTypeByHwMode(false) &&
!N->getExtType(0).empty() &&
- N->getExtType(0) == N->getChild(0)->getExtType(0) &&
+ N->getExtType(0) == N->getChild(0).getExtType(0) &&
N->getName().empty()) {
N = N->getChildShared(0);
SimplifyTree(N);
@@ -3072,13 +3046,11 @@ static bool SimplifyTree(TreePatternNodePtr &N) {
return MadeChange;
}
-
-
/// InferAllTypes - Infer/propagate as many types throughout the expression
/// patterns as possible. Return true if all types are inferred, false
/// otherwise. Flags an error if a type contradiction is found.
-bool TreePattern::
-InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
+bool TreePattern::InferAllTypes(
+ const StringMap<SmallVector<TreePatternNode *, 1>> *InNamedTypes) {
if (NamedNodes.empty())
ComputeNamedNodes();
@@ -3092,7 +3064,7 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
// If there are constraints on our named nodes, apply them.
for (auto &Entry : NamedNodes) {
- SmallVectorImpl<TreePatternNode*> &Nodes = Entry.second;
+ SmallVectorImpl<TreePatternNode *> &Nodes = Entry.second;
// If we have input named node types, propagate their types to the named
// values here.
@@ -3103,8 +3075,8 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
return true;
}
- const SmallVectorImpl<TreePatternNode*> &InNodes =
- InNamedTypes->find(Entry.getKey())->second;
+ const SmallVectorImpl<TreePatternNode *> &InNodes =
+ InNamedTypes->find(Entry.getKey())->second;
// The input types should be fully resolved by now.
for (TreePatternNode *Node : Nodes) {
@@ -3120,19 +3092,18 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
continue;
}
- assert(Node->getNumTypes() == 1 &&
- InNodes[0]->getNumTypes() == 1 &&
+ assert(Node->getNumTypes() == 1 && InNodes[0]->getNumTypes() == 1 &&
"FIXME: cannot name multiple result nodes yet");
- MadeChange |= Node->UpdateNodeType(0, InNodes[0]->getExtType(0),
- *this);
+ MadeChange |=
+ Node->UpdateNodeType(0, InNodes[0]->getExtType(0), *this);
}
}
// If there are multiple nodes with the same name, they must all have the
// same type.
if (Entry.second.size() > 1) {
- for (unsigned i = 0, e = Nodes.size()-1; i != e; ++i) {
- TreePatternNode *N1 = Nodes[i], *N2 = Nodes[i+1];
+ for (unsigned i = 0, e = Nodes.size() - 1; i != e; ++i) {
+ TreePatternNode *N1 = Nodes[i], *N2 = Nodes[i + 1];
assert(N1->getNumTypes() == 1 && N2->getNumTypes() == 1 &&
"FIXME: cannot name multiple result nodes yet");
@@ -3190,7 +3161,7 @@ CodeGenDAGPatterns::CodeGenDAGPatterns(RecordKeeper &R,
ParsePatternFragments();
ParseDefaultOperands();
ParseInstructions();
- ParsePatternFragments(/*OutFrags*/true);
+ ParsePatternFragments(/*OutFrags*/ true);
ParsePatterns();
// Generate variants. For example, commutative patterns can match
@@ -3221,7 +3192,7 @@ Record *CodeGenDAGPatterns::getSDNodeNamed(StringRef Name) const {
// Parse all of the SDNode definitions for the target, populating SDNodes.
void CodeGenDAGPatterns::ParseNodeInfo() {
- std::vector<Record*> Nodes = Records.getAllDerivedDefinitions("SDNode");
+ std::vector<Record *> Nodes = Records.getAllDerivedDefinitions("SDNode");
const CodeGenHwModes &CGH = getTargetInfo().getHwModes();
while (!Nodes.empty()) {
@@ -3231,15 +3202,16 @@ void CodeGenDAGPatterns::ParseNodeInfo() {
}
// Get the builtin intrinsic nodes.
- intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void");
- intrinsic_w_chain_sdnode = getSDNodeNamed("intrinsic_w_chain");
+ intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void");
+ intrinsic_w_chain_sdnode = getSDNodeNamed("intrinsic_w_chain");
intrinsic_wo_chain_sdnode = getSDNodeNamed("intrinsic_wo_chain");
}
/// ParseNodeTransforms - Parse all SDNodeXForm instances into the SDNodeXForms
/// map, and emit them to the file as functions.
void CodeGenDAGPatterns::ParseNodeTransforms() {
- std::vector<Record*> Xforms = Records.getAllDerivedDefinitions("SDNodeXForm");
+ std::vector<Record *> Xforms =
+ Records.getAllDerivedDefinitions("SDNodeXForm");
while (!Xforms.empty()) {
Record *XFormNode = Xforms.back();
Record *SDNode = XFormNode->getValueAsDef("Opcode");
@@ -3252,21 +3224,22 @@ void CodeGenDAGPatterns::ParseNodeTransforms() {
}
void CodeGenDAGPatterns::ParseComplexPatterns() {
- std::vector<Record*> AMs = Records.getAllDerivedDefinitions("ComplexPattern");
+ std::vector<Record *> AMs =
+ Records.getAllDerivedDefinitions("ComplexPattern");
while (!AMs.empty()) {
ComplexPatterns.insert(std::make_pair(AMs.back(), AMs.back()));
AMs.pop_back();
}
}
-
/// ParsePatternFragments - Parse all of the PatFrag definitions in the .td
/// file, building up the PatternFragments map. After we've collected them all,
/// inline fragments together as necessary, so that there are no references left
/// inside a pattern fragment to a pattern fragment.
///
void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
- std::vector<Record*> Fragments = Records.getAllDerivedDefinitions("PatFrags");
+ std::vector<Record *> Fragments =
+ Records.getAllDerivedDefinitions("PatFrags");
// First step, parse all of the fragments.
for (Record *Frag : Fragments) {
@@ -3274,10 +3247,9 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
continue;
ListInit *LI = Frag->getValueAsListInit("Fragments");
- TreePattern *P =
- (PatternFragments[Frag] = std::make_unique<TreePattern>(
- Frag, LI, !Frag->isSubClassOf("OutPatFrag"),
- *this)).get();
+ TreePattern *P = (PatternFragments[Frag] = std::make_unique<TreePattern>(
+ Frag, LI, !Frag->isSubClassOf("OutPatFrag"), *this))
+ .get();
// Validate the argument list, converting it to set, to discard duplicates.
std::vector<std::string> &Args = P->getArgList();
@@ -3294,10 +3266,9 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
DefInit *OpsOp = dyn_cast<DefInit>(OpsList->getOperator());
// Special cases: ops == outs == ins. Different names are used to
// improve readability.
- if (!OpsOp ||
- (OpsOp->getDef()->getName() != "ops" &&
- OpsOp->getDef()->getName() != "outs" &&
- OpsOp->getDef()->getName() != "ins"))
+ if (!OpsOp || (OpsOp->getDef()->getName() != "ops" &&
+ OpsOp->getDef()->getName() != "outs" &&
+ OpsOp->getDef()->getName() != "ins"))
P->error("Operands list should start with '(ops ... '!");
// Copy over the arguments.
@@ -3323,7 +3294,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
// If there is a node transformation corresponding to this, keep track of
// it.
Record *Transform = Frag->getValueAsDef("OperandTransform");
- if (!getSDNodeTransform(Transform).second.empty()) // not noop xform?
+ if (!getSDNodeTransform(Transform).second.empty()) // not noop xform?
for (const auto &T : P->getTrees())
T->setTransformFn(Transform);
}
@@ -3354,7 +3325,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) {
}
void CodeGenDAGPatterns::ParseDefaultOperands() {
- std::vector<Record*> DefaultOps;
+ std::vector<Record *> DefaultOps;
DefaultOps = Records.getAllDerivedDefinitions("OperandWithDefaultOps");
// Find some SDNode.
@@ -3366,10 +3337,10 @@ void CodeGenDAGPatterns::ParseDefaultOperands() {
// Clone the DefaultInfo dag node, changing the operator from 'ops' to
// SomeSDnode so that we can parse this.
- std::vector<std::pair<Init*, StringInit*> > Ops;
+ std::vector<std::pair<Init *, StringInit *>> Ops;
for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op)
- Ops.push_back(std::make_pair(DefaultInfo->getArg(op),
- DefaultInfo->getArgName(op)));
+ Ops.push_back(
+ std::make_pair(DefaultInfo->getArg(op), DefaultInfo->getArgName(op)));
DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops);
// Create a TreePattern to parse this.
@@ -3480,11 +3451,11 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs(
if (Pat->getOperator()->getName() == "implicit") {
for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) {
- TreePatternNode *Dest = Pat->getChild(i);
- if (!Dest->isLeaf())
+ TreePatternNode &Dest = Pat->getChild(i);
+ if (!Dest.isLeaf())
I.error("implicitly defined value should be a register!");
- DefInit *Val = dyn_cast<DefInit>(Dest->getLeafValue());
+ DefInit *Val = dyn_cast<DefInit>(Dest.getLeafValue());
if (!Val || !Val->getDef()->isSubClassOf("Register"))
I.error("implicitly defined value should be a register!");
if (Val)
@@ -3497,7 +3468,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs(
// If this is not a set, verify that the children nodes are not void typed,
// and recurse.
for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) {
- if (Pat->getChild(i)->getNumTypes() == 0)
+ if (Pat->getChild(i).getNumTypes() == 0)
I.error("Cannot have void nodes inside of patterns!");
FindPatternInputsAndOutputs(I, Pat->getChildShared(i), InstInputs,
InstResults, InstImpResults);
@@ -3520,7 +3491,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs(
I.error("Cannot specify a transform function on a set node!");
// Check the set destinations.
- unsigned NumDests = Pat->getNumChildren()-1;
+ unsigned NumDests = Pat->getNumChildren() - 1;
for (unsigned i = 0; i != NumDests; ++i) {
TreePatternNodePtr Dest = Pat->getChildShared(i);
// For set destinations we also must resolve fragments here.
@@ -3565,6 +3536,7 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs(
class InstAnalyzer {
const CodeGenDAGPatterns &CDP;
+
public:
bool hasSideEffects;
bool mayStore;
@@ -3574,63 +3546,71 @@ public:
bool hasChain;
InstAnalyzer(const CodeGenDAGPatterns &cdp)
- : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
- isBitcast(false), isVariadic(false), hasChain(false) {}
+ : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
+ isBitcast(false), isVariadic(false), hasChain(false) {}
void Analyze(const PatternToMatch &Pat) {
- const TreePatternNode *N = Pat.getSrcPattern();
+ const TreePatternNode &N = Pat.getSrcPattern();
AnalyzeNode(N);
// These properties are detected only on the root node.
isBitcast = IsNodeBitcast(N);
}
private:
- bool IsNodeBitcast(const TreePatternNode *N) const {
+ bool IsNodeBitcast(const TreePatternNode &N) const {
if (hasSideEffects || mayLoad || mayStore || isVariadic)
return false;
- if (N->isLeaf())
+ if (N.isLeaf())
return false;
- if (N->getNumChildren() != 1 || !N->getChild(0)->isLeaf())
+ if (N.getNumChildren() != 1 || !N.getChild(0).isLeaf())
return false;
- if (N->getOperator()->isSubClassOf("ComplexPattern"))
+ if (N.getOperator()->isSubClassOf("ComplexPattern"))
return false;
- const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N->getOperator());
+ const SDNodeInfo &OpInfo = CDP.getSDNodeInfo(N.getOperator());
if (OpInfo.getNumResults() != 1 || OpInfo.getNumOperands() != 1)
return false;
return OpInfo.getEnumName() == "ISD::BITCAST";
}
public:
- void AnalyzeNode(const TreePatternNode *N) {
- if (N->isLeaf()) {
- if (DefInit *DI = dyn_cast<DefInit>(N->getLeafValue())) {
+ void AnalyzeNode(const TreePatternNode &N) {
+ if (N.isLeaf()) {
+ if (DefInit *DI = dyn_cast<DefInit>(N.getLeafValue())) {
Record *LeafRec = DI->getDef();
// Handle ComplexPattern leaves.
if (LeafRec->isSubClassOf("ComplexPattern")) {
const ComplexPattern &CP = CDP.getComplexPattern(LeafRec);
- if (CP.hasProperty(SDNPMayStore)) mayStore = true;
- if (CP.hasProperty(SDNPMayLoad)) mayLoad = true;
- if (CP.hasProperty(SDNPSideEffect)) hasSideEffects = true;
+ if (CP.hasProperty(SDNPMayStore))
+ mayStore = true;
+ if (CP.hasProperty(SDNPMayLoad))
+ mayLoad = true;
+ if (CP.hasProperty(SDNPSideEffect))
+ hasSideEffects = true;
}
}
return;
}
// Analyze children.
- for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
- AnalyzeNode(N->getChild(i));
+ for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i)
+ AnalyzeNode(N.getChild(i));
// Notice properties of the node.
- if (N->NodeHasProperty(SDNPMayStore, CDP)) mayStore = true;
- if (N->NodeHasProperty(SDNPMayLoad, CDP)) mayLoad = true;
- if (N->NodeHasProperty(SDNPSideEffect, CDP)) hasSideEffects = true;
- if (N->NodeHasProperty(SDNPVariadic, CDP)) isVariadic = true;
- if (N->NodeHasProperty(SDNPHasChain, CDP)) hasChain = true;
-
- if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) {
+ if (N.NodeHasProperty(SDNPMayStore, CDP))
+ mayStore = true;
+ if (N.NodeHasProperty(SDNPMayLoad, CDP))
+ mayLoad = true;
+ if (N.NodeHasProperty(SDNPSideEffect, CDP))
+ hasSideEffects = true;
+ if (N.NodeHasProperty(SDNPVariadic, CDP))
+ isVariadic = true;
+ if (N.NodeHasProperty(SDNPHasChain, CDP))
+ hasChain = true;
+
+ if (const CodeGenIntrinsic *IntInfo = N.getIntrinsicInfo(CDP)) {
ModRefInfo MR = IntInfo->ME.getModRef();
// If this is an intrinsic, analyze it.
if (isRefSet(MR))
@@ -3645,17 +3625,15 @@ public:
hasSideEffects = true;
}
}
-
};
static bool InferFromPattern(CodeGenInstruction &InstInfo,
- const InstAnalyzer &PatInfo,
- Record *PatDef) {
+ const InstAnalyzer &PatInfo, Record *PatDef) {
bool Error = false;
// Remember where InstInfo got its flags.
if (InstInfo.hasUndefFlags())
- InstInfo.InferredFrom = PatDef;
+ InstInfo.InferredFrom = PatDef;
// Check explicitly set flags for consistency.
if (InstInfo.hasSideEffects != PatInfo.hasSideEffects &&
@@ -3666,14 +3644,14 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo,
if (!InstInfo.hasSideEffects) {
Error = true;
PrintError(PatDef->getLoc(), "Pattern doesn't match hasSideEffects = " +
- Twine(InstInfo.hasSideEffects));
+ Twine(InstInfo.hasSideEffects));
}
}
if (InstInfo.mayStore != PatInfo.mayStore && !InstInfo.mayStore_Unset) {
Error = true;
- PrintError(PatDef->getLoc(), "Pattern doesn't match mayStore = " +
- Twine(InstInfo.mayStore));
+ PrintError(PatDef->getLoc(),
+ "Pattern doesn't match mayStore = " + Twine(InstInfo.mayStore));
}
if (InstInfo.mayLoad != PatInfo.mayLoad && !InstInfo.mayLoad_Unset) {
@@ -3681,8 +3659,8 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo,
// Some targets translate immediates to loads.
if (!InstInfo.mayLoad) {
Error = true;
- PrintError(PatDef->getLoc(), "Pattern doesn't match mayLoad = " +
- Twine(InstInfo.mayLoad));
+ PrintError(PatDef->getLoc(),
+ "Pattern doesn't match mayLoad = " + Twine(InstInfo.mayLoad));
}
}
@@ -3712,11 +3690,13 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo,
/// null_frag operator.
static bool hasNullFragReference(DagInit *DI) {
DefInit *OpDef = dyn_cast<DefInit>(DI->getOperator());
- if (!OpDef) return false;
+ if (!OpDef)
+ return false;
Record *Operator = OpDef->getDef();
// If this is the null fragment, return true.
- if (Operator->getName() == "null_frag") return true;
+ if (Operator->getName() == "null_frag")
+ return true;
// If any of the arguments reference the null fragment, return true.
for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) {
if (auto Arg = dyn_cast<DefInit>(DI->getArg(i)))
@@ -3743,20 +3723,19 @@ static bool hasNullFragReference(ListInit *LI) {
}
/// Get all the instructions in a tree.
-static void
-getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl<Record*> &Instrs) {
- if (Tree->isLeaf())
+static void getInstructionsInTree(TreePatternNode &Tree,
+ SmallVectorImpl<Record *> &Instrs) {
+ if (Tree.isLeaf())
return;
- if (Tree->getOperator()->isSubClassOf("Instruction"))
- Instrs.push_back(Tree->getOperator());
- for (unsigned i = 0, e = Tree->getNumChildren(); i != e; ++i)
- getInstructionsInTree(Tree->getChild(i), Instrs);
+ if (Tree.getOperator()->isSubClassOf("Instruction"))
+ Instrs.push_back(Tree.getOperator());
+ for (unsigned i = 0, e = Tree.getNumChildren(); i != e; ++i)
+ getInstructionsInTree(Tree.getChild(i), Instrs);
}
/// Check the class of a pattern leaf node against the instruction operand it
/// represents.
-static bool checkOperandClass(CGIOperandList::OperandInfo &OI,
- Record *Leaf) {
+static bool checkOperandClass(CGIOperandList::OperandInfo &OI, Record *Leaf) {
if (OI.Rec == Leaf)
return true;
@@ -3772,8 +3751,9 @@ static bool checkOperandClass(CGIOperandList::OperandInfo &OI,
return false;
}
-void CodeGenDAGPatterns::parseInstructionPattern(
- CodeGenInstruction &CGI, ListInit *Pat, DAGInstMap &DAGInsts) {
+void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
+ ListInit *Pat,
+ DAGInstMap &DAGInsts) {
assert(!DAGInsts.count(CGI.TheDef) && "Instruction already parsed!");
@@ -3789,7 +3769,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
MapVector<std::string, TreePatternNodePtr, std::map<std::string, unsigned>>
InstResults;
- std::vector<Record*> InstImpResults;
+ std::vector<Record *> InstImpResults;
// Verify that the top-level forms in the instruction are of void type, and
// fill in the InstResults map.
@@ -3805,8 +3785,8 @@ void CodeGenDAGPatterns::parseInstructionPattern(
Pat->getExtType(k).writeToStream(OS);
}
I.error("Top-level forms in instruction pattern should have"
- " void types, has types " +
- OS.str());
+ " void types, has types " +
+ OS.str());
}
// Find inputs and outputs, and verify the structure of the uses/defs.
@@ -3823,7 +3803,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
assert(I.getArgList().empty() && "Args list should still be empty here!");
// Check that all of the results occur first in the list.
- std::vector<Record*> Results;
+ std::vector<Record *> Results;
std::vector<unsigned> ResultIndices;
SmallVector<TreePatternNodePtr, 2> ResNodes;
for (unsigned i = 0; i != NumResults; ++i) {
@@ -3850,8 +3830,9 @@ void CodeGenDAGPatterns::parseInstructionPattern(
Record *R = cast<DefInit>(RNode->getLeafValue())->getDef();
ResNodes.push_back(std::move(RNode));
if (!R)
- I.error("Operand $" + OpName + " should be a set destination: all "
- "outputs must occur before inputs in operand list!");
+ I.error("Operand $" + OpName +
+ " should be a set destination: all "
+ "outputs must occur before inputs in operand list!");
if (!checkOperandClass(CGI.Operands[i], R))
I.error("Operand $" + OpName + " class mismatch!");
@@ -3868,7 +3849,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
// Loop over the inputs next.
std::vector<TreePatternNodePtr> ResultNodeOperands;
- std::vector<Record*> Operands;
+ std::vector<Record *> Operands;
for (unsigned i = NumResults, e = CGI.Operands.size(); i != e; ++i) {
CGIOperandList::OperandInfo &Op = CGI.Operands[i];
const std::string &OpName = Op.Name;
@@ -3885,16 +3866,17 @@ void CodeGenDAGPatterns::parseInstructionPattern(
continue;
}
I.error("Operand $" + OpName +
- " does not appear in the instruction pattern");
+ " does not appear in the instruction pattern");
}
TreePatternNodePtr InVal = InstInputs[OpName];
- InstInputs.erase(OpName); // It occurred, remove from map.
+ InstInputs.erase(OpName); // It occurred, remove from map.
if (InVal->isLeaf() && isa<DefInit>(InVal->getLeafValue())) {
Record *InRec = cast<DefInit>(InVal->getLeafValue())->getDef();
if (!checkOperandClass(Op, InRec))
- I.error("Operand $" + OpName + "'s register class disagrees"
- " between the operand and pattern");
+ I.error("Operand $" + OpName +
+ "'s register class disagrees"
+ " between the operand and pattern");
}
Operands.push_back(Op.Rec);
@@ -3910,7 +3892,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
std::vector<TreePatternNodePtr> Children;
Children.push_back(OpNode);
OpNode = makeIntrusiveRefCnt<TreePatternNode>(Xform, std::move(Children),
- OpNode->getNumTypes());
+ OpNode->getNumTypes());
}
ResultNodeOperands.push_back(std::move(OpNode));
@@ -3935,8 +3917,8 @@ void CodeGenDAGPatterns::parseInstructionPattern(
TreePatternNodePtr Pattern = I.getTree(0);
TreePatternNodePtr SrcPattern;
if (Pattern->getOperator()->getName() == "set") {
- SrcPattern = Pattern->getChild(Pattern->getNumChildren()-1)->clone();
- } else{
+ SrcPattern = Pattern->getChild(Pattern->getNumChildren() - 1).clone();
+ } else {
// Not a set (store or something?)
SrcPattern = Pattern;
}
@@ -3954,7 +3936,8 @@ void CodeGenDAGPatterns::parseInstructionPattern(
/// any fragments involved. This populates the Instructions list with fully
/// resolved instructions.
void CodeGenDAGPatterns::ParseInstructions() {
- std::vector<Record*> Instrs = Records.getAllDerivedDefinitions("Instruction");
+ std::vector<Record *> Instrs =
+ Records.getAllDerivedDefinitions("Instruction");
for (Record *Instr : Instrs) {
ListInit *LI = nullptr;
@@ -3969,8 +3952,8 @@ void CodeGenDAGPatterns::ParseInstructions() {
// is from a multiclass expansion w/ a SDPatternOperator passed in as
// null_frag.
if (!LI || LI->empty() || hasNullFragReference(LI)) {
- std::vector<Record*> Results;
- std::vector<Record*> Operands;
+ std::vector<Record *> Results;
+ std::vector<Record *> Operands;
CodeGenInstruction &InstInfo = Target.getInstruction(Instr);
@@ -3980,14 +3963,15 @@ void CodeGenDAGPatterns::ParseInstructions() {
// The rest are inputs.
for (unsigned j = InstInfo.Operands.NumDefs,
- e = InstInfo.Operands.size(); j < e; ++j)
+ e = InstInfo.Operands.size();
+ j < e; ++j)
Operands.push_back(InstInfo.Operands[j].Rec);
}
// Create and insert the instruction.
Instructions.try_emplace(Instr, std::move(Results), std::move(Operands),
std::vector<Record *>());
- continue; // no pattern.
+ continue; // no pattern.
}
CodeGenInstruction &CGI = Target.getInstruction(Instr);
@@ -4011,22 +3995,22 @@ void CodeGenDAGPatterns::ParseInstructions() {
typedef std::pair<TreePatternNode *, unsigned> NameRecord;
-static void FindNames(TreePatternNode *P,
+static void FindNames(TreePatternNode &P,
std::map<std::string, NameRecord> &Names,
TreePattern *PatternTop) {
- if (!P->getName().empty()) {
- NameRecord &Rec = Names[P->getName()];
+ if (!P.getName().empty()) {
+ NameRecord &Rec = Names[P.getName()];
// If this is the first instance of the name, remember the node.
if (Rec.second++ == 0)
- Rec.first = P;
- else if (Rec.first->getExtTypes() != P->getExtTypes())
- PatternTop->error("repetition of value: $" + P->getName() +
+ Rec.first = &P;
+ else if (Rec.first->getExtTypes() != P.getExtTypes())
+ PatternTop->error("repetition of value: $" + P.getName() +
" where different uses have different types!");
}
- if (!P->isLeaf()) {
- for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i)
- FindNames(P->getChild(i), Names, PatternTop);
+ if (!P.isLeaf()) {
+ for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i)
+ FindNames(P.getChild(i), Names, PatternTop);
}
}
@@ -4034,21 +4018,20 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern,
PatternToMatch &&PTM) {
// Do some sanity checking on the pattern we're about to match.
std::string Reason;
- if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) {
+ if (!PTM.getSrcPattern().canPatternMatch(Reason, *this)) {
PrintWarning(Pattern->getRecord()->getLoc(),
- Twine("Pattern can never match: ") + Reason);
+ Twine("Pattern can never match: ") + Reason);
return;
}
// If the source pattern's root is a complex pattern, that complex pattern
// must specify the nodes it can potentially match.
if (const ComplexPattern *CP =
- PTM.getSrcPattern()->getComplexPatternInfo(*this))
+ PTM.getSrcPattern().getComplexPatternInfo(*this))
if (CP->getRootNodes().empty())
Pattern->error("ComplexPattern at root must specify list of opcodes it"
" could match");
-
// Find all of the named values in the input and output, ensure they have the
// same type.
std::map<std::string, NameRecord> SrcNames, DstNames;
@@ -4074,8 +4057,8 @@ void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern,
}
void CodeGenDAGPatterns::InferInstructionFlags() {
- ArrayRef<const CodeGenInstruction*> Instructions =
- Target.getInstructionsByEnumValue();
+ ArrayRef<const CodeGenInstruction *> Instructions =
+ Target.getInstructionsByEnumValue();
unsigned Errors = 0;
@@ -4085,7 +4068,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
for (const PatternToMatch &PTM : ptms()) {
// We can only infer from single-instruction patterns, otherwise we won't
// know which instruction should get the flags.
- SmallVector<Record*, 8> PatInstrs;
+ SmallVector<Record *, 8> PatInstrs;
getInstructionsInTree(PTM.getDstPattern(), PatInstrs);
if (PatInstrs.size() != 1)
continue;
@@ -4109,7 +4092,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
if (Target.guessInstructionProperties()) {
for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
CodeGenInstruction *InstInfo =
- const_cast<CodeGenInstruction *>(Instructions[i]);
+ const_cast<CodeGenInstruction *>(Instructions[i]);
if (InstInfo->InferredFrom)
continue;
// The mayLoad and mayStore flags default to false.
@@ -4123,7 +4106,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
// Complain about any flags that are still undefined.
for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
CodeGenInstruction *InstInfo =
- const_cast<CodeGenInstruction *>(Instructions[i]);
+ const_cast<CodeGenInstruction *>(Instructions[i]);
if (InstInfo->InferredFrom)
continue;
if (InstInfo->hasSideEffects_Unset)
@@ -4138,12 +4121,11 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
}
}
-
/// Verify instruction flags against pattern node properties.
void CodeGenDAGPatterns::VerifyInstructionFlags() {
unsigned Errors = 0;
for (const PatternToMatch &PTM : ptms()) {
- SmallVector<Record*, 8> Instrs;
+ SmallVector<Record *, 8> Instrs;
getInstructionsInTree(PTM.getDstPattern(), Instrs);
if (Instrs.empty())
continue;
@@ -4186,16 +4168,16 @@ void CodeGenDAGPatterns::VerifyInstructionFlags() {
++Errors;
for (const std::string &Msg : Msgs)
- PrintError(PTM.getSrcRecord()->getLoc(), Twine(Msg) + " on the " +
- (Instrs.size() == 1 ?
- "instruction" : "output instructions"));
+ PrintError(
+ PTM.getSrcRecord()->getLoc(),
+ Twine(Msg) + " on the " +
+ (Instrs.size() == 1 ? "instruction" : "output instructions"));
// Provide the location of the relevant instruction definitions.
for (const Record *Instr : Instrs) {
if (Instr != PTM.getSrcRecord())
PrintError(Instr->getLoc(), "defined here");
const CodeGenInstruction &InstInfo = Target.getInstruction(Instr);
- if (InstInfo.InferredFrom &&
- InstInfo.InferredFrom != InstInfo.TheDef &&
+ if (InstInfo.InferredFrom && InstInfo.InferredFrom != InstInfo.TheDef &&
InstInfo.InferredFrom != PTM.getSrcRecord())
PrintError(InstInfo.InferredFrom->getLoc(), "inferred from pattern");
}
@@ -4207,27 +4189,27 @@ void CodeGenDAGPatterns::VerifyInstructionFlags() {
/// Given a pattern result with an unresolved type, see if we can find one
/// instruction with an unresolved result type. Force this result type to an
/// arbitrary element if it's possible types to converge results.
-static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) {
- if (N->isLeaf())
+static bool ForceArbitraryInstResultType(TreePatternNode &N, TreePattern &TP) {
+ if (N.isLeaf())
return false;
// Analyze children.
- for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
- if (ForceArbitraryInstResultType(N->getChild(i), TP))
+ for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i)
+ if (ForceArbitraryInstResultType(N.getChild(i), TP))
return true;
- if (!N->getOperator()->isSubClassOf("Instruction"))
+ if (!N.getOperator()->isSubClassOf("Instruction"))
return false;
// If this type is already concrete or completely unknown we can't do
// anything.
TypeInfer &TI = TP.getInfer();
- for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i) {
- if (N->getExtType(i).empty() || TI.isConcrete(N->getExtType(i), false))
+ for (unsigned i = 0, e = N.getNumTypes(); i != e; ++i) {
+ if (N.getExtType(i).empty() || TI.isConcrete(N.getExtType(i), false))
continue;
// Otherwise, force its type to an arbitrary choice.
- if (TI.forceArbitrary(N->getExtType(i)))
+ if (TI.forceArbitrary(N.getExtType(i)))
return true;
}
@@ -4237,11 +4219,11 @@ static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) {
// Promote xform function to be an explicit node wherever set.
static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) {
if (Record *Xform = N->getTransformFn()) {
- N->setTransformFn(nullptr);
- std::vector<TreePatternNodePtr> Children;
- Children.push_back(PromoteXForms(N));
- return makeIntrusiveRefCnt<TreePatternNode>(Xform, std::move(Children),
- N->getNumTypes());
+ N->setTransformFn(nullptr);
+ std::vector<TreePatternNodePtr> Children;
+ Children.push_back(PromoteXForms(N));
+ return makeIntrusiveRefCnt<TreePatternNode>(Xform, std::move(Children),
+ N->getNumTypes());
}
if (!N->isLeaf())
@@ -4252,9 +4234,9 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) {
return N;
}
-void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef,
- TreePattern &Pattern, TreePattern &Result,
- const std::vector<Record *> &InstImpResults) {
+void CodeGenDAGPatterns::ParseOnePattern(
+ Record *TheDef, TreePattern &Pattern, TreePattern &Result,
+ const std::vector<Record *> &InstImpResults) {
// Inline pattern fragments and expand multiple alternatives.
Pattern.InlinePatternFragments();
@@ -4274,8 +4256,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef,
// Infer as many types as possible. If we cannot infer all of them, we
// can never do anything with this pattern: report it to the user.
- InferredAllResultTypes =
- Result.InferAllTypes(&Pattern.getNamedNodesMap());
+ InferredAllResultTypes = Result.InferAllTypes(&Pattern.getNamedNodesMap());
IterateInference = false;
@@ -4286,11 +4267,11 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef,
for (const auto &T : Pattern.getTrees())
for (unsigned i = 0, e = std::min(Result.getOnlyTree()->getNumTypes(),
T->getNumTypes());
- i != e; ++i) {
- IterateInference |= T->UpdateNodeType(
- i, Result.getOnlyTree()->getExtType(i), Result);
- IterateInference |= Result.getOnlyTree()->UpdateNodeType(
- i, T->getExtType(i), Result);
+ i != e; ++i) {
+ IterateInference |=
+ T->UpdateNodeType(i, Result.getOnlyTree()->getExtType(i), Result);
+ IterateInference |=
+ Result.getOnlyTree()->UpdateNodeType(i, T->getExtType(i), Result);
}
// If our iteration has converged and the input pattern's types are fully
@@ -4302,10 +4283,9 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef,
//
// In any case, to handle this, we just go through and disambiguate some
// arbitrary types to the result pattern's nodes.
- if (!IterateInference && InferredAllPatternTypes &&
- !InferredAllResultTypes)
+ if (!IterateInference && InferredAllPatternTypes && !InferredAllResultTypes)
IterateInference =
- ForceArbitraryInstResultType(Result.getTree(0).get(), Result);
+ ForceArbitraryInstResultType(*Result.getTree(0), Result);
} while (IterateInference);
// Verify that we inferred enough types that we can do something with the
@@ -4357,7 +4337,7 @@ void CodeGenDAGPatterns::ParseOnePattern(Record *TheDef,
}
void CodeGenDAGPatterns::ParsePatterns() {
- std::vector<Record*> Patterns = Records.getAllDerivedDefinitions("Pattern");
+ std::vector<Record *> Patterns = Records.getAllDerivedDefinitions("Pattern");
for (Record *CurPattern : Patterns) {
DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch");
@@ -4369,7 +4349,8 @@ void CodeGenDAGPatterns::ParsePatterns() {
TreePattern Pattern(CurPattern, Tree, true, *this);
ListInit *LI = CurPattern->getValueAsListInit("ResultInstrs");
- if (LI->empty()) continue; // no pattern.
+ if (LI->empty())
+ continue; // no pattern.
// Parse the instruction.
TreePattern Result(CurPattern, LI, false, *this);
@@ -4382,7 +4363,7 @@ void CodeGenDAGPatterns::ParsePatterns() {
std::map<std::string, TreePatternNodePtr> InstInputs;
MapVector<std::string, TreePatternNodePtr, std::map<std::string, unsigned>>
InstResults;
- std::vector<Record*> InstImpResults;
+ std::vector<Record *> InstImpResults;
for (unsigned j = 0, ee = Pattern.getNumTrees(); j != ee; ++j)
FindPatternInputsAndOutputs(Pattern, Pattern.getTree(j), InstInputs,
InstResults, InstImpResults);
@@ -4391,13 +4372,13 @@ void CodeGenDAGPatterns::ParsePatterns() {
}
}
-static void collectModes(std::set<unsigned> &Modes, const TreePatternNode *N) {
- for (const TypeSetByHwMode &VTS : N->getExtTypes())
+static void collectModes(std::set<unsigned> &Modes, const TreePatternNode &N) {
+ for (const TypeSetByHwMode &VTS : N.getExtTypes())
for (const auto &I : VTS)
Modes.insert(I.first);
- for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
- collectModes(Modes, N->getChild(i));
+ for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i)
+ collectModes(Modes, N.getChild(i));
}
void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
@@ -4410,8 +4391,8 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
auto AppendPattern = [this](PatternToMatch &P, unsigned Mode,
StringRef Check) {
- TreePatternNodePtr NewSrc = P.getSrcPattern()->clone();
- TreePatternNodePtr NewDst = P.getDstPattern()->clone();
+ TreePatternNodePtr NewSrc = P.getSrcPattern().clone();
+ TreePatternNodePtr NewDst = P.getDstPattern().clone();
if (!NewSrc->setDefaultMode(Mode) || !NewDst->setDefaultMode(Mode)) {
return;
}
@@ -4424,10 +4405,10 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
for (PatternToMatch &P : Copy) {
const TreePatternNode *SrcP = nullptr, *DstP = nullptr;
- if (P.getSrcPattern()->hasProperTypeByHwMode())
- SrcP = P.getSrcPattern();
- if (P.getDstPattern()->hasProperTypeByHwMode())
- DstP = P.getDstPattern();
+ if (P.getSrcPattern().hasProperTypeByHwMode())
+ SrcP = &P.getSrcPattern();
+ if (P.getDstPattern().hasProperTypeByHwMode())
+ DstP = &P.getDstPattern();
if (!SrcP && !DstP) {
PatternsToMatch.push_back(P);
continue;
@@ -4435,9 +4416,9 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
std::set<unsigned> Modes;
if (SrcP)
- collectModes(Modes, SrcP);
+ collectModes(Modes, *SrcP);
if (DstP)
- collectModes(Modes, DstP);
+ collectModes(Modes, *DstP);
// The predicate for the default mode needs to be constructed for each
// pattern separately.
@@ -4477,18 +4458,18 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
/// Dependent variable map for CodeGenDAGPattern variant generation
typedef StringMap<int> DepVarMap;
-static void FindDepVarsOf(TreePatternNode *N, DepVarMap &DepMap) {
- if (N->isLeaf()) {
- if (N->hasName() && isa<DefInit>(N->getLeafValue()))
- DepMap[N->getName()]++;
+static void FindDepVarsOf(TreePatternNode &N, DepVarMap &DepMap) {
+ if (N.isLeaf()) {
+ if (N.hasName() && isa<DefInit>(N.getLeafValue()))
+ DepMap[N.getName()]++;
} else {
- for (size_t i = 0, e = N->getNumChildren(); i != e; ++i)
- FindDepVarsOf(N->getChild(i), DepMap);
+ for (size_t i = 0, e = N.getNumChildren(); i != e; ++i)
+ FindDepVarsOf(N.getChild(i), DepMap);
}
}
/// Find dependent variables within child patterns
-static void FindDepVars(TreePatternNode *N, MultipleUseVarSet &DepVars) {
+static void FindDepVars(TreePatternNode &N, MultipleUseVarSet &DepVars) {
DepVarMap depcounts;
FindDepVarsOf(N, depcounts);
for (const auto &Pair : depcounts) {
@@ -4512,7 +4493,6 @@ static void DumpDepVars(MultipleUseVarSet &DepVars) {
}
#endif
-
/// CombineChildVariants - Given a bunch of permutations of each child of the
/// 'operator' node, put them together in all possible ways.
static void CombineChildVariants(
@@ -4563,7 +4543,7 @@ static void CombineChildVariants(
// which are the same pattern. Ignore the dups.
if (R->canPatternMatch(ErrString, CDP) &&
none_of(OutVariants, [&](TreePatternNodePtr Variant) {
- return R->isIsomorphicTo(Variant.get(), DepVars);
+ return R->isIsomorphicTo(*Variant, DepVars);
}))
OutVariants.push_back(R);
@@ -4598,7 +4578,8 @@ static void CombineChildVariants(TreePatternNodePtr Orig,
static void
GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N,
std::vector<TreePatternNodePtr> &Children) {
- assert(N->getNumChildren()==2 &&"Associative but doesn't have 2 children!");
+ assert(N->getNumChildren() == 2 &&
+ "Associative but doesn't have 2 children!");
Record *Operator = N->getOperator();
// Only permit raw nodes.
@@ -4608,12 +4589,12 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N,
return;
}
- if (N->getChild(0)->isLeaf() || N->getChild(0)->getOperator() != Operator)
+ if (N->getChild(0).isLeaf() || N->getChild(0).getOperator() != Operator)
Children.push_back(N->getChildShared(0));
else
GatherChildrenOfAssociativeOpcode(N->getChildShared(0), Children);
- if (N->getChild(1)->isLeaf() || N->getChild(1)->getOperator() != Operator)
+ if (N->getChild(1).isLeaf() || N->getChild(1).getOperator() != Operator)
Children.push_back(N->getChildShared(1));
else
GatherChildrenOfAssociativeOpcode(N->getChildShared(1), Children);
@@ -4707,9 +4688,9 @@ static void GenerateVariantsOf(TreePatternNodePtr N,
unsigned i = 0 + Skip;
unsigned e = 2 + Skip;
for (; i != e; ++i) {
- TreePatternNode *Child = N->getChild(i);
- if (Child->isLeaf())
- if (DefInit *DI = dyn_cast<DefInit>(Child->getLeafValue())) {
+ TreePatternNode &Child = N->getChild(i);
+ if (Child.isLeaf())
+ if (DefInit *DI = dyn_cast<DefInit>(Child.getLeafValue())) {
Record *RR = DI->getDef();
if (RR->isSubClassOf("Register"))
NoRegisters = false;
@@ -4725,7 +4706,6 @@ static void GenerateVariantsOf(TreePatternNodePtr N,
}
}
-
// GenerateVariants - Generate variants. For example, commutative patterns can
// match multiple ways. Add them to PatternsToMatch as well.
void CodeGenDAGPatterns::GenerateVariants() {
@@ -4758,7 +4738,7 @@ void CodeGenDAGPatterns::GenerateVariants() {
continue;
LLVM_DEBUG(errs() << "FOUND VARIANTS OF: ";
- PatternsToMatch[i].getSrcPattern()->dump(); errs() << "\n");
+ PatternsToMatch[i].getSrcPattern().dump(); errs() << "\n");
for (unsigned v = 0, e = Variants.size(); v != e; ++v) {
TreePatternNodePtr Variant = Variants[v];
@@ -4782,7 +4762,8 @@ void CodeGenDAGPatterns::GenerateVariants() {
}
}
// If we already have it, ignore the variant.
- if (AlreadyExists) continue;
+ if (AlreadyExists)
+ continue;
// Otherwise, add it to the list of patterns we have.
PatternsToMatch.emplace_back(
diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h
index 2611fe06f55c..823c40c922cb 100644
--- a/llvm/utils/TableGen/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h
@@ -56,17 +56,15 @@ struct MachineValueTypeSet {
static_assert(std::is_same<std::underlying_type_t<MVT::SimpleValueType>,
uint8_t>::value,
"Change uint8_t here to the SimpleValueType's type");
- static unsigned constexpr Capacity = std::numeric_limits<uint8_t>::max()+1;
+ static unsigned constexpr Capacity = std::numeric_limits<uint8_t>::max() + 1;
using WordType = uint64_t;
- static unsigned constexpr WordWidth = CHAR_BIT*sizeof(WordType);
- static unsigned constexpr NumWords = Capacity/WordWidth;
- static_assert(NumWords*WordWidth == Capacity,
+ static unsigned constexpr WordWidth = CHAR_BIT * sizeof(WordType);
+ static unsigned constexpr NumWords = Capacity / WordWidth;
+ static_assert(NumWords * WordWidth == Capacity,
"Capacity should be a multiple of WordWidth");
LLVM_ATTRIBUTE_ALWAYS_INLINE
- MachineValueTypeSet() {
- clear();
- }
+ MachineValueTypeSet() { clear(); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
unsigned size() const {
@@ -76,9 +74,7 @@ struct MachineValueTypeSet {
return Count;
}
LLVM_ATTRIBUTE_ALWAYS_INLINE
- void clear() {
- std::memset(Words.data(), 0, NumWords*sizeof(WordType));
- }
+ void clear() { std::memset(Words.data(), 0, NumWords * sizeof(WordType)); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
bool empty() const {
for (WordType W : Words)
@@ -90,7 +86,7 @@ struct MachineValueTypeSet {
unsigned count(MVT T) const {
return (Words[T.SimpleTy / WordWidth] >> (T.SimpleTy % WordWidth)) & 1;
}
- std::pair<MachineValueTypeSet&,bool> insert(MVT T) {
+ std::pair<MachineValueTypeSet &, bool> insert(MVT T) {
bool V = count(T.SimpleTy);
Words[T.SimpleTy / WordWidth] |= WordType(1) << (T.SimpleTy % WordWidth);
return {*this, V};
@@ -113,8 +109,8 @@ struct MachineValueTypeSet {
using iterator_category = std::forward_iterator_tag;
using value_type = MVT;
using difference_type = ptrdiff_t;
- using pointer = const MVT*;
- using reference = const MVT&;
+ using pointer = const MVT *;
+ using reference = const MVT &;
LLVM_ATTRIBUTE_ALWAYS_INLINE
MVT operator*() const {
@@ -128,7 +124,7 @@ struct MachineValueTypeSet {
LLVM_ATTRIBUTE_ALWAYS_INLINE
const_iterator &operator++() {
assert(Pos != Capacity);
- Pos = find_from_pos(Pos+1);
+ Pos = find_from_pos(Pos + 1);
return *this;
}
@@ -137,9 +133,7 @@ struct MachineValueTypeSet {
return Set == It.Set && Pos == It.Pos;
}
LLVM_ATTRIBUTE_ALWAYS_INLINE
- bool operator!=(const const_iterator &It) const {
- return !operator==(It);
- }
+ bool operator!=(const const_iterator &It) const { return !operator==(It); }
private:
unsigned find_from_pos(unsigned P) const {
@@ -151,7 +145,7 @@ struct MachineValueTypeSet {
// the trailing bits need to be masked off to use findFirstSet.
if (SkipBits != 0) {
WordType W = Set->Words[SkipWords];
- W &= maskLeadingOnes<WordType>(WordWidth-SkipBits);
+ W &= maskLeadingOnes<WordType>(WordWidth - SkipBits);
if (W != 0)
return Count + llvm::countr_zero(W);
Count += WordWidth;
@@ -174,20 +168,18 @@ struct MachineValueTypeSet {
LLVM_ATTRIBUTE_ALWAYS_INLINE
const_iterator begin() const { return const_iterator(this, false); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
- const_iterator end() const { return const_iterator(this, true); }
+ const_iterator end() const { return const_iterator(this, true); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
bool operator==(const MachineValueTypeSet &S) const {
return Words == S.Words;
}
LLVM_ATTRIBUTE_ALWAYS_INLINE
- bool operator!=(const MachineValueTypeSet &S) const {
- return !operator==(S);
- }
+ bool operator!=(const MachineValueTypeSet &S) const { return !operator==(S); }
private:
friend struct const_iterator;
- std::array<WordType,NumWords> Words;
+ std::array<WordType, NumWords> Words;
};
raw_ostream &operator<<(raw_ostream &OS, const MachineValueTypeSet &T);
@@ -200,14 +192,12 @@ struct TypeSetByHwMode : public InfoByHwMode<MachineValueTypeSet> {
TypeSetByHwMode(const TypeSetByHwMode &VTS) = default;
TypeSetByHwMode &operator=(const TypeSetByHwMode &) = default;
TypeSetByHwMode(MVT::SimpleValueType VT)
- : TypeSetByHwMode(ValueTypeByHwMode(VT)) {}
+ : TypeSetByHwMode(ValueTypeByHwMode(VT)) {}
TypeSetByHwMode(ValueTypeByHwMode VT)
- : TypeSetByHwMode(ArrayRef<ValueTypeByHwMode>(&VT, 1)) {}
+ : TypeSetByHwMode(ArrayRef<ValueTypeByHwMode>(&VT, 1)) {}
TypeSetByHwMode(ArrayRef<ValueTypeByHwMode> VTList);
- SetType &getOrCreate(unsigned Mode) {
- return Map[Mode];
- }
+ SetType &getOrCreate(unsigned Mode) { return Map[Mode]; }
bool isValueTypeByHwMode(bool AllowEmpty) const;
ValueTypeByHwMode getValueTypeByHwMode() const;
@@ -225,9 +215,7 @@ struct TypeSetByHwMode : public InfoByHwMode<MachineValueTypeSet> {
bool isPossible() const;
- bool isPointer() const {
- return getValueTypeByHwMode().isPointer();
- }
+ bool isPointer() const { return getValueTypeByHwMode().isPointer(); }
unsigned getPtrAddrSpace() const {
assert(isPointer());
@@ -313,8 +301,7 @@ struct TypeInfer {
/// Ensure that for each type T in \p Sub, T is a vector type, and there
/// exists a type U in \p Vec such that U is a vector type with the same
/// element type as T and at least as many elements as T.
- bool EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec,
- TypeSetByHwMode &Sub);
+ bool EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec, TypeSetByHwMode &Sub);
/// 1. Ensure that \p V has a scalar type iff \p W has a scalar type.
/// 2. Ensure that for each vector type T in \p V, there exists a vector
/// type U in \p W, such that T and U have the same number of elements.
@@ -346,15 +333,13 @@ struct TypeInfer {
SuppressValidation(TypeInfer &TI) : Infer(TI), SavedValidate(TI.Validate) {
Infer.Validate = false;
}
- ~SuppressValidation() {
- Infer.Validate = SavedValidate;
- }
+ ~SuppressValidation() { Infer.Validate = SavedValidate; }
TypeInfer &Infer;
bool SavedValidate;
};
TreePattern &TP;
- bool Validate = true; // Indicate whether to validate types.
+ bool Validate = true; // Indicate whether to validate types.
private:
const TypeSetByHwMode &getLegalTypes() const;
@@ -372,14 +357,24 @@ typedef StringSet<> MultipleUseVarSet;
struct SDTypeConstraint {
SDTypeConstraint(Record *R, const CodeGenHwModes &CGH);
- unsigned OperandNo; // The operand # this constraint applies to.
+ unsigned OperandNo; // The operand # this constraint applies to.
enum {
- SDTCisVT, SDTCisPtrTy, SDTCisInt, SDTCisFP, SDTCisVec, SDTCisSameAs,
- SDTCisVTSmallerThanOp, SDTCisOpSmallerThanOp, SDTCisEltOfVec,
- SDTCisSubVecOfVec, SDTCVecEltisVT, SDTCisSameNumEltsAs, SDTCisSameSizeAs
+ SDTCisVT,
+ SDTCisPtrTy,
+ SDTCisInt,
+ SDTCisFP,
+ SDTCisVec,
+ SDTCisSameAs,
+ SDTCisVTSmallerThanOp,
+ SDTCisOpSmallerThanOp,
+ SDTCisEltOfVec,
+ SDTCisSubVecOfVec,
+ SDTCVecEltisVT,
+ SDTCisSameNumEltsAs,
+ SDTCisSameSizeAs
} ConstraintType;
- union { // The discriminated union.
+ union { // The discriminated union.
struct {
unsigned OtherOperandNum;
} SDTCisSameAs_Info;
@@ -411,7 +406,7 @@ struct SDTypeConstraint {
/// constraint to the nodes operands. This returns true if it makes a
/// change, false otherwise. If a type contradiction is found, an error
/// is flagged.
- bool ApplyTypeConstraint(TreePatternNode *N, const SDNodeInfo &NodeInfo,
+ bool ApplyTypeConstraint(TreePatternNode &N, const SDNodeInfo &NodeInfo,
TreePattern &TP) const;
};
@@ -422,6 +417,7 @@ struct SDTypeConstraint {
class ScopedName {
unsigned Scope;
std::string Identifier;
+
public:
ScopedName(unsigned Scope, StringRef Identifier)
: Scope(Scope), Identifier(std::string(Identifier)) {
@@ -447,6 +443,7 @@ class SDNodeInfo {
unsigned NumResults;
int NumOperands;
std::vector<SDTypeConstraint> TypeConstraints;
+
public:
// Parse the specified record.
SDNodeInfo(Record *R, const CodeGenHwModes &CGH);
@@ -477,7 +474,7 @@ public:
/// constraints for this node to the operands of the node. This returns
/// true if it makes a change, false otherwise. If a type contradiction is
/// found, an error is flagged.
- bool ApplyTypeConstraints(TreePatternNode *N, TreePattern &TP) const;
+ bool ApplyTypeConstraints(TreePatternNode &N, TreePattern &TP) const;
};
/// TreePredicateFn - This is an abstraction that represents the predicates on
@@ -487,11 +484,11 @@ class TreePredicateFn {
/// PatFragRec - This is the TreePattern for the PatFrag that we
/// originally came from.
TreePattern *PatFragRec;
+
public:
/// TreePredicateFn constructor. Here 'N' is a subclass of PatFrag.
TreePredicateFn(TreePattern *N);
-
TreePattern *getOrigPatFragRecord() const { return PatFragRec; }
/// isAlwaysTrue - Return true if this is a noop predicate.
@@ -582,7 +579,8 @@ public:
bool isAtomicOrderingWeakerThanRelease() const;
/// If non-null, indicates that this predicate is a predefined memory VT
- /// predicate for a load/store and returns the ValueType record for the memory VT.
+ /// predicate for a load/store and returns the ValueType record for the memory
+ /// VT.
Record *getMemoryVT() const;
/// If non-null, indicates that this predicate is a predefined memory VT
/// predicate (checking only the scalar type) for load/store and returns the
@@ -615,14 +613,12 @@ struct TreePredicateCall {
unsigned Scope;
TreePredicateCall(const TreePredicateFn &Fn, unsigned Scope)
- : Fn(Fn), Scope(Scope) {}
+ : Fn(Fn), Scope(Scope) {}
bool operator==(const TreePredicateCall &o) const {
return Fn == o.Fn && Scope == o.Scope;
}
- bool operator!=(const TreePredicateCall &o) const {
- return !(*this == o);
- }
+ bool operator!=(const TreePredicateCall &o) const { return !(*this == o); }
};
class TreePatternNode : public RefCountedBase<TreePatternNode> {
@@ -681,7 +677,7 @@ public:
const std::vector<ScopedName> &getNamesAsPredicateArg() const {
return NamesAsPredicateArg;
}
- void setNamesAsPredicateArg(const std::vector<ScopedName>& Names) {
+ void setNamesAsPredicateArg(const std::vector<ScopedName> &Names) {
NamesAsPredicateArg = Names;
}
void addNameAsPredicateArg(const ScopedName &N) {
@@ -726,16 +722,14 @@ public:
}
unsigned getNumChildren() const { return Children.size(); }
- const TreePatternNode *getChild(unsigned N) const {
- return Children[N].get();
+ const TreePatternNode &getChild(unsigned N) const {
+ return *Children[N].get();
}
- TreePatternNode *getChild(unsigned N) { return Children[N].get(); }
+ TreePatternNode &getChild(unsigned N) { return *Children[N].get(); }
const TreePatternNodePtr &getChildShared(unsigned N) const {
return Children[N];
}
- TreePatternNodePtr &getChildSharedPtr(unsigned N) {
- return Children[N];
- }
+ TreePatternNodePtr &getChildSharedPtr(unsigned N) { return Children[N]; }
void setChild(unsigned i, TreePatternNodePtr N) { Children[i] = N; }
/// hasChild - Return true if N is any of our children.
@@ -762,7 +756,8 @@ public:
}
void addPredicateCall(const TreePredicateCall &Call) {
assert(!Call.Fn.isAlwaysTrue() && "Empty predicate string!");
- assert(!is_contained(PredicateCalls, Call) && "predicate applied recursively");
+ assert(!is_contained(PredicateCalls, Call) &&
+ "predicate applied recursively");
PredicateCalls.push_back(Call);
}
void addPredicateCall(const TreePredicateFn &Fn, unsigned Scope) {
@@ -805,8 +800,7 @@ public:
void print(raw_ostream &OS) const;
void dump() const;
-public: // Higher level manipulation routines.
-
+public: // Higher level manipulation routines.
/// clone - Return a new copy of this tree.
///
TreePatternNodePtr clone() const;
@@ -818,7 +812,7 @@ public: // Higher level manipulation routines.
/// the specified node. For this comparison, all of the state of the node
/// is considered, except for the assigned name. Nodes with differing names
/// that are otherwise identical are considered isomorphic.
- bool isIsomorphicTo(const TreePatternNode *N,
+ bool isIsomorphicTo(const TreePatternNode &N,
const MultipleUseVarSet &DepVars) const;
/// SubstituteFormalArguments - Replace the formal arguments in this tree
@@ -845,8 +839,7 @@ public: // Higher level manipulation routines.
TreePattern &TP);
bool UpdateNodeType(unsigned ResNo, MVT::SimpleValueType InTy,
TreePattern &TP);
- bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy,
- TreePattern &TP);
+ bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, TreePattern &TP);
// Update node type with types inferred from an instruction operand or result
// def from the ins/outs lists.
@@ -910,7 +903,6 @@ class TreePattern {
TypeInfer Infer;
public:
-
/// TreePattern constructor - Parse the specified DagInits into the
/// current record.
TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
@@ -971,12 +963,8 @@ public:
/// error - If this is the first error in the current resolution step,
/// print it and set the error flag. Otherwise, continue silently.
void error(const Twine &Msg);
- bool hasError() const {
- return HasError;
- }
- void resetError() {
- HasError = false;
- }
+ bool hasError() const { return HasError; }
+ void resetError() { HasError = false; }
TypeInfer &getInfer() { return Infer; }
@@ -986,10 +974,9 @@ public:
private:
TreePatternNodePtr ParseTreePattern(Init *DI, StringRef OpName);
void ComputeNamedNodes();
- void ComputeNamedNodes(TreePatternNode *N);
+ void ComputeNamedNodes(TreePatternNode &N);
};
-
inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
const TypeSetByHwMode &InTy,
TreePattern &TP) {
@@ -1014,7 +1001,6 @@ inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
return TP.getInfer().MergeInTypeInfo(Types[ResNo], VTS);
}
-
/// DAGDefaultOperand - One of these is created for each OperandWithDefaultOps
/// that has a set ExecuteAlways / DefaultOps field.
struct DAGDefaultOperand {
@@ -1022,9 +1008,9 @@ struct DAGDefaultOperand {
};
class DAGInstruction {
- std::vector<Record*> Results;
- std::vector<Record*> Operands;
- std::vector<Record*> ImpResults;
+ std::vector<Record *> Results;
+ std::vector<Record *> Operands;
+ std::vector<Record *> ImpResults;
TreePatternNodePtr SrcPattern;
TreePatternNodePtr ResultPattern;
@@ -1041,7 +1027,7 @@ public:
unsigned getNumResults() const { return Results.size(); }
unsigned getNumOperands() const { return Operands.size(); }
unsigned getNumImpResults() const { return ImpResults.size(); }
- const std::vector<Record*>& getImpResults() const { return ImpResults; }
+ const std::vector<Record *> &getImpResults() const { return ImpResults; }
Record *getResult(unsigned RN) const {
assert(RN < Results.size());
@@ -1065,34 +1051,33 @@ public:
/// PatternToMatch - Used by CodeGenDAGPatterns to keep tab of patterns
/// processed to produce isel.
class PatternToMatch {
- Record *SrcRecord; // Originating Record for the pattern.
- ListInit *Predicates; // Top level predicate conditions to match.
- TreePatternNodePtr SrcPattern; // Source pattern to match.
- TreePatternNodePtr DstPattern; // Resulting pattern.
- std::vector<Record*> Dstregs; // Physical register defs being matched.
- std::string HwModeFeatures;
- int AddedComplexity; // Add to matching pattern complexity.
- unsigned ID; // Unique ID for the record.
+ Record *SrcRecord; // Originating Record for the pattern.
+ ListInit *Predicates; // Top level predicate conditions to match.
+ TreePatternNodePtr SrcPattern; // Source pattern to match.
+ TreePatternNodePtr DstPattern; // Resulting pattern.
+ std::vector<Record *> Dstregs; // Physical register defs being matched.
+ std::string HwModeFeatures;
+ int AddedComplexity; // Add to matching pattern complexity.
+ unsigned ID; // Unique ID for the record.
public:
PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src,
TreePatternNodePtr dst, std::vector<Record *> dstregs,
- int complexity, unsigned uid,
- const Twine &hwmodefeatures = "")
+ int complexity, unsigned uid, const Twine &hwmodefeatures = "")
: SrcRecord(srcrecord), Predicates(preds), SrcPattern(src),
DstPattern(dst), Dstregs(std::move(dstregs)),
HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity),
ID(uid) {}
- Record *getSrcRecord() const { return SrcRecord; }
- ListInit *getPredicates() const { return Predicates; }
- TreePatternNode *getSrcPattern() const { return SrcPattern.get(); }
+ Record *getSrcRecord() const { return SrcRecord; }
+ ListInit *getPredicates() const { return Predicates; }
+ TreePatternNode &getSrcPattern() const { return *SrcPattern; }
TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; }
- TreePatternNode *getDstPattern() const { return DstPattern.get(); }
+ TreePatternNode &getDstPattern() const { return *DstPattern; }
TreePatternNodePtr getDstPatternShared() const { return DstPattern; }
- const std::vector<Record*> &getDstRegs() const { return Dstregs; }
- StringRef getHwModeFeatures() const { return HwModeFeatures; }
- int getAddedComplexity() const { return AddedComplexity; }
+ const std::vector<Record *> &getDstRegs() const { return Dstregs; }
+ StringRef getHwModeFeatures() const { return HwModeFeatures; }
+ int getAddedComplexity() const { return AddedComplexity; }
unsigned getID() const { return ID; }
std::string getPredicateCheck() const;
@@ -1108,14 +1093,14 @@ class CodeGenDAGPatterns {
CodeGenTarget Target;
CodeGenIntrinsicTable Intrinsics;
- std::map<Record*, SDNodeInfo, LessRecordByID> SDNodes;
- std::map<Record*, std::pair<Record*, std::string>, LessRecordByID>
+ std::map<Record *, SDNodeInfo, LessRecordByID> SDNodes;
+ std::map<Record *, std::pair<Record *, std::string>, LessRecordByID>
SDNodeXForms;
- std::map<Record*, ComplexPattern, LessRecordByID> ComplexPatterns;
+ std::map<Record *, ComplexPattern, LessRecordByID> ComplexPatterns;
std::map<Record *, std::unique_ptr<TreePattern>, LessRecordByID>
PatternFragments;
- std::map<Record*, DAGDefaultOperand, LessRecordByID> DefaultOperands;
- std::map<Record*, DAGInstruction, LessRecordByID> Instructions;
+ std::map<Record *, DAGDefaultOperand, LessRecordByID> DefaultOperands;
+ std::map<Record *, DAGInstruction, LessRecordByID> Instructions;
// Specific SDNode definitions:
Record *intrinsic_void_sdnode;
@@ -1128,7 +1113,7 @@ class CodeGenDAGPatterns {
TypeSetByHwMode LegalVTS;
- using PatternRewriterFn = std::function<void (TreePattern *)>;
+ using PatternRewriterFn = std::function<void(TreePattern *)>;
PatternRewriterFn PatternRewriter;
unsigned NumScopes = 0;
@@ -1150,7 +1135,7 @@ public:
}
// Node transformation lookups.
- typedef std::pair<Record*, std::string> NodeXForm;
+ typedef std::pair<Record *, std::string> NodeXForm;
const NodeXForm &getSDNodeTransform(Record *R) const {
auto F = SDNodeXForms.find(R);
assert(F != SDNodeXForms.end() && "Invalid transform!");
@@ -1165,25 +1150,27 @@ public:
const CodeGenIntrinsic &getIntrinsic(Record *R) const {
for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i)
- if (Intrinsics[i].TheDef == R) return Intrinsics[i];
+ if (Intrinsics[i].TheDef == R)
+ return Intrinsics[i];
llvm_unreachable("Unknown intrinsic!");
}
const CodeGenIntrinsic &getIntrinsicInfo(unsigned IID) const {
- if (IID-1 < Intrinsics.size())
- return Intrinsics[IID-1];
+ if (IID - 1 < Intrinsics.size())
+ return Intrinsics[IID - 1];
llvm_unreachable("Bad intrinsic ID!");
}
unsigned getIntrinsicID(Record *R) const {
for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i)
- if (Intrinsics[i].TheDef == R) return i;
+ if (Intrinsics[i].TheDef == R)
+ return i;
llvm_unreachable("Unknown intrinsic!");
}
const DAGDefaultOperand &getDefaultOperand(Record *R) const {
auto F = DefaultOperands.find(R);
- assert(F != DefaultOperands.end() &&"Isn't an analyzed default operand!");
+ assert(F != DefaultOperands.end() && "Isn't an analyzed default operand!");
return F->second;
}
@@ -1213,10 +1200,9 @@ public:
iterator_range<ptm_iterator> ptms() const { return PatternsToMatch; }
/// Parse the Pattern for an instruction, and insert the result in DAGInsts.
- typedef std::map<Record*, DAGInstruction, LessRecordByID> DAGInstMap;
- void parseInstructionPattern(
- CodeGenInstruction &CGI, ListInit *Pattern,
- DAGInstMap &DAGInsts);
+ typedef std::map<Record *, DAGInstruction, LessRecordByID> DAGInstMap;
+ void parseInstructionPattern(CodeGenInstruction &CGI, ListInit *Pattern,
+ DAGInstMap &DAGInsts);
const DAGInstruction &getInstruction(Record *R) const {
auto F = Instructions.find(R);
@@ -1224,9 +1210,7 @@ public:
return F->second;
}
- Record *get_intrinsic_void_sdnode() const {
- return intrinsic_void_sdnode;
- }
+ Record *get_intrinsic_void_sdnode() const { return intrinsic_void_sdnode; }
Record *get_intrinsic_w_chain_sdnode() const {
return intrinsic_w_chain_sdnode;
}
@@ -1238,7 +1222,7 @@ public:
bool operandHasDefault(Record *Op) const {
return Op->isSubClassOf("OperandWithDefaultOps") &&
- !getDefaultOperand(Op).DefaultOps.empty();
+ !getDefaultOperand(Op).DefaultOps.empty();
}
private:
@@ -1254,8 +1238,8 @@ private:
void GenerateVariants();
void VerifyInstructionFlags();
- void ParseOnePattern(Record *TheDef,
- TreePattern &Pattern, TreePattern &Result,
+ void ParseOnePattern(Record *TheDef, TreePattern &Pattern,
+ TreePattern &Result,
const std::vector<Record *> &InstImpResults);
void AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM);
void FindPatternInputsAndOutputs(
@@ -1266,14 +1250,13 @@ private:
std::vector<Record *> &InstImpResults);
};
-
-inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode *N,
+inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode &N,
TreePattern &TP) const {
- bool MadeChange = false;
- for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i)
- MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP);
- return MadeChange;
- }
+ bool MadeChange = false;
+ for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i)
+ MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP);
+ return MadeChange;
+}
} // end namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenHwModes.cpp b/llvm/utils/TableGen/CodeGenHwModes.cpp
index 2171507f4c63..7c08c7516054 100644
--- a/llvm/utils/TableGen/CodeGenHwModes.cpp
+++ b/llvm/utils/TableGen/CodeGenHwModes.cpp
@@ -37,17 +37,17 @@ HwMode::HwMode(Record *R) {
}
LLVM_DUMP_METHOD
-void HwMode::dump() const {
- dbgs() << Name << ": " << Features << '\n';
-}
+void HwMode::dump() const { dbgs() << Name << ": " << Features << '\n'; }
HwModeSelect::HwModeSelect(Record *R, CodeGenHwModes &CGH) {
- std::vector<Record*> Modes = R->getValueAsListOfDefs("Modes");
- std::vector<Record*> Objects = R->getValueAsListOfDefs("Objects");
+ std::vector<Record *> Modes = R->getValueAsListOfDefs("Modes");
+ std::vector<Record *> Objects = R->getValueAsListOfDefs("Objects");
if (Modes.size() != Objects.size()) {
- PrintError(R->getLoc(), "in record " + R->getName() +
- " derived from HwModeSelect: the lists Modes and Objects should "
- "have the same size");
+ PrintError(
+ R->getLoc(),
+ "in record " + R->getName() +
+ " derived from HwModeSelect: the lists Modes and Objects should "
+ "have the same size");
report_fatal_error("error in target description.");
}
for (unsigned i = 0, e = Modes.size(); i != e; ++i) {
diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h
index 09d20ad85c5e..9a5b7a8c2c1c 100644
--- a/llvm/utils/TableGen/CodeGenHwModes.h
+++ b/llvm/utils/TableGen/CodeGenHwModes.h
@@ -22,46 +22,46 @@
// HwModeId -> list of predicates (definition)
namespace llvm {
- class Record;
- class RecordKeeper;
+class Record;
+class RecordKeeper;
- struct CodeGenHwModes;
+struct CodeGenHwModes;
- struct HwMode {
- HwMode(Record *R);
- StringRef Name;
- std::string Features;
- std::string Predicates;
- void dump() const;
- };
+struct HwMode {
+ HwMode(Record *R);
+ StringRef Name;
+ std::string Features;
+ std::string Predicates;
+ void dump() const;
+};
- struct HwModeSelect {
- HwModeSelect(Record *R, CodeGenHwModes &CGH);
- typedef std::pair<unsigned, Record*> PairType;
- std::vector<PairType> Items;
- void dump() const;
- };
+struct HwModeSelect {
+ HwModeSelect(Record *R, CodeGenHwModes &CGH);
+ typedef std::pair<unsigned, Record *> PairType;
+ std::vector<PairType> Items;
+ void dump() const;
+};
- struct CodeGenHwModes {
- enum : unsigned { DefaultMode = 0 };
- static StringRef DefaultModeName;
+struct CodeGenHwModes {
+ enum : unsigned { DefaultMode = 0 };
+ static StringRef DefaultModeName;
- CodeGenHwModes(RecordKeeper &R);
- unsigned getHwModeId(Record *R) const;
- const HwMode &getMode(unsigned Id) const {
- assert(Id != 0 && "Mode id of 0 is reserved for the default mode");
- return Modes[Id-1];
- }
- const HwModeSelect &getHwModeSelect(Record *R) const;
- unsigned getNumModeIds() const { return Modes.size()+1; }
- void dump() const;
+ CodeGenHwModes(RecordKeeper &R);
+ unsigned getHwModeId(Record *R) const;
+ const HwMode &getMode(unsigned Id) const {
+ assert(Id != 0 && "Mode id of 0 is reserved for the default mode");
+ return Modes[Id - 1];
+ }
+ const HwModeSelect &getHwModeSelect(Record *R) const;
+ unsigned getNumModeIds() const { return Modes.size() + 1; }
+ void dump() const;
- private:
- RecordKeeper &Records;
- DenseMap<Record *, unsigned> ModeIds; // HwMode Record -> HwModeId
- std::vector<HwMode> Modes;
- std::map<Record*,HwModeSelect> ModeSelects;
- };
-}
+private:
+ RecordKeeper &Records;
+ DenseMap<Record *, unsigned> ModeIds; // HwMode Record -> HwModeId
+ std::vector<HwMode> Modes;
+ std::map<Record *, HwModeSelect> ModeSelects;
+};
+} // namespace llvm
#endif // LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H
diff --git a/llvm/utils/TableGen/CodeGenInstruction.cpp b/llvm/utils/TableGen/CodeGenInstruction.cpp
index 5cd89417c002..a569194ef23c 100644
--- a/llvm/utils/TableGen/CodeGenInstruction.cpp
+++ b/llvm/utils/TableGen/CodeGenInstruction.cpp
@@ -55,15 +55,15 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
unsigned e = InDI->getNumArgs() + OutDI->getNumArgs();
OperandList.reserve(e);
bool VariadicOuts = false;
- for (unsigned i = 0; i != e; ++i){
+ for (unsigned i = 0; i != e; ++i) {
Init *ArgInit;
StringRef ArgName;
if (i < NumDefs) {
ArgInit = OutDI->getArg(i);
ArgName = OutDI->getArgNameStr(i);
} else {
- ArgInit = InDI->getArg(i-NumDefs);
- ArgName = InDI->getArgNameStr(i-NumDefs);
+ ArgInit = InDI->getArg(i - NumDefs);
+ ArgName = InDI->getArgNameStr(i - NumDefs);
}
DagInit *SubArgDag = dyn_cast<DagInit>(ArgInit);
@@ -192,7 +192,6 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
--NumDefs;
}
-
/// getOperandNamed - Return the index of the operand with the specified
/// non-empty name. If the instruction does not have an operand with the
/// specified name, abort.
@@ -230,7 +229,7 @@ bool CGIOperandList::hasSubOperandAlias(
return false;
}
-std::pair<unsigned,unsigned>
+std::pair<unsigned, unsigned>
CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) {
if (!Op.starts_with("$"))
PrintFatalError(TheDef->getLoc(),
@@ -242,7 +241,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) {
// Check to see if this is $foo.bar.
StringRef::size_type DotIdx = OpName.find_first_of('.');
if (DotIdx != StringRef::npos) {
- SubOpName = OpName.substr(DotIdx+1);
+ SubOpName = OpName.substr(DotIdx + 1);
if (SubOpName.empty())
PrintFatalError(TheDef->getLoc(),
TheDef->getName() +
@@ -266,7 +265,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) {
OpIdx = getOperandNamed(OpName);
- if (SubOpName.empty()) { // If no suboperand name was specified:
+ if (SubOpName.empty()) { // If no suboperand name was specified:
// If one was needed, throw.
if (OperandList[OpIdx].MINumOperands > 1 && !AllowWholeOp &&
SubOpName.empty())
@@ -299,82 +298,80 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) {
return std::make_pair(0U, 0U);
}
-static void ParseConstraint(StringRef CStr, CGIOperandList &Ops,
- Record *Rec) {
+static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
// EARLY_CLOBBER: @early $reg
StringRef::size_type wpos = CStr.find_first_of(" \t");
StringRef::size_type start = CStr.find_first_not_of(" \t");
StringRef Tok = CStr.substr(start, wpos - start);
if (Tok == "@earlyclobber") {
- StringRef Name = CStr.substr(wpos+1);
+ StringRef Name = CStr.substr(wpos + 1);
wpos = Name.find_first_not_of(" \t");
if (wpos == StringRef::npos)
- PrintFatalError(
- Rec->getLoc(), "Illegal format for @earlyclobber constraint in '" +
- Rec->getName() + "': '" + CStr + "'");
+ PrintFatalError(Rec->getLoc(),
+ "Illegal format for @earlyclobber constraint in '" +
+ Rec->getName() + "': '" + CStr + "'");
Name = Name.substr(wpos);
- std::pair<unsigned,unsigned> Op = Ops.ParseOperandName(Name, false);
+ std::pair<unsigned, unsigned> Op = Ops.ParseOperandName(Name, false);
// Build the string for the operand
if (!Ops[Op.first].Constraints[Op.second].isNone())
- PrintFatalError(
- Rec->getLoc(), "Operand '" + Name + "' of '" + Rec->getName() +
- "' cannot have multiple constraints!");
+ PrintFatalError(Rec->getLoc(), "Operand '" + Name + "' of '" +
+ Rec->getName() +
+ "' cannot have multiple constraints!");
Ops[Op.first].Constraints[Op.second] =
- CGIOperandList::ConstraintInfo::getEarlyClobber();
+ CGIOperandList::ConstraintInfo::getEarlyClobber();
return;
}
// Only other constraint is "TIED_TO" for now.
StringRef::size_type pos = CStr.find_first_of('=');
if (pos == StringRef::npos)
- PrintFatalError(
- Rec->getLoc(), "Unrecognized constraint '" + CStr +
- "' in '" + Rec->getName() + "'");
+ PrintFatalError(Rec->getLoc(), "Unrecognized constraint '" + CStr +
+ "' in '" + Rec->getName() + "'");
start = CStr.find_first_not_of(" \t");
// TIED_TO: $src1 = $dst
wpos = CStr.find_first_of(" \t", start);
if (wpos == StringRef::npos || wpos > pos)
- PrintFatalError(
- Rec->getLoc(), "Illegal format for tied-to constraint in '" +
- Rec->getName() + "': '" + CStr + "'");
+ PrintFatalError(Rec->getLoc(),
+ "Illegal format for tied-to constraint in '" +
+ Rec->getName() + "': '" + CStr + "'");
StringRef LHSOpName = CStr.substr(start, wpos - start);
- std::pair<unsigned,unsigned> LHSOp = Ops.ParseOperandName(LHSOpName, false);
+ std::pair<unsigned, unsigned> LHSOp = Ops.ParseOperandName(LHSOpName, false);
wpos = CStr.find_first_not_of(" \t", pos + 1);
if (wpos == StringRef::npos)
- PrintFatalError(
- Rec->getLoc(), "Illegal format for tied-to constraint: '" + CStr + "'");
+ PrintFatalError(Rec->getLoc(),
+ "Illegal format for tied-to constraint: '" + CStr + "'");
StringRef RHSOpName = CStr.substr(wpos);
- std::pair<unsigned,unsigned> RHSOp = Ops.ParseOperandName(RHSOpName, false);
+ std::pair<unsigned, unsigned> RHSOp = Ops.ParseOperandName(RHSOpName, false);
// Sort the operands into order, which should put the output one
// first. But keep the original order, for use in diagnostics.
bool FirstIsDest = (LHSOp < RHSOp);
- std::pair<unsigned,unsigned> DestOp = (FirstIsDest ? LHSOp : RHSOp);
+ std::pair<unsigned, unsigned> DestOp = (FirstIsDest ? LHSOp : RHSOp);
StringRef DestOpName = (FirstIsDest ? LHSOpName : RHSOpName);
- std::pair<unsigned,unsigned> SrcOp = (FirstIsDest ? RHSOp : LHSOp);
+ std::pair<unsigned, unsigned> SrcOp = (FirstIsDest ? RHSOp : LHSOp);
StringRef SrcOpName = (FirstIsDest ? RHSOpName : LHSOpName);
// Ensure one operand is a def and the other is a use.
if (DestOp.first >= Ops.NumDefs)
- PrintFatalError(
- Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" + RHSOpName +
- "' of '" + Rec->getName() + "' cannot be tied!");
+ PrintFatalError(Rec->getLoc(), "Input operands '" + LHSOpName + "' and '" +
+ RHSOpName + "' of '" + Rec->getName() +
+ "' cannot be tied!");
if (SrcOp.first < Ops.NumDefs)
- PrintFatalError(
- Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" + RHSOpName +
- "' of '" + Rec->getName() + "' cannot be tied!");
+ PrintFatalError(Rec->getLoc(), "Output operands '" + LHSOpName + "' and '" +
+ RHSOpName + "' of '" + Rec->getName() +
+ "' cannot be tied!");
// The constraint has to go on the operand with higher index, i.e.
// the source one. Check there isn't another constraint there
// already.
if (!Ops[SrcOp.first].Constraints[SrcOp.second].isNone())
- PrintFatalError(
- Rec->getLoc(), "Operand '" + SrcOpName + "' of '" + Rec->getName() +
- "' cannot have multiple constraints!");
+ PrintFatalError(Rec->getLoc(), "Operand '" + SrcOpName + "' of '" +
+ Rec->getName() +
+ "' cannot have multiple constraints!");
unsigned DestFlatOpNo = Ops.getFlattenedOperandNumber(DestOp);
auto NewConstraint = CGIOperandList::ConstraintInfo::getTied(DestFlatOpNo);
@@ -384,16 +381,17 @@ static void ParseConstraint(StringRef CStr, CGIOperandList &Ops,
for (const CGIOperandList::OperandInfo &Op : Ops) {
for (unsigned i = 0; i < Op.MINumOperands; i++)
if (Op.Constraints[i] == NewConstraint)
- PrintFatalError(
- Rec->getLoc(), "Operand '" + DestOpName + "' of '" + Rec->getName() +
- "' cannot have multiple operands tied to it!");
+ PrintFatalError(Rec->getLoc(),
+ "Operand '" + DestOpName + "' of '" + Rec->getName() +
+ "' cannot have multiple operands tied to it!");
}
Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint;
}
static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
- if (CStr.empty()) return;
+ if (CStr.empty())
+ return;
StringRef delims(",");
StringRef::size_type bidx, eidx;
@@ -413,15 +411,15 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) {
while (true) {
StringRef OpName;
std::tie(OpName, DisableEncoding) = getToken(DisableEncoding, " ,\t");
- if (OpName.empty()) break;
+ if (OpName.empty())
+ break;
// Figure out which operand this is.
- std::pair<unsigned,unsigned> Op = ParseOperandName(OpName, false);
+ std::pair<unsigned, unsigned> Op = ParseOperandName(OpName, false);
// Mark the operand as not-to-be encoded.
OperandList[Op.first].DoNotEncode[Op.second] = true;
}
-
}
//===----------------------------------------------------------------------===//
@@ -429,27 +427,27 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) {
//===----------------------------------------------------------------------===//
CodeGenInstruction::CodeGenInstruction(Record *R)
- : TheDef(R), Operands(R), InferredFrom(nullptr) {
+ : TheDef(R), Operands(R), InferredFrom(nullptr) {
Namespace = R->getValueAsString("Namespace");
AsmString = std::string(R->getValueAsString("AsmString"));
isPreISelOpcode = R->getValueAsBit("isPreISelOpcode");
- isReturn = R->getValueAsBit("isReturn");
+ isReturn = R->getValueAsBit("isReturn");
isEHScopeReturn = R->getValueAsBit("isEHScopeReturn");
- isBranch = R->getValueAsBit("isBranch");
+ isBranch = R->getValueAsBit("isBranch");
isIndirectBranch = R->getValueAsBit("isIndirectBranch");
- isCompare = R->getValueAsBit("isCompare");
- isMoveImm = R->getValueAsBit("isMoveImm");
- isMoveReg = R->getValueAsBit("isMoveReg");
- isBitcast = R->getValueAsBit("isBitcast");
- isSelect = R->getValueAsBit("isSelect");
- isBarrier = R->getValueAsBit("isBarrier");
- isCall = R->getValueAsBit("isCall");
- isAdd = R->getValueAsBit("isAdd");
- isTrap = R->getValueAsBit("isTrap");
+ isCompare = R->getValueAsBit("isCompare");
+ isMoveImm = R->getValueAsBit("isMoveImm");
+ isMoveReg = R->getValueAsBit("isMoveReg");
+ isBitcast = R->getValueAsBit("isBitcast");
+ isSelect = R->getValueAsBit("isSelect");
+ isBarrier = R->getValueAsBit("isBarrier");
+ isCall = R->getValueAsBit("isCall");
+ isAdd = R->getValueAsBit("isAdd");
+ isTrap = R->getValueAsBit("isTrap");
canFoldAsLoad = R->getValueAsBit("canFoldAsLoad");
- isPredicable = !R->getValueAsBit("isUnpredicable") && (
- Operands.isPredicable || R->getValueAsBit("isPredicable"));
+ isPredicable = !R->getValueAsBit("isUnpredicable") &&
+ (Operands.isPredicable || R->getValueAsBit("isPredicable"));
isConvertibleToThreeAddress = R->getValueAsBit("isConvertibleToThreeAddress");
isCommutable = R->getValueAsBit("isCommutable");
isTerminator = R->getValueAsBit("isTerminator");
@@ -457,7 +455,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
hasDelaySlot = R->getValueAsBit("hasDelaySlot");
usesCustomInserter = R->getValueAsBit("usesCustomInserter");
hasPostISelHook = R->getValueAsBit("hasPostISelHook");
- hasCtrlDep = R->getValueAsBit("hasCtrlDep");
+ hasCtrlDep = R->getValueAsBit("hasCtrlDep");
isNotDuplicable = R->getValueAsBit("isNotDuplicable");
isRegSequence = R->getValueAsBit("isRegSequence");
isExtractSubreg = R->getValueAsBit("isExtractSubreg");
@@ -469,9 +467,9 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
isAuthenticated = R->getValueAsBit("isAuthenticated");
bool Unset;
- mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset);
+ mayLoad = R->getValueAsBitOrUnset("mayLoad", Unset);
mayLoad_Unset = Unset;
- mayStore = R->getValueAsBitOrUnset("mayStore", Unset);
+ mayStore = R->getValueAsBitOrUnset("mayStore", Unset);
mayStore_Unset = Unset;
mayRaiseFPException = R->getValueAsBit("mayRaiseFPException");
hasSideEffects = R->getValueAsBitOrUnset("hasSideEffects", Unset);
@@ -494,8 +492,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
ParseConstraints(R->getValueAsString("Constraints"), Operands, R);
// Parse the DisableEncoding field.
- Operands.ProcessDisableEncoding(
- R->getValueAsString("DisableEncoding"));
+ Operands.ProcessDisableEncoding(R->getValueAsString("DisableEncoding"));
// First check for a ComplexDeprecationPredicate.
if (R->getValue("ComplexDeprecationPredicate")) {
@@ -516,25 +513,25 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
/// HasOneImplicitDefWithKnownVT - If the instruction has at least one
/// implicit def and it has a known VT, return the VT, otherwise return
/// MVT::Other.
-MVT::SimpleValueType CodeGenInstruction::
-HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const {
- if (ImplicitDefs.empty()) return MVT::Other;
+MVT::SimpleValueType CodeGenInstruction::HasOneImplicitDefWithKnownVT(
+ const CodeGenTarget &TargetInfo) const {
+ if (ImplicitDefs.empty())
+ return MVT::Other;
// Check to see if the first implicit def has a resolvable type.
Record *FirstImplicitDef = ImplicitDefs[0];
assert(FirstImplicitDef->isSubClassOf("Register"));
const std::vector<ValueTypeByHwMode> &RegVTs =
- TargetInfo.getRegisterVTs(FirstImplicitDef);
+ TargetInfo.getRegisterVTs(FirstImplicitDef);
if (RegVTs.size() == 1 && RegVTs[0].isSimple())
return RegVTs[0].getSimple().SimpleTy;
return MVT::Other;
}
-
/// FlattenAsmStringVariants - Flatten the specified AsmString to only
/// include text from the specified variant, returning the new string.
-std::string CodeGenInstruction::
-FlattenAsmStringVariants(StringRef Cur, unsigned Variant) {
+std::string CodeGenInstruction::FlattenAsmStringVariants(StringRef Cur,
+ unsigned Variant) {
std::string Res;
for (;;) {
@@ -542,8 +539,8 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) {
size_t VariantsStart = 0;
for (size_t e = Cur.size(); VariantsStart != e; ++VariantsStart)
if (Cur[VariantsStart] == '{' &&
- (VariantsStart == 0 || (Cur[VariantsStart-1] != '$' &&
- Cur[VariantsStart-1] != '\\')))
+ (VariantsStart == 0 ||
+ (Cur[VariantsStart - 1] != '$' && Cur[VariantsStart - 1] != '\\')))
break;
// Add the prefix to the result.
@@ -557,7 +554,7 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) {
size_t VariantsEnd = VariantsStart;
unsigned NestedBraces = 1;
for (size_t e = Cur.size(); VariantsEnd != e; ++VariantsEnd) {
- if (Cur[VariantsEnd] == '}' && Cur[VariantsEnd-1] != '\\') {
+ if (Cur[VariantsEnd] == '}' && Cur[VariantsEnd - 1] != '\\') {
if (--NestedBraces == 0)
break;
} else if (Cur[VariantsEnd] == '{')
diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h
index 4a34c296517c..ca7b1e9f7f4a 100644
--- a/llvm/utils/TableGen/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/CodeGenInstruction.h
@@ -23,324 +23,320 @@
#include <vector>
namespace llvm {
- class Record;
- class DagInit;
- class CodeGenTarget;
+class Record;
+class DagInit;
+class CodeGenTarget;
- class CGIOperandList {
- public:
- class ConstraintInfo {
- enum { None, EarlyClobber, Tied } Kind = None;
- unsigned OtherTiedOperand = 0;
-
- public:
- ConstraintInfo() = default;
-
- static ConstraintInfo getEarlyClobber() {
- ConstraintInfo I;
- I.Kind = EarlyClobber;
- I.OtherTiedOperand = 0;
- return I;
- }
-
- static ConstraintInfo getTied(unsigned Op) {
- ConstraintInfo I;
- I.Kind = Tied;
- I.OtherTiedOperand = Op;
- return I;
- }
-
- bool isNone() const { return Kind == None; }
- bool isEarlyClobber() const { return Kind == EarlyClobber; }
- bool isTied() const { return Kind == Tied; }
+class CGIOperandList {
+public:
+ class ConstraintInfo {
+ enum { None, EarlyClobber, Tied } Kind = None;
+ unsigned OtherTiedOperand = 0;
- unsigned getTiedOperand() const {
- assert(isTied());
- return OtherTiedOperand;
- }
-
- bool operator==(const ConstraintInfo &RHS) const {
- if (Kind != RHS.Kind)
- return false;
- if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand)
- return false;
- return true;
- }
- bool operator!=(const ConstraintInfo &RHS) const {
- return !(*this == RHS);
- }
- };
-
- /// OperandInfo - The information we keep track of for each operand in the
- /// operand list for a tablegen instruction.
- struct OperandInfo {
- /// Rec - The definition this operand is declared as.
- ///
- Record *Rec;
-
- /// Name - If this operand was assigned a symbolic name, this is it,
- /// otherwise, it's empty.
- std::string Name;
-
- /// The names of sub-operands, if given, otherwise empty.
- std::vector<std::string> SubOpNames;
-
- /// PrinterMethodName - The method used to print operands of this type in
- /// the asmprinter.
- std::string PrinterMethodName;
-
- /// The method used to get the machine operand value for binary
- /// encoding, per sub-operand. If empty, uses "getMachineOpValue".
- std::vector<std::string> EncoderMethodNames;
-
- /// OperandType - A value from MCOI::OperandType representing the type of
- /// the operand.
- std::string OperandType;
-
- /// MIOperandNo - Currently (this is meant to be phased out), some logical
- /// operands correspond to multiple MachineInstr operands. In the X86
- /// target for example, one address operand is represented as 4
- /// MachineOperands. Because of this, the operand number in the
- /// OperandList may not match the MachineInstr operand num. Until it
- /// does, this contains the MI operand index of this operand.
- unsigned MIOperandNo;
- unsigned MINumOperands; // The number of operands.
-
- /// DoNotEncode - Bools are set to true in this vector for each operand in
- /// the DisableEncoding list. These should not be emitted by the code
- /// emitter.
- BitVector DoNotEncode;
-
- /// MIOperandInfo - Default MI operand type. Note an operand may be made
- /// up of multiple MI operands.
- DagInit *MIOperandInfo;
-
- /// Constraint info for this operand. This operand can have pieces, so we
- /// track constraint info for each.
- std::vector<ConstraintInfo> Constraints;
-
- OperandInfo(Record *R, const std::string &N, const std::string &PMN,
- const std::string &OT, unsigned MION, unsigned MINO,
- DagInit *MIOI)
- : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN),
- EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION),
- MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI),
- Constraints(MINO) {}
-
- /// getTiedOperand - If this operand is tied to another one, return the
- /// other operand number. Otherwise, return -1.
- int getTiedRegister() const {
- for (unsigned j = 0, e = Constraints.size(); j != e; ++j) {
- const CGIOperandList::ConstraintInfo &CI = Constraints[j];
- if (CI.isTied()) return CI.getTiedOperand();
- }
- return -1;
- }
- };
-
- CGIOperandList(Record *D);
-
- Record *TheDef; // The actual record containing this OperandList.
+ public:
+ ConstraintInfo() = default;
- /// NumDefs - Number of def operands declared, this is the number of
- /// elements in the instruction's (outs) list.
- ///
- unsigned NumDefs;
-
- /// OperandList - The list of declared operands, along with their declared
- /// type (which is a record).
- std::vector<OperandInfo> OperandList;
-
- /// SubOpAliases - List of alias names for suboperands.
- StringMap<std::pair<unsigned, unsigned>> SubOpAliases;
-
- // Information gleaned from the operand list.
- bool isPredicable;
- bool hasOptionalDef;
- bool isVariadic;
-
- // Provide transparent accessors to the operand list.
- bool empty() const { return OperandList.empty(); }
- unsigned size() const { return OperandList.size(); }
- const OperandInfo &operator[](unsigned i) const { return OperandList[i]; }
- OperandInfo &operator[](unsigned i) { return OperandList[i]; }
- OperandInfo &back() { return OperandList.back(); }
- const OperandInfo &back() const { return OperandList.back(); }
-
- typedef std::vector<OperandInfo>::iterator iterator;
- typedef std::vector<OperandInfo>::const_iterator const_iterator;
- iterator begin() { return OperandList.begin(); }
- const_iterator begin() const { return OperandList.begin(); }
- iterator end() { return OperandList.end(); }
- const_iterator end() const { return OperandList.end(); }
-
- /// getOperandNamed - Return the index of the operand with the specified
- /// non-empty name. If the instruction does not have an operand with the
- /// specified name, abort.
- unsigned getOperandNamed(StringRef Name) const;
-
- /// hasOperandNamed - Query whether the instruction has an operand of the
- /// given name. If so, return true and set OpIdx to the index of the
- /// operand. Otherwise, return false.
- bool hasOperandNamed(StringRef Name, unsigned &OpIdx) const;
-
- bool hasSubOperandAlias(StringRef Name,
- std::pair<unsigned, unsigned> &SubOp) const;
-
- /// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar",
- /// where $foo is a whole operand and $foo.bar refers to a suboperand.
- /// This aborts if the name is invalid. If AllowWholeOp is true, references
- /// to operands with suboperands are allowed, otherwise not.
- std::pair<unsigned,unsigned> ParseOperandName(StringRef Op,
- bool AllowWholeOp = true);
-
- /// getFlattenedOperandNumber - Flatten a operand/suboperand pair into a
- /// flat machineinstr operand #.
- unsigned getFlattenedOperandNumber(std::pair<unsigned,unsigned> Op) const {
- return OperandList[Op.first].MIOperandNo + Op.second;
+ static ConstraintInfo getEarlyClobber() {
+ ConstraintInfo I;
+ I.Kind = EarlyClobber;
+ I.OtherTiedOperand = 0;
+ return I;
}
- /// getSubOperandNumber - Unflatten a operand number into an
- /// operand/suboperand pair.
- std::pair<unsigned,unsigned> getSubOperandNumber(unsigned Op) const {
- for (unsigned i = 0; ; ++i) {
- assert(i < OperandList.size() && "Invalid flat operand #");
- if (OperandList[i].MIOperandNo+OperandList[i].MINumOperands > Op)
- return std::make_pair(i, Op-OperandList[i].MIOperandNo);
- }
+ static ConstraintInfo getTied(unsigned Op) {
+ ConstraintInfo I;
+ I.Kind = Tied;
+ I.OtherTiedOperand = Op;
+ return I;
}
+ bool isNone() const { return Kind == None; }
+ bool isEarlyClobber() const { return Kind == EarlyClobber; }
+ bool isTied() const { return Kind == Tied; }
- /// isFlatOperandNotEmitted - Return true if the specified flat operand #
- /// should not be emitted with the code emitter.
- bool isFlatOperandNotEmitted(unsigned FlatOpNo) const {
- std::pair<unsigned,unsigned> Op = getSubOperandNumber(FlatOpNo);
- if (OperandList[Op.first].DoNotEncode.size() > Op.second)
- return OperandList[Op.first].DoNotEncode[Op.second];
- return false;
+ unsigned getTiedOperand() const {
+ assert(isTied());
+ return OtherTiedOperand;
}
- void ProcessDisableEncoding(StringRef Value);
- };
-
-
- class CodeGenInstruction {
- public:
- Record *TheDef; // The actual record defining this instruction.
- StringRef Namespace; // The namespace the instruction is in.
-
- /// AsmString - The format string used to emit a .s file for the
- /// instruction.
- std::string AsmString;
-
- /// Operands - This is information about the (ins) and (outs) list specified
- /// to the instruction.
- CGIOperandList Operands;
-
- /// ImplicitDefs/ImplicitUses - These are lists of registers that are
- /// implicitly defined and used by the instruction.
- std::vector<Record*> ImplicitDefs, ImplicitUses;
-
- // Various boolean values we track for the instruction.
- bool isPreISelOpcode : 1;
- bool isReturn : 1;
- bool isEHScopeReturn : 1;
- bool isBranch : 1;
- bool isIndirectBranch : 1;
- bool isCompare : 1;
- bool isMoveImm : 1;
- bool isMoveReg : 1;
- bool isBitcast : 1;
- bool isSelect : 1;
- bool isBarrier : 1;
- bool isCall : 1;
- bool isAdd : 1;
- bool isTrap : 1;
- bool canFoldAsLoad : 1;
- bool mayLoad : 1;
- bool mayLoad_Unset : 1;
- bool mayStore : 1;
- bool mayStore_Unset : 1;
- bool mayRaiseFPException : 1;
- bool isPredicable : 1;
- bool isConvertibleToThreeAddress : 1;
- bool isCommutable : 1;
- bool isTerminator : 1;
- bool isReMaterializable : 1;
- bool hasDelaySlot : 1;
- bool usesCustomInserter : 1;
- bool hasPostISelHook : 1;
- bool hasCtrlDep : 1;
- bool isNotDuplicable : 1;
- bool hasSideEffects : 1;
- bool hasSideEffects_Unset : 1;
- bool isAsCheapAsAMove : 1;
- bool hasExtraSrcRegAllocReq : 1;
- bool hasExtraDefRegAllocReq : 1;
- bool isCodeGenOnly : 1;
- bool isPseudo : 1;
- bool isMeta : 1;
- bool isRegSequence : 1;
- bool isExtractSubreg : 1;
- bool isInsertSubreg : 1;
- bool isConvergent : 1;
- bool hasNoSchedulingInfo : 1;
- bool FastISelShouldIgnore : 1;
- bool hasChain : 1;
- bool hasChain_Inferred : 1;
- bool variadicOpsAreDefs : 1;
- bool isAuthenticated : 1;
-
- std::string DeprecatedReason;
- bool HasComplexDeprecationPredicate;
-
- /// Are there any undefined flags?
- bool hasUndefFlags() const {
- return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset;
- }
-
- // The record used to infer instruction flags, or NULL if no flag values
- // have been inferred.
- Record *InferredFrom;
-
- // The enum value assigned by CodeGenTarget::computeInstrsByEnum.
- mutable unsigned EnumVal;
-
- CodeGenInstruction(Record *R);
-
- /// HasOneImplicitDefWithKnownVT - If the instruction has at least one
- /// implicit def and it has a known VT, return the VT, otherwise return
- /// MVT::Other.
- MVT::SimpleValueType
- HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const;
-
-
- /// FlattenAsmStringVariants - Flatten the specified AsmString to only
- /// include text from the specified variant, returning the new string.
- static std::string FlattenAsmStringVariants(StringRef AsmString,
- unsigned Variant);
-
- // Is the specified operand in a generic instruction implicitly a pointer.
- // This can be used on intructions that use typeN or ptypeN to identify
- // operands that should be considered as pointers even though SelectionDAG
- // didn't make a distinction between integer and pointers.
- bool isInOperandAPointer(unsigned i) const {
- return isOperandImpl("InOperandList", i, "IsPointer");
+ bool operator==(const ConstraintInfo &RHS) const {
+ if (Kind != RHS.Kind)
+ return false;
+ if (Kind == Tied && OtherTiedOperand != RHS.OtherTiedOperand)
+ return false;
+ return true;
}
+ bool operator!=(const ConstraintInfo &RHS) const { return !(*this == RHS); }
+ };
- bool isOutOperandAPointer(unsigned i) const {
- return isOperandImpl("OutOperandList", i, "IsPointer");
+ /// OperandInfo - The information we keep track of for each operand in the
+ /// operand list for a tablegen instruction.
+ struct OperandInfo {
+ /// Rec - The definition this operand is declared as.
+ ///
+ Record *Rec;
+
+ /// Name - If this operand was assigned a symbolic name, this is it,
+ /// otherwise, it's empty.
+ std::string Name;
+
+ /// The names of sub-operands, if given, otherwise empty.
+ std::vector<std::string> SubOpNames;
+
+ /// PrinterMethodName - The method used to print operands of this type in
+ /// the asmprinter.
+ std::string PrinterMethodName;
+
+ /// The method used to get the machine operand value for binary
+ /// encoding, per sub-operand. If empty, uses "getMachineOpValue".
+ std::vector<std::string> EncoderMethodNames;
+
+ /// OperandType - A value from MCOI::OperandType representing the type of
+ /// the operand.
+ std::string OperandType;
+
+ /// MIOperandNo - Currently (this is meant to be phased out), some logical
+ /// operands correspond to multiple MachineInstr operands. In the X86
+ /// target for example, one address operand is represented as 4
+ /// MachineOperands. Because of this, the operand number in the
+ /// OperandList may not match the MachineInstr operand num. Until it
+ /// does, this contains the MI operand index of this operand.
+ unsigned MIOperandNo;
+ unsigned MINumOperands; // The number of operands.
+
+ /// DoNotEncode - Bools are set to true in this vector for each operand in
+ /// the DisableEncoding list. These should not be emitted by the code
+ /// emitter.
+ BitVector DoNotEncode;
+
+ /// MIOperandInfo - Default MI operand type. Note an operand may be made
+ /// up of multiple MI operands.
+ DagInit *MIOperandInfo;
+
+ /// Constraint info for this operand. This operand can have pieces, so we
+ /// track constraint info for each.
+ std::vector<ConstraintInfo> Constraints;
+
+ OperandInfo(Record *R, const std::string &N, const std::string &PMN,
+ const std::string &OT, unsigned MION, unsigned MINO,
+ DagInit *MIOI)
+ : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN),
+ EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION),
+ MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI),
+ Constraints(MINO) {}
+
+ /// getTiedOperand - If this operand is tied to another one, return the
+ /// other operand number. Otherwise, return -1.
+ int getTiedRegister() const {
+ for (unsigned j = 0, e = Constraints.size(); j != e; ++j) {
+ const CGIOperandList::ConstraintInfo &CI = Constraints[j];
+ if (CI.isTied())
+ return CI.getTiedOperand();
+ }
+ return -1;
}
+ };
- /// Check if the operand is required to be an immediate.
- bool isInOperandImmArg(unsigned i) const {
- return isOperandImpl("InOperandList", i, "IsImmediate");
+ CGIOperandList(Record *D);
+
+ Record *TheDef; // The actual record containing this OperandList.
+
+ /// NumDefs - Number of def operands declared, this is the number of
+ /// elements in the instruction's (outs) list.
+ ///
+ unsigned NumDefs;
+
+ /// OperandList - The list of declared operands, along with their declared
+ /// type (which is a record).
+ std::vector<OperandInfo> OperandList;
+
+ /// SubOpAliases - List of alias names for suboperands.
+ StringMap<std::pair<unsigned, unsigned>> SubOpAliases;
+
+ // Information gleaned from the operand list.
+ bool isPredicable;
+ bool hasOptionalDef;
+ bool isVariadic;
+
+ // Provide transparent accessors to the operand list.
+ bool empty() const { return OperandList.empty(); }
+ unsigned size() const { return OperandList.size(); }
+ const OperandInfo &operator[](unsigned i) const { return OperandList[i]; }
+ OperandInfo &operator[](unsigned i) { return OperandList[i]; }
+ OperandInfo &back() { return OperandList.back(); }
+ const OperandInfo &back() const { return OperandList.back(); }
+
+ typedef std::vector<OperandInfo>::iterator iterator;
+ typedef std::vector<OperandInfo>::const_iterator const_iterator;
+ iterator begin() { return OperandList.begin(); }
+ const_iterator begin() const { return OperandList.begin(); }
+ iterator end() { return OperandList.end(); }
+ const_iterator end() const { return OperandList.end(); }
+
+ /// getOperandNamed - Return the index of the operand with the specified
+ /// non-empty name. If the instruction does not have an operand with the
+ /// specified name, abort.
+ unsigned getOperandNamed(StringRef Name) const;
+
+ /// hasOperandNamed - Query whether the instruction has an operand of the
+ /// given name. If so, return true and set OpIdx to the index of the
+ /// operand. Otherwise, return false.
+ bool hasOperandNamed(StringRef Name, unsigned &OpIdx) const;
+
+ bool hasSubOperandAlias(StringRef Name,
+ std::pair<unsigned, unsigned> &SubOp) const;
+
+ /// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar",
+ /// where $foo is a whole operand and $foo.bar refers to a suboperand.
+ /// This aborts if the name is invalid. If AllowWholeOp is true, references
+ /// to operands with suboperands are allowed, otherwise not.
+ std::pair<unsigned, unsigned> ParseOperandName(StringRef Op,
+ bool AllowWholeOp = true);
+
+ /// getFlattenedOperandNumber - Flatten a operand/suboperand pair into a
+ /// flat machineinstr operand #.
+ unsigned getFlattenedOperandNumber(std::pair<unsigned, unsigned> Op) const {
+ return OperandList[Op.first].MIOperandNo + Op.second;
+ }
+
+ /// getSubOperandNumber - Unflatten a operand number into an
+ /// operand/suboperand pair.
+ std::pair<unsigned, unsigned> getSubOperandNumber(unsigned Op) const {
+ for (unsigned i = 0;; ++i) {
+ assert(i < OperandList.size() && "Invalid flat operand #");
+ if (OperandList[i].MIOperandNo + OperandList[i].MINumOperands > Op)
+ return std::make_pair(i, Op - OperandList[i].MIOperandNo);
}
-
- private:
- bool isOperandImpl(StringRef OpListName, unsigned i,
- StringRef PropertyName) const;
- };
+ }
+
+ /// isFlatOperandNotEmitted - Return true if the specified flat operand #
+ /// should not be emitted with the code emitter.
+ bool isFlatOperandNotEmitted(unsigned FlatOpNo) const {
+ std::pair<unsigned, unsigned> Op = getSubOperandNumber(FlatOpNo);
+ if (OperandList[Op.first].DoNotEncode.size() > Op.second)
+ return OperandList[Op.first].DoNotEncode[Op.second];
+ return false;
+ }
+
+ void ProcessDisableEncoding(StringRef Value);
+};
+
+class CodeGenInstruction {
+public:
+ Record *TheDef; // The actual record defining this instruction.
+ StringRef Namespace; // The namespace the instruction is in.
+
+ /// AsmString - The format string used to emit a .s file for the
+ /// instruction.
+ std::string AsmString;
+
+ /// Operands - This is information about the (ins) and (outs) list specified
+ /// to the instruction.
+ CGIOperandList Operands;
+
+ /// ImplicitDefs/ImplicitUses - These are lists of registers that are
+ /// implicitly defined and used by the instruction.
+ std::vector<Record *> ImplicitDefs, ImplicitUses;
+
+ // Various boolean values we track for the instruction.
+ bool isPreISelOpcode : 1;
+ bool isReturn : 1;
+ bool isEHScopeReturn : 1;
+ bool isBranch : 1;
+ bool isIndirectBranch : 1;
+ bool isCompare : 1;
+ bool isMoveImm : 1;
+ bool isMoveReg : 1;
+ bool isBitcast : 1;
+ bool isSelect : 1;
+ bool isBarrier : 1;
+ bool isCall : 1;
+ bool isAdd : 1;
+ bool isTrap : 1;
+ bool canFoldAsLoad : 1;
+ bool mayLoad : 1;
+ bool mayLoad_Unset : 1;
+ bool mayStore : 1;
+ bool mayStore_Unset : 1;
+ bool mayRaiseFPException : 1;
+ bool isPredicable : 1;
+ bool isConvertibleToThreeAddress : 1;
+ bool isCommutable : 1;
+ bool isTerminator : 1;
+ bool isReMaterializable : 1;
+ bool hasDelaySlot : 1;
+ bool usesCustomInserter : 1;
+ bool hasPostISelHook : 1;
+ bool hasCtrlDep : 1;
+ bool isNotDuplicable : 1;
+ bool hasSideEffects : 1;
+ bool hasSideEffects_Unset : 1;
+ bool isAsCheapAsAMove : 1;
+ bool hasExtraSrcRegAllocReq : 1;
+ bool hasExtraDefRegAllocReq : 1;
+ bool isCodeGenOnly : 1;
+ bool isPseudo : 1;
+ bool isMeta : 1;
+ bool isRegSequence : 1;
+ bool isExtractSubreg : 1;
+ bool isInsertSubreg : 1;
+ bool isConvergent : 1;
+ bool hasNoSchedulingInfo : 1;
+ bool FastISelShouldIgnore : 1;
+ bool hasChain : 1;
+ bool hasChain_Inferred : 1;
+ bool variadicOpsAreDefs : 1;
+ bool isAuthenticated : 1;
+
+ std::string DeprecatedReason;
+ bool HasComplexDeprecationPredicate;
+
+ /// Are there any undefined flags?
+ bool hasUndefFlags() const {
+ return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset;
+ }
+
+ // The record used to infer instruction flags, or NULL if no flag values
+ // have been inferred.
+ Record *InferredFrom;
+
+ // The enum value assigned by CodeGenTarget::computeInstrsByEnum.
+ mutable unsigned EnumVal;
+
+ CodeGenInstruction(Record *R);
+
+ /// HasOneImplicitDefWithKnownVT - If the instruction has at least one
+ /// implicit def and it has a known VT, return the VT, otherwise return
+ /// MVT::Other.
+ MVT::SimpleValueType
+ HasOneImplicitDefWithKnownVT(const CodeGenTarget &TargetInfo) const;
+
+ /// FlattenAsmStringVariants - Flatten the specified AsmString to only
+ /// include text from the specified variant, returning the new string.
+ static std::string FlattenAsmStringVariants(StringRef AsmString,
+ unsigned Variant);
+
+ // Is the specified operand in a generic instruction implicitly a pointer.
+ // This can be used on intructions that use typeN or ptypeN to identify
+ // operands that should be considered as pointers even though SelectionDAG
+ // didn't make a distinction between integer and pointers.
+ bool isInOperandAPointer(unsigned i) const {
+ return isOperandImpl("InOperandList", i, "IsPointer");
+ }
+
+ bool isOutOperandAPointer(unsigned i) const {
+ return isOperandImpl("OutOperandList", i, "IsPointer");
+ }
+
+ /// Check if the operand is required to be an immediate.
+ bool isInOperandImmArg(unsigned i) const {
+ return isOperandImpl("InOperandList", i, "IsImmediate");
+ }
+
+private:
+ bool isOperandImpl(StringRef OpListName, unsigned i,
+ StringRef PropertyName) const;
+};
} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h
index f3452f5acea8..da9e386d64d1 100644
--- a/llvm/utils/TableGen/CodeGenIntrinsics.h
+++ b/llvm/utils/TableGen/CodeGenIntrinsics.h
@@ -25,12 +25,12 @@ class Record;
class RecordKeeper;
struct CodeGenIntrinsic {
- Record *TheDef; // The actual record defining this intrinsic.
- std::string Name; // The name of the LLVM function "llvm.bswap.i32"
- std::string EnumName; // The name of the enum "bswap_i32"
+ Record *TheDef; // The actual record defining this intrinsic.
+ std::string Name; // The name of the LLVM function "llvm.bswap.i32"
+ std::string EnumName; // The name of the enum "bswap_i32"
std::string ClangBuiltinName; // Name of the corresponding GCC builtin, or "".
- std::string MSBuiltinName; // Name of the corresponding MS builtin, or "".
- std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics.
+ std::string MSBuiltinName; // Name of the corresponding MS builtin, or "".
+ std::string TargetPrefix; // Target prefix, e.g. "ppc" for t-s intrinsics.
/// This structure holds the return values and parameter values of an
/// intrinsic. If the number of return values is > 1, then the intrinsic
@@ -136,9 +136,7 @@ struct CodeGenIntrinsic {
void addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V = 0);
- bool hasProperty(enum SDNP Prop) const {
- return Properties & (1 << Prop);
- }
+ bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); }
/// Goes through all IntrProperties that have IsDefault
/// value set and sets the property.
@@ -182,6 +180,6 @@ public:
return Intrinsics[Pos];
}
};
-}
+} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index fd375735dfd2..03af0b49ba97 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -80,9 +80,9 @@
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;
-typedef std::map<std::string, std::vector<Record*> > InstrRelMapTy;
+typedef std::map<std::string, std::vector<Record *>> InstrRelMapTy;
-typedef std::map<std::vector<Init*>, std::vector<Record*> > RowInstrMapTy;
+typedef std::map<std::vector<Init *>, std::vector<Record *>> RowInstrMapTy;
namespace {
@@ -95,10 +95,10 @@ private:
ListInit *RowFields;
ListInit *ColFields;
ListInit *KeyCol;
- std::vector<ListInit*> ValueCols;
+ std::vector<ListInit *> ValueCols;
public:
- InstrMap(Record* MapRec) {
+ InstrMap(Record *MapRec) {
Name = std::string(MapRec->getName());
// FilterClass - It's used to reduce the search space only to the
@@ -130,7 +130,8 @@ public:
// Each instruction map must specify at least one column for it to be valid.
if (ColValList->empty())
PrintFatalError(MapRec->getLoc(), "InstrMapping record `" +
- MapRec->getName() + "' has empty " + "`ValueCols' field!");
+ MapRec->getName() + "' has empty " +
+ "`ValueCols' field!");
for (Init *I : ColValList->getValues()) {
auto *ColI = cast<ListInit>(I);
@@ -138,9 +139,10 @@ public:
// Make sure that all the sub-lists in 'ValueCols' have same number of
// elements as the fields in 'ColFields'.
if (ColI->size() != ColFields->size())
- PrintFatalError(MapRec->getLoc(), "Record `" + MapRec->getName() +
- "', field `ValueCols' entries don't match with " +
- " the entries in 'ColFields'!");
+ PrintFatalError(MapRec->getLoc(),
+ "Record `" + MapRec->getName() +
+ "', field `ValueCols' entries don't match with " +
+ " the entries in 'ColFields'!");
ValueCols.push_back(ColI);
}
}
@@ -155,13 +157,10 @@ public:
ListInit *getKeyCol() const { return KeyCol; }
- const std::vector<ListInit*> &getValueCols() const {
- return ValueCols;
- }
+ const std::vector<ListInit *> &getValueCols() const { return ValueCols; }
};
} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// class MapTableEmitter : It builds the instruction relation maps using
// the information provided in InstrMapping records. It outputs these
@@ -171,26 +170,26 @@ public:
namespace {
class MapTableEmitter {
private:
-// std::string TargetName;
+ // std::string TargetName;
const CodeGenTarget &Target;
// InstrMapDesc - InstrMapping record to be processed.
InstrMap InstrMapDesc;
// InstrDefs - list of instructions filtered using FilterClass defined
// in InstrMapDesc.
- std::vector<Record*> InstrDefs;
+ std::vector<Record *> InstrDefs;
// RowInstrMap - maps RowFields values to the instructions. It's keyed by the
// values of the row fields and contains vector of records as values.
RowInstrMapTy RowInstrMap;
// KeyInstrVec - list of key instructions.
- std::vector<Record*> KeyInstrVec;
- DenseMap<Record*, std::vector<Record*> > MapTable;
+ std::vector<Record *> KeyInstrVec;
+ DenseMap<Record *, std::vector<Record *>> MapTable;
public:
- MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec):
- Target(Target), InstrMapDesc(IMRec) {
+ MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec)
+ : Target(Target), InstrMapDesc(IMRec) {
const std::string &FilterClass = InstrMapDesc.getFilterClass();
InstrDefs = Records.getAllDerivedDefinitions(FilterClass);
}
@@ -199,7 +198,7 @@ public:
// Returns true if an instruction is a key instruction, i.e., its ColFields
// have same values as KeyCol.
- bool isKeyColInstr(Record* CurInstr);
+ bool isKeyColInstr(Record *CurInstr);
// Find column instruction corresponding to a key instruction based on the
// constraints for that column.
@@ -215,11 +214,9 @@ public:
// Lookup functions to query binary search tables.
void emitMapFuncBody(raw_ostream &OS, unsigned TableSize);
-
};
} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// Process all the instructions that model this relation (alreday present in
// InstrDefs) and insert them into RowInstrMap which is keyed by the values of
@@ -230,14 +227,15 @@ public:
void MapTableEmitter::buildRowInstrMap() {
for (Record *CurInstr : InstrDefs) {
- std::vector<Init*> KeyValue;
+ std::vector<Init *> KeyValue;
ListInit *RowFields = InstrMapDesc.getRowFields();
for (Init *RowField : RowFields->getValues()) {
RecordVal *RecVal = CurInstr->getValue(RowField);
if (RecVal == nullptr)
- PrintFatalError(CurInstr->getLoc(), "No value " +
- RowField->getAsString() + " found in \"" +
- CurInstr->getName() + "\" instruction description.");
+ PrintFatalError(CurInstr->getLoc(),
+ "No value " + RowField->getAsString() + " found in \"" +
+ CurInstr->getName() +
+ "\" instruction description.");
Init *CurInstrVal = RecVal->getValue();
KeyValue.push_back(CurInstrVal);
}
@@ -256,14 +254,14 @@ void MapTableEmitter::buildRowInstrMap() {
// Return true if an instruction is a KeyCol instruction.
//===----------------------------------------------------------------------===//
-bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
+bool MapTableEmitter::isKeyColInstr(Record *CurInstr) {
ListInit *ColFields = InstrMapDesc.getColFields();
ListInit *KeyCol = InstrMapDesc.getKeyCol();
// Check if the instruction is a KeyCol instruction.
bool MatchFound = true;
- for (unsigned j = 0, endCF = ColFields->size();
- (j < endCF) && MatchFound; j++) {
+ for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
+ j++) {
RecordVal *ColFieldName = CurInstr->getValue(ColFields->getElement(j));
std::string CurInstrVal = ColFieldName->getValue()->getAsUnquotedString();
std::string KeyColValue = KeyCol->getElement(j)->getAsUnquotedString();
@@ -280,10 +278,10 @@ bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
void MapTableEmitter::buildMapTable() {
// Find column instructions for a given key based on the ColField
// constraints.
- const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+ const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
unsigned NumOfCols = ValueCols.size();
for (Record *CurKeyInstr : KeyInstrVec) {
- std::vector<Record*> ColInstrVec(NumOfCols);
+ std::vector<Record *> ColInstrVec(NumOfCols);
// Find the column instruction based on the constraints for the column.
for (unsigned ColIdx = 0; ColIdx < NumOfCols; ColIdx++) {
@@ -302,7 +300,7 @@ void MapTableEmitter::buildMapTable() {
Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
ListInit *CurValueCol) {
ListInit *RowFields = InstrMapDesc.getRowFields();
- std::vector<Init*> KeyValue;
+ std::vector<Init *> KeyValue;
// Construct KeyValue using KeyInstr's values for RowFields.
for (Init *RowField : RowFields->getValues()) {
@@ -314,15 +312,15 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
// in RowInstrMap. We search through these instructions to find a match
// for the current column, i.e., the instruction which has the same values
// as CurValueCol for all the fields in ColFields.
- const std::vector<Record*> &RelatedInstrVec = RowInstrMap[KeyValue];
+ const std::vector<Record *> &RelatedInstrVec = RowInstrMap[KeyValue];
ListInit *ColFields = InstrMapDesc.getColFields();
Record *MatchInstr = nullptr;
for (llvm::Record *CurInstr : RelatedInstrVec) {
bool MatchFound = true;
- for (unsigned j = 0, endCF = ColFields->size();
- (j < endCF) && MatchFound; j++) {
+ for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
+ j++) {
Init *ColFieldJ = ColFields->getElement(j);
Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue();
std::string CurInstrVal = CurInstrInit->getAsUnquotedString();
@@ -360,21 +358,21 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
- ArrayRef<const CodeGenInstruction*> NumberedInstructions =
- Target.getInstructionsByEnumValue();
+ ArrayRef<const CodeGenInstruction *> NumberedInstructions =
+ Target.getInstructionsByEnumValue();
StringRef Namespace = Target.getInstNamespace();
- const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+ const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
unsigned NumCol = ValueCols.size();
unsigned TotalNumInstr = NumberedInstructions.size();
unsigned TableSize = 0;
- OS << "static const uint16_t "<<InstrMapDesc.getName();
+ OS << "static const uint16_t " << InstrMapDesc.getName();
// Number of columns in the table are NumCol+1 because key instructions are
// emitted as first column.
- OS << "Table[]["<< NumCol+1 << "] = {\n";
+ OS << "Table[][" << NumCol + 1 << "] = {\n";
for (unsigned i = 0; i < TotalNumInstr; i++) {
Record *CurInstr = NumberedInstructions[i]->TheDef;
- std::vector<Record*> ColInstrs = MapTable[CurInstr];
+ std::vector<Record *> ColInstrs = MapTable[CurInstr];
std::string OutStr;
unsigned RelExists = 0;
if (!ColInstrs.empty()) {
@@ -385,19 +383,23 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
OutStr += Namespace;
OutStr += "::";
OutStr += ColInstrs[j]->getName();
- } else { OutStr += ", (uint16_t)-1U";}
+ } else {
+ OutStr += ", (uint16_t)-1U";
+ }
}
if (RelExists) {
OS << " { " << Namespace << "::" << CurInstr->getName();
- OS << OutStr <<" },\n";
+ OS << OutStr << " },\n";
TableSize++;
}
}
}
if (!TableSize) {
- OS << " { " << Namespace << "::" << "INSTRUCTION_LIST_END, ";
- OS << Namespace << "::" << "INSTRUCTION_LIST_END }";
+ OS << " { " << Namespace << "::"
+ << "INSTRUCTION_LIST_END, ";
+ OS << Namespace << "::"
+ << "INSTRUCTION_LIST_END }";
}
OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n";
return TableSize;
@@ -430,11 +432,10 @@ void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) {
// Emit functions to query relation tables.
//===----------------------------------------------------------------------===//
-void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
- unsigned TableSize) {
+void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) {
ListInit *ColFields = InstrMapDesc.getColFields();
- const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+ const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
// Emit binary search algorithm to locate instructions in the
// relation table. If found, return opcode value from the appropriate column
@@ -455,14 +456,13 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
}
OS << ")\n";
OS << " return " << InstrMapDesc.getName();
- OS << "Table[mid]["<<i+1<<"];\n";
+ OS << "Table[mid][" << i + 1 << "];\n";
}
OS << " return -1;";
- }
- else
+ } else
OS << " return " << InstrMapDesc.getName() << "Table[mid][1];\n";
- OS <<"}\n\n";
+ OS << "}\n\n";
}
//===----------------------------------------------------------------------===//
@@ -477,9 +477,9 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
// to pass another input to indicate the column to be selected.
ListInit *ColFields = InstrMapDesc.getColFields();
- const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
- OS << "// "<< InstrMapDesc.getName() << "\nLLVM_READONLY\n";
- OS << "int "<< InstrMapDesc.getName() << "(uint16_t Opcode";
+ const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+ OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n";
+ OS << "int " << InstrMapDesc.getName() << "(uint16_t Opcode";
if (ValueCols.size() > 1) {
for (Init *CF : ColFields->getValues()) {
std::string ColName = CF->getAsUnquotedString();
@@ -501,9 +501,9 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
- std::vector<Record*> InstrMapVec;
+ std::vector<Record *> InstrMapVec;
InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
- std::map<std::string, std::vector<Init*> > ColFieldValueMap;
+ std::map<std::string, std::vector<Init *>> ColFieldValueMap;
// Iterate over all InstrMapping records and create a map between column
// fields and their possible values across all records.
@@ -511,20 +511,22 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
ListInit *ColFields;
ColFields = CurMap->getValueAsListInit("ColFields");
ListInit *List = CurMap->getValueAsListInit("ValueCols");
- std::vector<ListInit*> ValueCols;
+ std::vector<ListInit *> ValueCols;
unsigned ListSize = List->size();
for (unsigned j = 0; j < ListSize; j++) {
auto *ListJ = cast<ListInit>(List->getElement(j));
if (ListJ->size() != ColFields->size())
- PrintFatalError("Record `" + CurMap->getName() + "', field "
- "`ValueCols' entries don't match with the entries in 'ColFields' !");
+ PrintFatalError("Record `" + CurMap->getName() +
+ "', field "
+ "`ValueCols' entries don't match with the entries in "
+ "'ColFields' !");
ValueCols.push_back(ListJ);
}
for (unsigned j = 0, endCF = ColFields->size(); j < endCF; j++) {
- for (unsigned k = 0; k < ListSize; k++){
+ for (unsigned k = 0; k < ListSize; k++) {
std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
ColFieldValueMap[ColName].push_back((ValueCols[k])->getElement(j));
}
@@ -532,14 +534,14 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
}
for (auto &Entry : ColFieldValueMap) {
- std::vector<Init*> FieldValues = Entry.second;
+ std::vector<Init *> FieldValues = Entry.second;
// Delete duplicate entries from ColFieldValueMap
for (unsigned i = 0; i < FieldValues.size() - 1; i++) {
Init *CurVal = FieldValues[i];
- for (unsigned j = i+1; j < FieldValues.size(); j++) {
+ for (unsigned j = i + 1; j < FieldValues.size(); j++) {
if (CurVal == FieldValues[j]) {
- FieldValues.erase(FieldValues.begin()+j);
+ FieldValues.erase(FieldValues.begin() + j);
--j;
}
}
@@ -566,7 +568,7 @@ namespace llvm {
void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
CodeGenTarget Target(Records);
StringRef NameSpace = Target.getInstNamespace();
- std::vector<Record*> InstrMapVec;
+ std::vector<Record *> InstrMapVec;
InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
if (InstrMapVec.empty())
@@ -603,4 +605,4 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
OS << "#endif // GET_INSTRMAP_INFO\n\n";
}
-} // End llvm namespace
+} // namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp
index d1abdb74ea4a..7d266c8896d8 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/CodeGenRegisters.cpp
@@ -48,7 +48,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum)
- : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) {
+ : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) {
Name = std::string(R->getName());
if (R->getValue("Namespace"))
Namespace = std::string(R->getValueAsString("Namespace"));
@@ -74,7 +74,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
if (!TheDef)
return;
- std::vector<Record*> Comps = TheDef->getValueAsListOfDefs("ComposedOf");
+ std::vector<Record *> Comps = TheDef->getValueAsListOfDefs("ComposedOf");
if (!Comps.empty()) {
if (Comps.size() != 2)
PrintFatalError(TheDef->getLoc(),
@@ -86,13 +86,13 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
PrintFatalError(TheDef->getLoc(), "Ambiguous ComposedOf entries");
}
- std::vector<Record*> Parts =
- TheDef->getValueAsListOfDefs("CoveringSubRegIndices");
+ std::vector<Record *> Parts =
+ TheDef->getValueAsListOfDefs("CoveringSubRegIndices");
if (!Parts.empty()) {
if (Parts.size() < 2)
PrintFatalError(TheDef->getLoc(),
"CoveredBySubRegs must have two or more entries");
- SmallVector<CodeGenSubRegIndex*, 8> IdxParts;
+ SmallVector<CodeGenSubRegIndex *, 8> IdxParts;
for (Record *Part : Parts)
IdxParts.push_back(RegBank.getSubRegIdx(Part));
setConcatenationOf(IdxParts);
@@ -117,17 +117,19 @@ LaneBitmask CodeGenSubRegIndex::computeLaneMask() const {
}
void CodeGenSubRegIndex::setConcatenationOf(
- ArrayRef<CodeGenSubRegIndex*> Parts) {
+ ArrayRef<CodeGenSubRegIndex *> Parts) {
if (ConcatenationOf.empty())
ConcatenationOf.assign(Parts.begin(), Parts.end());
else
- assert(std::equal(Parts.begin(), Parts.end(),
- ConcatenationOf.begin()) && "parts consistent");
+ assert(std::equal(Parts.begin(), Parts.end(), ConcatenationOf.begin()) &&
+ "parts consistent");
}
void CodeGenSubRegIndex::computeConcatTransitiveClosure() {
- for (SmallVectorImpl<CodeGenSubRegIndex*>::iterator
- I = ConcatenationOf.begin(); I != ConcatenationOf.end(); /*empty*/) {
+ for (SmallVectorImpl<CodeGenSubRegIndex *>::iterator I =
+ ConcatenationOf.begin();
+ I != ConcatenationOf.end();
+ /*empty*/) {
CodeGenSubRegIndex *SubIdx = *I;
SubIdx->computeConcatTransitiveClosure();
#ifndef NDEBUG
@@ -160,8 +162,8 @@ CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum)
}
void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
- std::vector<Record*> SRIs = TheDef->getValueAsListOfDefs("SubRegIndices");
- std::vector<Record*> SRs = TheDef->getValueAsListOfDefs("SubRegs");
+ std::vector<Record *> SRIs = TheDef->getValueAsListOfDefs("SubRegIndices");
+ std::vector<Record *> SRs = TheDef->getValueAsListOfDefs("SubRegs");
if (SRIs.size() != SRs.size())
PrintFatalError(TheDef->getLoc(),
@@ -182,7 +184,7 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
// Add ad hoc alias links. This is a symmetric relationship between two
// registers, so build a symmetric graph by adding links in both ends.
- std::vector<Record*> Aliases = TheDef->getValueAsListOfDefs("Aliases");
+ std::vector<Record *> Aliases = TheDef->getValueAsListOfDefs("Aliases");
for (Record *Alias : Aliases) {
CodeGenRegister *Reg = RegBank.getReg(Alias);
ExplicitAliases.push_back(Reg);
@@ -204,8 +206,8 @@ class RegUnitIterator {
static CodeGenRegister::RegUnitList Sentinel;
public:
- RegUnitIterator(const CodeGenRegister::Vec &Regs):
- RegI(Regs.begin()), RegE(Regs.end()) {
+ RegUnitIterator(const CodeGenRegister::Vec &Regs)
+ : RegI(Regs.begin()), RegE(Regs.end()) {
if (RegI == RegE) {
UnitI = Sentinel.end();
@@ -219,9 +221,15 @@ public:
bool isValid() const { return UnitI != UnitE; }
- unsigned operator* () const { assert(isValid()); return *UnitI; }
+ unsigned operator*() const {
+ assert(isValid());
+ return *UnitI;
+ }
- const CodeGenRegister *getReg() const { assert(isValid()); return *RegI; }
+ const CodeGenRegister *getReg() const {
+ assert(isValid());
+ return *RegI;
+ }
/// Preincrement. Move to the next unit.
void operator++() {
@@ -245,11 +253,6 @@ CodeGenRegister::RegUnitList RegUnitIterator::Sentinel;
} // end anonymous namespace
-// Return true of this unit appears in RegUnits.
-static bool hasRegUnit(CodeGenRegister::RegUnitList &RegUnits, unsigned Unit) {
- return RegUnits.test(Unit);
-}
-
// Inherit register units from subregisters.
// Return true if the RegUnits changed.
bool CodeGenRegister::inheritRegUnits(CodeGenRegBank &RegBank) {
@@ -280,14 +283,15 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
Idx->Artificial = false;
if (!SubRegs.insert(std::make_pair(Idx, SR)).second)
PrintFatalError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() +
- " appears twice in Register " + getName());
+ " appears twice in Register " +
+ getName());
// Map explicit sub-registers first, so the names take precedence.
// The inherited sub-registers are mapped below.
SubReg2Idx.insert(std::make_pair(SR, Idx));
}
// Keep track of inherited subregs and how they can be reached.
- SmallPtrSet<CodeGenRegister*, 8> Orphans;
+ SmallPtrSet<CodeGenRegister *, 8> Orphans;
// Clone inherited subregs and place duplicate entries in Orphans.
// Here the order is important - earlier subregs take precedence.
@@ -305,7 +309,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
// If dsub_2 has ComposedOf = [qsub_1, dsub_0], and this register has a
// qsub_1 subreg, add a dsub_2 subreg. Keep growing Indices and process
// expanded subreg indices recursively.
- SmallVector<CodeGenSubRegIndex*, 8> Indices = ExplicitSubRegIndices;
+ SmallVector<CodeGenSubRegIndex *, 8> Indices = ExplicitSubRegIndices;
for (unsigned i = 0; i != Indices.size(); ++i) {
CodeGenSubRegIndex *Idx = Indices[i];
const CodeGenSubRegIndex::CompMap &Comps = Idx->getComposites();
@@ -350,7 +354,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
const SubRegMap &Map = SR->computeSubRegs(RegBank);
for (const auto &SubReg : Map)
if (Orphans.erase(SubReg.second))
- SubRegs[RegBank.getCompositeSubRegIndex(Idx, SubReg.first)] = SubReg.second;
+ SubRegs[RegBank.getCompositeSubRegIndex(Idx, SubReg.first)] =
+ SubReg.second;
}
// Compute the inverse SubReg -> Idx map.
@@ -360,7 +365,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
if (TheDef)
Loc = TheDef->getLoc();
PrintFatalError(Loc, "Register " + getName() +
- " has itself as a sub-register");
+ " has itself as a sub-register");
}
// Compute AllSuperRegsCovered.
@@ -368,17 +373,18 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
SubReg.first->AllSuperRegsCovered = false;
// Ensure that every sub-register has a unique name.
- DenseMap<const CodeGenRegister*, CodeGenSubRegIndex*>::iterator Ins =
- SubReg2Idx.insert(std::make_pair(SubReg.second, SubReg.first)).first;
+ DenseMap<const CodeGenRegister *, CodeGenSubRegIndex *>::iterator Ins =
+ SubReg2Idx.insert(std::make_pair(SubReg.second, SubReg.first)).first;
if (Ins->second == SubReg.first)
continue;
// Trouble: Two different names for SubReg.second.
ArrayRef<SMLoc> Loc;
if (TheDef)
Loc = TheDef->getLoc();
- PrintFatalError(Loc, "Sub-register can't have two names: " +
- SubReg.second->getName() + " available as " +
- SubReg.first->getName() + " and " + Ins->second->getName());
+ PrintFatalError(
+ Loc, "Sub-register can't have two names: " + SubReg.second->getName() +
+ " available as " + SubReg.first->getName() + " and " +
+ Ins->second->getName());
}
// Derive possible names for sub-register concatenations from any explicit
@@ -392,7 +398,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
continue;
// SR is composed of multiple sub-regs. Find their names in this register.
- SmallVector<CodeGenSubRegIndex*, 8> Parts;
+ SmallVector<CodeGenSubRegIndex *, 8> Parts;
for (unsigned j = 0, e = SR->ExplicitSubRegs.size(); j != e; ++j) {
CodeGenSubRegIndex &I = *SR->ExplicitSubRegIndices[j];
if (!I.Artificial)
@@ -464,8 +470,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) {
SmallVector<SubRegMap::value_type, 8> NewSubRegs;
- std::queue<std::pair<CodeGenSubRegIndex*,CodeGenRegister*>> SubRegQueue;
- for (std::pair<CodeGenSubRegIndex*,CodeGenRegister*> P : SubRegs)
+ std::queue<std::pair<CodeGenSubRegIndex *, CodeGenRegister *>> SubRegQueue;
+ for (std::pair<CodeGenSubRegIndex *, CodeGenRegister *> P : SubRegs)
SubRegQueue.push(P);
// Look at the leading super-registers of each sub-register. Those are the
@@ -479,7 +485,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) {
const CodeGenRegister::SuperRegList &Leads = SubReg->LeadingSuperRegs;
for (unsigned i = 0, e = Leads.size(); i != e; ++i) {
- CodeGenRegister *Cand = const_cast<CodeGenRegister*>(Leads[i]);
+ CodeGenRegister *Cand = const_cast<CodeGenRegister *>(Leads[i]);
// Already got this sub-register?
if (Cand == this || getSubRegIndex(Cand))
continue;
@@ -488,7 +494,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) {
"Super-register has no sub-registers");
if (Cand->ExplicitSubRegs.size() == 1)
continue;
- SmallVector<CodeGenSubRegIndex*, 8> Parts;
+ SmallVector<CodeGenSubRegIndex *, 8> Parts;
// We know that the first component is (SubRegIdx,SubReg). However we
// may still need to split it into smaller subregister parts.
assert(Cand->ExplicitSubRegs[0] == SubReg && "LeadingSuperRegs correct");
@@ -513,7 +519,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) {
// Each part of Cand is a sub-register of this. Make the full Cand also
// a sub-register with a concatenated sub-register index.
CodeGenSubRegIndex *Concat = RegBank.getConcatSubRegIndex(Parts);
- std::pair<CodeGenSubRegIndex*,CodeGenRegister*> NewSubReg =
+ std::pair<CodeGenSubRegIndex *, CodeGenRegister *> NewSubReg =
std::make_pair(Concat, Cand);
if (!SubRegs.insert(NewSubReg).second)
@@ -570,9 +576,8 @@ void CodeGenRegister::computeSuperRegs(CodeGenRegBank &RegBank) {
TopoSig = RegBank.getTopoSig(Id);
}
-void
-CodeGenRegister::addSubRegsPreOrder(SetVector<const CodeGenRegister*> &OSet,
- CodeGenRegBank &RegBank) const {
+void CodeGenRegister::addSubRegsPreOrder(
+ SetVector<const CodeGenRegister *> &OSet, CodeGenRegBank &RegBank) const {
assert(SubRegsComplete && "Must precompute sub-registers");
for (unsigned i = 0, e = ExplicitSubRegs.size(); i != e; ++i) {
CodeGenRegister *SR = ExplicitSubRegs[i];
@@ -611,7 +616,7 @@ struct TupleExpander : SetTheory::Expander {
: SynthDefs(SynthDefs) {}
void expand(SetTheory &ST, Record *Def, SetTheory::RecSet &Elts) override {
- std::vector<Record*> Indices = Def->getValueAsListOfDefs("SubRegIndices");
+ std::vector<Record *> Indices = Def->getValueAsListOfDefs("SubRegIndices");
unsigned Dim = Indices.size();
ListInit *SubRegs = Def->getValueAsListInit("SubRegs");
if (Dim != SubRegs->size())
@@ -635,17 +640,18 @@ struct TupleExpander : SetTheory::Expander {
Record *RegisterCl = Def->getRecords().getClass("Register");
RecTy *RegisterRecTy = RecordRecTy::get(RegisterCl);
std::vector<StringRef> RegNames =
- Def->getValueAsListOfStrings("RegAsmNames");
+ Def->getValueAsListOfStrings("RegAsmNames");
// Zip them up.
RecordKeeper &RK = Def->getRecords();
for (unsigned n = 0; n != Length; ++n) {
std::string Name;
Record *Proto = Lists[0][n];
- std::vector<Init*> Tuple;
+ std::vector<Init *> Tuple;
for (unsigned i = 0; i != Dim; ++i) {
Record *Reg = Lists[i][n];
- if (i) Name += '_';
+ if (i)
+ Name += '_';
Name += Reg->getName();
Tuple.push_back(DefInit::get(Reg));
}
@@ -660,7 +666,7 @@ struct TupleExpander : SetTheory::Expander {
if (RegNames.size() <= n)
PrintFatalError(Def->getLoc(),
"Register tuple definition missing name for '" +
- Name + "'.");
+ Name + "'.");
AsmName = StringInit::get(RK, RegNames[n]);
}
@@ -703,15 +709,13 @@ struct TupleExpander : SetTheory::Expander {
RV.setValue(BitInit::get(RK, true));
// Copy fields from the RegisterTuples def.
- if (Field == "SubRegIndices" ||
- Field == "CompositeIndices") {
+ if (Field == "SubRegIndices" || Field == "CompositeIndices") {
NewReg->addValue(*Def->getValue(Field));
continue;
}
// Some fields get their default uninitialized value.
- if (Field == "DwarfNumbers" ||
- Field == "DwarfAlias" ||
+ if (Field == "DwarfNumbers" || Field == "DwarfAlias" ||
Field == "Aliases") {
if (const RecordVal *DefRV = RegisterCl->getValue(Field))
NewReg->addValue(*DefRV);
@@ -740,7 +744,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
: TheDef(R), Name(std::string(R->getName())),
TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) {
GeneratePressureSet = R->getValueAsBit("GeneratePressureSet");
- std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
+ std::vector<Record *> TypeList = R->getValueAsListOfDefs("RegTypes");
if (TypeList.empty())
PrintFatalError(R->getLoc(), "RegTypes list must not be empty!");
for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
@@ -779,7 +783,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
Order.pop_back();
if (!contains(Reg))
PrintFatalError(R->getLoc(), " AltOrder register " + Reg->getName() +
- " is not a class member");
+ " is not a class member");
}
}
@@ -793,8 +797,8 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
"Impossible to determine register size");
if (!RSI.hasDefault()) {
RegSizeInfo RI;
- RI.RegSize = RI.SpillSize = Size ? Size
- : VTs[0].getSimple().getSizeInBits();
+ RI.RegSize = RI.SpillSize =
+ Size ? Size : VTs[0].getSimple().getSizeInBits();
RI.SpillAlignment = R->getValueAsInt("Alignment");
RSI.insertRegSizeForMode(DefaultMode, RI);
}
@@ -890,7 +894,7 @@ bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const {
deref<std::less<>>());
}
-unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank& RegBank) const {
+unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank &RegBank) const {
if (TheDef && !TheDef->isValueUnset("Weight"))
return TheDef->getValueAsInt("Weight");
@@ -902,19 +906,19 @@ unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank& RegBank) const {
namespace llvm {
- raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) {
- OS << "{ " << K.RSI;
- for (const auto R : *K.Members)
- OS << ", " << R->getName();
- return OS << " }";
- }
+raw_ostream &operator<<(raw_ostream &OS, const CodeGenRegisterClass::Key &K) {
+ OS << "{ " << K.RSI;
+ for (const auto R : *K.Members)
+ OS << ", " << R->getName();
+ return OS << " }";
+}
} // end namespace llvm
// This is a simple lexicographical order that can be used to search for sets.
// It is not the same as the topological order provided by TopoOrderRC.
-bool CodeGenRegisterClass::Key::
-operator<(const CodeGenRegisterClass::Key &B) const {
+bool CodeGenRegisterClass::Key::operator<(
+ const CodeGenRegisterClass::Key &B) const {
assert(Members && B.Members);
return std::tie(*Members, RSI) < std::tie(*B.Members, B.RSI);
}
@@ -1066,7 +1070,7 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs(
// Find all the subreg classes and order them by size too.
std::vector<std::pair<CodeGenRegisterClass *, BitVector>> SuperRegClasses;
- for (auto &RC: RegClasses) {
+ for (auto &RC : RegClasses) {
BitVector SuperRegClassesBV(RegClasses.size());
RC.getSuperRegClasses(SubIdx, SuperRegClassesBV);
if (SuperRegClassesBV.any())
@@ -1129,8 +1133,8 @@ void CodeGenRegisterClass::getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
}
// Populate a unique sorted list of units from a register set.
-void CodeGenRegisterClass::buildRegUnitSet(const CodeGenRegBank &RegBank,
- std::vector<unsigned> &RegUnits) const {
+void CodeGenRegisterClass::buildRegUnitSet(
+ const CodeGenRegBank &RegBank, std::vector<unsigned> &RegUnits) const {
std::vector<unsigned> TmpUnits;
for (RegUnitIterator UnitI(Members); UnitI.isValid(); ++UnitI) {
const RegUnit &RU = RegBank.getRegUnit(*UnitI);
@@ -1158,7 +1162,8 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank,
//===----------------------------------------------------------------------===//
CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
- const CodeGenHwModes &Modes) : CGH(Modes) {
+ const CodeGenHwModes &Modes)
+ : CGH(Modes) {
// Configure register Sets to understand register classes and tuples.
Sets.addFieldExpander("RegisterClass", "MemberList");
Sets.addFieldExpander("CalleeSavedRegs", "SaveList");
@@ -1167,7 +1172,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
// Read in the user-defined (named) sub-register indices.
// More indices will be synthesized later.
- std::vector<Record*> SRIs = Records.getAllDerivedDefinitions("SubRegIndex");
+ std::vector<Record *> SRIs = Records.getAllDerivedDefinitions("SubRegIndex");
llvm::sort(SRIs, LessRecord());
for (unsigned i = 0, e = SRIs.size(); i != e; ++i)
getSubRegIdx(SRIs[i]);
@@ -1238,8 +1243,9 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
SRI.computeConcatTransitiveClosure();
if (!SRI.ConcatenationOf.empty())
ConcatIdx.insert(std::make_pair(
- SmallVector<CodeGenSubRegIndex*,8>(SRI.ConcatenationOf.begin(),
- SRI.ConcatenationOf.end()), &SRI));
+ SmallVector<CodeGenSubRegIndex *, 8>(SRI.ConcatenationOf.begin(),
+ SRI.ConcatenationOf.end()),
+ &SRI));
}
// Infer even more sub-registers by combining leading super-registers.
@@ -1269,7 +1275,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
NumNativeRegUnits = RegUnits.size();
// Read in register class definitions.
- std::vector<Record*> RCs = Records.getAllDerivedDefinitions("RegisterClass");
+ std::vector<Record *> RCs = Records.getAllDerivedDefinitions("RegisterClass");
if (RCs.empty())
PrintFatalError("No 'RegisterClass' subclasses defined!");
@@ -1299,8 +1305,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
}
// Create a synthetic CodeGenSubRegIndex without a corresponding Record.
-CodeGenSubRegIndex*
-CodeGenRegBank::createSubRegIndex(StringRef Name, StringRef Namespace) {
+CodeGenSubRegIndex *CodeGenRegBank::createSubRegIndex(StringRef Name,
+ StringRef Namespace) {
SubRegIndices.emplace_back(Name, Namespace, SubRegIndices.size() + 1);
return &SubRegIndices.back();
}
@@ -1315,7 +1321,7 @@ CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) {
}
const CodeGenSubRegIndex *
-CodeGenRegBank::findSubRegIdx(const Record* Def) const {
+CodeGenRegBank::findSubRegIdx(const Record *Def) const {
return Def2SubRegIdx.lookup(Def);
}
@@ -1339,7 +1345,7 @@ void CodeGenRegBank::addToMaps(CodeGenRegisterClass *RC) {
}
// Create a synthetic sub-class if it is missing.
-CodeGenRegisterClass*
+CodeGenRegisterClass *
CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC,
const CodeGenRegister::Vec *Members,
StringRef Name) {
@@ -1362,7 +1368,7 @@ CodeGenRegisterClass *CodeGenRegBank::getRegClass(const Record *Def) const {
PrintFatalError(Def->getLoc(), "Not a known RegisterClass!");
}
-CodeGenSubRegIndex*
+CodeGenSubRegIndex *
CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A,
CodeGenSubRegIndex *B) {
// Look for an existing entry.
@@ -1377,8 +1383,8 @@ CodeGenRegBank::getCompositeSubRegIndex(CodeGenSubRegIndex *A,
return Comp;
}
-CodeGenSubRegIndex *CodeGenRegBank::
-getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex *, 8> &Parts) {
+CodeGenSubRegIndex *CodeGenRegBank::getConcatSubRegIndex(
+ const SmallVector<CodeGenSubRegIndex *, 8> &Parts) {
assert(Parts.size() > 1 && "Need two parts to concatenate");
#ifndef NDEBUG
for (CodeGenSubRegIndex *Idx : Parts) {
@@ -1419,26 +1425,26 @@ getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex *, 8> &Parts) {
}
void CodeGenRegBank::computeComposites() {
- using RegMap = std::map<const CodeGenRegister*, const CodeGenRegister*>;
+ using RegMap = std::map<const CodeGenRegister *, const CodeGenRegister *>;
// Subreg -> { Reg->Reg }, where the right-hand side is the mapping from
// register to (sub)register associated with the action of the left-hand
// side subregister.
- std::map<const CodeGenSubRegIndex*, RegMap> SubRegAction;
+ std::map<const CodeGenSubRegIndex *, RegMap> SubRegAction;
for (const CodeGenRegister &R : Registers) {
const CodeGenRegister::SubRegMap &SM = R.getSubRegs();
- for (std::pair<const CodeGenSubRegIndex*, const CodeGenRegister*> P : SM)
+ for (std::pair<const CodeGenSubRegIndex *, const CodeGenRegister *> P : SM)
SubRegAction[P.first].insert({&R, P.second});
}
// Calculate the composition of two subregisters as compositions of their
// associated actions.
- auto compose = [&SubRegAction] (const CodeGenSubRegIndex *Sub1,
- const CodeGenSubRegIndex *Sub2) {
+ auto compose = [&SubRegAction](const CodeGenSubRegIndex *Sub1,
+ const CodeGenSubRegIndex *Sub2) {
RegMap C;
const RegMap &Img1 = SubRegAction.at(Sub1);
const RegMap &Img2 = SubRegAction.at(Sub2);
- for (std::pair<const CodeGenRegister*, const CodeGenRegister*> P : Img1) {
+ for (std::pair<const CodeGenRegister *, const CodeGenRegister *> P : Img1) {
auto F = Img2.find(P.second);
if (F != Img2.end())
C.insert({P.first, F->second});
@@ -1447,13 +1453,13 @@ void CodeGenRegBank::computeComposites() {
};
// Check if the two maps agree on the intersection of their domains.
- auto agree = [] (const RegMap &Map1, const RegMap &Map2) {
+ auto agree = [](const RegMap &Map1, const RegMap &Map2) {
// Technically speaking, an empty map agrees with any other map, but
// this could flag false positives. We're interested in non-vacuous
// agreements.
if (Map1.empty() || Map2.empty())
return false;
- for (std::pair<const CodeGenRegister*, const CodeGenRegister*> P : Map1) {
+ for (std::pair<const CodeGenRegister *, const CodeGenRegister *> P : Map1) {
auto F = Map2.find(P.first);
if (F == Map2.end() || P.second != F->second)
return false;
@@ -1461,9 +1467,9 @@ void CodeGenRegBank::computeComposites() {
return true;
};
- using CompositePair = std::pair<const CodeGenSubRegIndex*,
- const CodeGenSubRegIndex*>;
- SmallSet<CompositePair,4> UserDefined;
+ using CompositePair =
+ std::pair<const CodeGenSubRegIndex *, const CodeGenSubRegIndex *>;
+ SmallSet<CompositePair, 4> UserDefined;
for (const CodeGenSubRegIndex &Idx : SubRegIndices)
for (auto P : Idx.getComposites())
UserDefined.insert(std::make_pair(&Idx, P.first));
@@ -1528,8 +1534,8 @@ void CodeGenRegBank::computeSubRegLaneMasks() {
if (Idx.getComposites().empty()) {
if (Bit > LaneBitmask::BitWidth) {
PrintFatalError(
- Twine("Ran out of lanemask bits to represent subregister ")
- + Idx.getName());
+ Twine("Ran out of lanemask bits to represent subregister ") +
+ Idx.getName());
}
Idx.LaneMask = LaneBitmask::getLane(Bit);
++Bit;
@@ -1556,7 +1562,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() {
unsigned DstBit = Idx.LaneMask.getHighestLane();
assert(Idx.LaneMask == LaneBitmask::getLane(DstBit) &&
"Must be a leaf subregister");
- MaskRolPair MaskRol = { LaneBitmask::getLane(0), (uint8_t)DstBit };
+ MaskRolPair MaskRol = {LaneBitmask::getLane(0), (uint8_t)DstBit};
LaneTransforms.push_back(MaskRol);
} else {
// Go through all leaf subregisters and find the ones that compose with
@@ -1571,7 +1577,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() {
// Replicate the behaviour from the lane mask generation loop above.
unsigned SrcBit = NextBit;
LaneBitmask SrcMask = LaneBitmask::getLane(SrcBit);
- if (NextBit < LaneBitmask::BitWidth-1)
+ if (NextBit < LaneBitmask::BitWidth - 1)
++NextBit;
assert(Idx2.LaneMask == SrcMask);
@@ -1586,8 +1592,8 @@ void CodeGenRegBank::computeSubRegLaneMasks() {
// Create Mask+Rotate operation and merge with existing ops if possible.
unsigned DstBit = Composite->LaneMask.getHighestLane();
int Shift = DstBit - SrcBit;
- uint8_t RotateLeft = Shift >= 0 ? (uint8_t)Shift
- : LaneBitmask::BitWidth + Shift;
+ uint8_t RotateLeft =
+ Shift >= 0 ? (uint8_t)Shift : LaneBitmask::BitWidth + Shift;
for (auto &I : LaneTransforms) {
if (I.RotateLeft == RotateLeft) {
I.Mask |= SrcMask;
@@ -1595,7 +1601,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() {
}
}
if (SrcMask.any()) {
- MaskRolPair MaskRol = { SrcMask, RotateLeft };
+ MaskRolPair MaskRol = {SrcMask, RotateLeft};
LaneTransforms.push_back(MaskRol);
}
}
@@ -1611,7 +1617,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() {
// in a sequence with 0 entries we can just pick any other. Choose
// Mask 0xffffffff with Rotation 0.
if (LaneTransforms.size() == 0) {
- MaskRolPair P = { LaneBitmask::getAll(), 0 };
+ MaskRolPair P = {LaneBitmask::getAll(), 0};
LaneTransforms.push_back(P);
}
}
@@ -1679,7 +1685,7 @@ struct UberRegSet {
//
// UberRegSets[0] is a special non-allocatable set.
static void computeUberSets(std::vector<UberRegSet> &UberSets,
- std::vector<UberRegSet*> &RegSets,
+ std::vector<UberRegSet *> &RegSets,
CodeGenRegBank &RegBank) {
const auto &Registers = RegBank.getRegisters();
@@ -1742,7 +1748,8 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
CodeGenRegBank &RegBank) {
// Skip the first unallocatable set.
for (std::vector<UberRegSet>::iterator I = std::next(UberSets.begin()),
- E = UberSets.end(); I != E; ++I) {
+ E = UberSets.end();
+ I != E; ++I) {
// Initialize all unit weights in this set, and remember the max units/reg.
const CodeGenRegister *Reg = nullptr;
@@ -1797,7 +1804,7 @@ static void computeUberWeights(std::vector<UberRegSet> &UberSets,
// - induces recomputation of UberWeights.
static bool normalizeWeight(CodeGenRegister *Reg,
std::vector<UberRegSet> &UberSets,
- std::vector<UberRegSet*> &RegSets,
+ std::vector<UberRegSet *> &RegSets,
BitVector &NormalRegs,
CodeGenRegister::RegUnitList &NormalUnits,
CodeGenRegBank &RegBank) {
@@ -1830,15 +1837,13 @@ static bool normalizeWeight(CodeGenRegister *Reg,
// for this register, has not been used to normalize a subregister's set,
// and has not already been used to singularly determine this UberRegSet.
unsigned AdjustUnit = *Reg->getRegUnits().begin();
- if (Reg->getRegUnits().count() != 1
- || hasRegUnit(NormalUnits, AdjustUnit)
- || hasRegUnit(UberSet->SingularDeterminants, AdjustUnit)) {
+ if (Reg->getRegUnits().count() != 1 || NormalUnits.test(AdjustUnit) ||
+ UberSet->SingularDeterminants.test(AdjustUnit)) {
// We don't have an adjustable unit, so adopt a new one.
AdjustUnit = RegBank.newRegUnit(UberSet->Weight - RegWeight);
Reg->adoptRegUnit(AdjustUnit);
// Adopting a unit does not immediately require recomputing set weights.
- }
- else {
+ } else {
// Adjust the existing single unit.
if (!RegBank.getRegUnit(AdjustUnit).Artificial)
RegBank.increaseRegUnitWeight(AdjustUnit, UberSet->Weight - RegWeight);
@@ -1860,7 +1865,7 @@ static bool normalizeWeight(CodeGenRegister *Reg,
// where each register's weight is defined as sum of its units' weights.
void CodeGenRegBank::computeRegUnitWeights() {
std::vector<UberRegSet> UberSets;
- std::vector<UberRegSet*> RegSets(Registers.size());
+ std::vector<UberRegSet *> RegSets(Registers.size());
computeUberSets(UberSets, RegSets, *this);
// UberSets and RegSets are now immutable.
@@ -1871,7 +1876,7 @@ void CodeGenRegBank::computeRegUnitWeights() {
unsigned NumIters = 0;
for (bool Changed = true; Changed; ++NumIters) {
assert(NumIters <= NumNativeRegUnits && "Runaway register unit weights");
- (void) NumIters;
+ (void)NumIters;
Changed = false;
for (auto &Reg : Registers) {
CodeGenRegister::RegUnitList NormalUnits;
@@ -1887,20 +1892,15 @@ void CodeGenRegBank::computeRegUnitWeights() {
static std::vector<RegUnitSet>::const_iterator
findRegUnitSet(const std::vector<RegUnitSet> &UniqueSets,
const RegUnitSet &Set) {
- std::vector<RegUnitSet>::const_iterator
- I = UniqueSets.begin(), E = UniqueSets.end();
- for(;I != E; ++I) {
- if (I->Units == Set.Units)
- break;
- }
- return I;
+ return find_if(UniqueSets,
+ [&Set](const RegUnitSet &I) { return I.Units == Set.Units; });
}
// Return true if the RUSubSet is a subset of RUSuperSet.
static bool isRegUnitSubSet(const std::vector<unsigned> &RUSubSet,
const std::vector<unsigned> &RUSuperSet) {
- return std::includes(RUSuperSet.begin(), RUSuperSet.end(),
- RUSubSet.begin(), RUSubSet.end());
+ return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(),
+ RUSubSet.end());
}
/// Iteratively prune unit sets. Prune subsets that are close to the superset,
@@ -1925,8 +1925,8 @@ void CodeGenRegBank::pruneUnitSets() {
// Form an equivalence class of UnitSets with no significant difference.
std::vector<unsigned> SuperSetIDs;
- for (unsigned SubIdx = 0, EndIdx = RegUnitSets.size();
- SubIdx != EndIdx; ++SubIdx) {
+ for (unsigned SubIdx = 0, EndIdx = RegUnitSets.size(); SubIdx != EndIdx;
+ ++SubIdx) {
const RegUnitSet &SubSet = RegUnitSets[SubIdx];
unsigned SuperIdx = 0;
for (; SuperIdx != EndIdx; ++SuperIdx) {
@@ -1935,10 +1935,10 @@ void CodeGenRegBank::pruneUnitSets() {
unsigned UnitWeight = RegUnits[SubSet.Units[0]].Weight;
const RegUnitSet &SuperSet = RegUnitSets[SuperIdx];
- if (isRegUnitSubSet(SubSet.Units, SuperSet.Units)
- && (SubSet.Units.size() + 3 > SuperSet.Units.size())
- && UnitWeight == RegUnits[SuperSet.Units[0]].Weight
- && UnitWeight == RegUnits[SuperSet.Units.back()].Weight) {
+ if (isRegUnitSubSet(SubSet.Units, SuperSet.Units) &&
+ (SubSet.Units.size() + 3 > SuperSet.Units.size()) &&
+ UnitWeight == RegUnits[SuperSet.Units[0]].Weight &&
+ UnitWeight == RegUnits[SuperSet.Units.back()].Weight) {
LLVM_DEBUG(dbgs() << "UnitSet " << SubIdx << " subsumed by " << SuperIdx
<< "\n");
// We can pick any of the set names for the merged set. Go for the
@@ -1958,9 +1958,9 @@ void CodeGenRegBank::pruneUnitSets() {
for (unsigned i = 0, e = SuperSetIDs.size(); i != e; ++i) {
unsigned SuperIdx = SuperSetIDs[i];
PrunedUnitSets[i].Name = RegUnitSets[SuperIdx].Name;
- PrunedUnitSets[i].Units.swap(RegUnitSets[SuperIdx].Units);
+ PrunedUnitSets[i].Units = std::move(RegUnitSets[SuperIdx].Units);
}
- RegUnitSets.swap(PrunedUnitSets);
+ RegUnitSets = std::move(PrunedUnitSets);
}
// Create a RegUnitSet for each RegClass that contains all units in the class
@@ -1979,18 +1979,14 @@ void CodeGenRegBank::computeRegUnitSets() {
if (!RC.Allocatable || RC.Artificial || !RC.GeneratePressureSet)
continue;
- // Speculatively grow the RegUnitSets to hold the new set.
- RegUnitSets.resize(RegUnitSets.size() + 1);
- RegUnitSets.back().Name = RC.getName();
-
// Compute a sorted list of units in this class.
- RC.buildRegUnitSet(*this, RegUnitSets.back().Units);
+ RegUnitSet RUSet;
+ RUSet.Name = RC.getName();
+ RC.buildRegUnitSet(*this, RUSet.Units);
// Find an existing RegUnitSet.
- std::vector<RegUnitSet>::const_iterator SetI =
- findRegUnitSet(RegUnitSets, RegUnitSets.back());
- if (SetI != std::prev(RegUnitSets.end()))
- RegUnitSets.pop_back();
+ if (findRegUnitSet(RegUnitSets, RUSet) == RegUnitSets.end())
+ RegUnitSets.push_back(std::move(RUSet));
}
if (RegUnitSets.empty())
@@ -2023,43 +2019,36 @@ void CodeGenRegBank::computeRegUnitSets() {
// In theory, this is combinatorial. In practice, it needs to be bounded
// by a small number of sets for regpressure to be efficient.
// If the assert is hit, we need to implement pruning.
- assert(Idx < (2*NumRegUnitSubSets) && "runaway unit set inference");
+ assert(Idx < (2 * NumRegUnitSubSets) && "runaway unit set inference");
// Compare new sets with all original classes.
- for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 0 : Idx+1;
+ for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 0 : Idx + 1;
SearchIdx != EndIdx; ++SearchIdx) {
- std::set<unsigned> Intersection;
- std::set_intersection(RegUnitSets[Idx].Units.begin(),
- RegUnitSets[Idx].Units.end(),
- RegUnitSets[SearchIdx].Units.begin(),
- RegUnitSets[SearchIdx].Units.end(),
- std::inserter(Intersection, Intersection.begin()));
+ std::vector<unsigned> Intersection;
+ std::set_intersection(
+ RegUnitSets[Idx].Units.begin(), RegUnitSets[Idx].Units.end(),
+ RegUnitSets[SearchIdx].Units.begin(),
+ RegUnitSets[SearchIdx].Units.end(), std::back_inserter(Intersection));
if (Intersection.empty())
continue;
- // Speculatively grow the RegUnitSets to hold the new set.
- RegUnitSets.resize(RegUnitSets.size() + 1);
- RegUnitSets.back().Name =
- RegUnitSets[Idx].Name + "_with_" + RegUnitSets[SearchIdx].Name;
-
+ RegUnitSet RUSet;
+ RUSet.Name =
+ RegUnitSets[Idx].Name + "_with_" + RegUnitSets[SearchIdx].Name;
std::set_union(RegUnitSets[Idx].Units.begin(),
RegUnitSets[Idx].Units.end(),
RegUnitSets[SearchIdx].Units.begin(),
RegUnitSets[SearchIdx].Units.end(),
- std::inserter(RegUnitSets.back().Units,
- RegUnitSets.back().Units.begin()));
+ std::inserter(RUSet.Units, RUSet.Units.begin()));
// Find an existing RegUnitSet, or add the union to the unique sets.
- std::vector<RegUnitSet>::const_iterator SetI =
- findRegUnitSet(RegUnitSets, RegUnitSets.back());
- if (SetI != std::prev(RegUnitSets.end()))
- RegUnitSets.pop_back();
- else {
- LLVM_DEBUG(dbgs() << "UnitSet " << RegUnitSets.size() - 1 << " "
- << RegUnitSets.back().Name << ":";
+ if (findRegUnitSet(RegUnitSets, RUSet) == RegUnitSets.end()) {
+ LLVM_DEBUG(dbgs() << "UnitSet " << RegUnitSets.size() << " "
+ << RUSet.Name << ":";
for (auto &U
- : RegUnitSets.back().Units) printRegUnitName(U);
+ : RUSet.Units) printRegUnitName(U);
dbgs() << "\n";);
+ RegUnitSets.push_back(std::move(RUSet));
}
}
}
@@ -2098,8 +2087,8 @@ void CodeGenRegBank::computeRegUnitSets() {
dbgs() << "\n UnitSetIDs:");
// Find all supersets.
- for (unsigned USIdx = 0, USEnd = RegUnitSets.size();
- USIdx != USEnd; ++USIdx) {
+ for (unsigned USIdx = 0, USEnd = RegUnitSets.size(); USIdx != USEnd;
+ ++USIdx) {
if (isRegUnitSubSet(RCRegUnits, RegUnitSets[USIdx].Units)) {
LLVM_DEBUG(dbgs() << " " << USIdx);
RegClassUnitSets[RCIdx].push_back(USIdx);
@@ -2114,18 +2103,16 @@ void CodeGenRegBank::computeRegUnitSets() {
// contain the unit. Normally, this matches an existing list of UnitSets for a
// register class. If not, we create a new entry in RegClassUnitSets as a
// "fake" register class.
- for (unsigned UnitIdx = 0, UnitEnd = NumNativeRegUnits;
- UnitIdx < UnitEnd; ++UnitIdx) {
+ for (unsigned UnitIdx = 0, UnitEnd = NumNativeRegUnits; UnitIdx < UnitEnd;
+ ++UnitIdx) {
std::vector<unsigned> RUSets;
for (unsigned i = 0, e = RegUnitSets.size(); i != e; ++i) {
- RegUnitSet &RUSet = RegUnitSets[i];
- if (!is_contained(RUSet.Units, UnitIdx))
- continue;
- RUSets.push_back(i);
+ if (is_contained(RegUnitSets[i].Units, UnitIdx))
+ RUSets.push_back(i);
}
unsigned RCUnitSetsIdx = 0;
- for (unsigned e = RegClassUnitSets.size();
- RCUnitSetsIdx != e; ++RCUnitSetsIdx) {
+ for (unsigned e = RegClassUnitSets.size(); RCUnitSetsIdx != e;
+ ++RCUnitSetsIdx) {
if (RegClassUnitSets[RCUnitSetsIdx] == RUSets) {
break;
}
@@ -2133,8 +2120,7 @@ void CodeGenRegBank::computeRegUnitSets() {
RegUnits[UnitIdx].RegClassUnitSetsIdx = RCUnitSetsIdx;
if (RCUnitSetsIdx == RegClassUnitSets.size()) {
// Create a new list of UnitSets as a "fake" register class.
- RegClassUnitSets.resize(RCUnitSetsIdx + 1);
- RegClassUnitSets[RCUnitSetsIdx].swap(RUSets);
+ RegClassUnitSets.push_back(std::move(RUSets));
}
}
}
@@ -2301,9 +2287,8 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) {
continue;
}
// This is a real subset. See if we have a matching class.
- CodeGenRegisterClass *SubRC =
- getOrCreateSubClass(RC, &I->second,
- RC->getName() + "_with_" + I->first->getName());
+ CodeGenRegisterClass *SubRC = getOrCreateSubClass(
+ RC, &I->second, RC->getName() + "_with_" + I->first->getName());
RC->setSubClassWithSubReg(&SubIdx, SubRC);
}
}
@@ -2315,8 +2300,9 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) {
// has a maximal result for any SubIdx and any X >= FirstSubRegRC.
//
-void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
- std::list<CodeGenRegisterClass>::iterator FirstSubRegRC) {
+void CodeGenRegBank::inferMatchingSuperRegClass(
+ CodeGenRegisterClass *RC,
+ std::list<CodeGenRegisterClass>::iterator FirstSubRegRC) {
DenseMap<const CodeGenRegister *, std::vector<const CodeGenRegister *>>
SubToSuperRegs;
BitVector TopoSigs(getNumTopoSigs());
@@ -2374,9 +2360,9 @@ void CodeGenRegBank::inferMatchingSuperRegClass(CodeGenRegisterClass *RC,
// Only a subset of RC maps into SubRC. Make sure it is represented by a
// class.
- getOrCreateSubClass(RC, &SubSetVec, RC->getName() + "_with_" +
- SubIdx.getName() + "_in_" +
- SubRC.getName());
+ getOrCreateSubClass(RC, &SubSetVec,
+ RC->getName() + "_with_" + SubIdx.getName() + "_in_" +
+ SubRC.getName());
}
}
}
@@ -2431,8 +2417,7 @@ void CodeGenRegBank::computeInferredRegisterClasses() {
/// return null. If the register is in multiple classes, and the classes have a
/// superset-subset relationship and the same set of types, return the
/// superclass. Otherwise return null.
-const CodeGenRegisterClass*
-CodeGenRegBank::getRegClassForRegister(Record *R) {
+const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) {
const CodeGenRegister *Reg = getReg(R);
const CodeGenRegisterClass *FoundRC = nullptr;
for (const auto &RC : getRegClasses()) {
@@ -2477,8 +2462,8 @@ CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord,
const CodeGenRegister *Reg = getReg(RegRecord);
const CodeGenRegisterClass *BestRC = nullptr;
for (const auto &RC : getRegClasses()) {
- if ((!VT || RC.hasType(*VT)) &&
- RC.contains(Reg) && (!BestRC || BestRC->hasSubClass(&RC)))
+ if ((!VT || RC.hasType(*VT)) && RC.contains(Reg) &&
+ (!BestRC || BestRC->hasSubClass(&RC)))
BestRC = &RC;
}
@@ -2486,8 +2471,8 @@ CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord,
return BestRC;
}
-BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef<Record*> Regs) {
- SetVector<const CodeGenRegister*> Set;
+BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef<Record *> Regs) {
+ SetVector<const CodeGenRegister *> Set;
// First add Regs with all sub-registers.
for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h
index 97f60811a7d8..fc5cd67e2d55 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/CodeGenRegisters.h
@@ -44,810 +44,797 @@
namespace llvm {
- class CodeGenRegBank;
+class CodeGenRegBank;
- /// Used to encode a step in a register lane mask transformation.
- /// Mask the bits specified in Mask, then rotate them Rol bits to the left
- /// assuming a wraparound at 32bits.
- struct MaskRolPair {
- LaneBitmask Mask;
- uint8_t RotateLeft;
+/// Used to encode a step in a register lane mask transformation.
+/// Mask the bits specified in Mask, then rotate them Rol bits to the left
+/// assuming a wraparound at 32bits.
+struct MaskRolPair {
+ LaneBitmask Mask;
+ uint8_t RotateLeft;
- bool operator==(const MaskRolPair Other) const {
- return Mask == Other.Mask && RotateLeft == Other.RotateLeft;
- }
- bool operator!=(const MaskRolPair Other) const {
- return Mask != Other.Mask || RotateLeft != Other.RotateLeft;
- }
- };
-
- /// CodeGenSubRegIndex - Represents a sub-register index.
- class CodeGenSubRegIndex {
- Record *const TheDef;
- std::string Name;
- std::string Namespace;
-
- public:
- uint16_t Size;
- uint16_t Offset;
- const unsigned EnumValue;
- mutable LaneBitmask LaneMask;
- mutable SmallVector<MaskRolPair,1> CompositionLaneMaskTransform;
-
- /// A list of subregister indexes concatenated resulting in this
- /// subregister index. This is the reverse of CodeGenRegBank::ConcatIdx.
- SmallVector<CodeGenSubRegIndex*,4> ConcatenationOf;
-
- // Are all super-registers containing this SubRegIndex covered by their
- // sub-registers?
- bool AllSuperRegsCovered;
- // A subregister index is "artificial" if every subregister obtained
- // from applying this index is artificial. Artificial subregister
- // indexes are not used to create new register classes.
- bool Artificial;
-
- CodeGenSubRegIndex(Record *R, unsigned Enum);
- CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum);
- CodeGenSubRegIndex(CodeGenSubRegIndex&) = delete;
-
- const std::string &getName() const { return Name; }
- const std::string &getNamespace() const { return Namespace; }
- std::string getQualifiedName() const;
-
- // Map of composite subreg indices.
- typedef std::map<CodeGenSubRegIndex *, CodeGenSubRegIndex *,
- deref<std::less<>>>
- CompMap;
-
- // Returns the subreg index that results from composing this with Idx.
- // Returns NULL if this and Idx don't compose.
- CodeGenSubRegIndex *compose(CodeGenSubRegIndex *Idx) const {
- CompMap::const_iterator I = Composed.find(Idx);
- return I == Composed.end() ? nullptr : I->second;
- }
+ bool operator==(const MaskRolPair Other) const {
+ return Mask == Other.Mask && RotateLeft == Other.RotateLeft;
+ }
+ bool operator!=(const MaskRolPair Other) const {
+ return Mask != Other.Mask || RotateLeft != Other.RotateLeft;
+ }
+};
+
+/// CodeGenSubRegIndex - Represents a sub-register index.
+class CodeGenSubRegIndex {
+ Record *const TheDef;
+ std::string Name;
+ std::string Namespace;
+
+public:
+ uint16_t Size;
+ uint16_t Offset;
+ const unsigned EnumValue;
+ mutable LaneBitmask LaneMask;
+ mutable SmallVector<MaskRolPair, 1> CompositionLaneMaskTransform;
+
+ /// A list of subregister indexes concatenated resulting in this
+ /// subregister index. This is the reverse of CodeGenRegBank::ConcatIdx.
+ SmallVector<CodeGenSubRegIndex *, 4> ConcatenationOf;
+
+ // Are all super-registers containing this SubRegIndex covered by their
+ // sub-registers?
+ bool AllSuperRegsCovered;
+ // A subregister index is "artificial" if every subregister obtained
+ // from applying this index is artificial. Artificial subregister
+ // indexes are not used to create new register classes.
+ bool Artificial;
+
+ CodeGenSubRegIndex(Record *R, unsigned Enum);
+ CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum);
+ CodeGenSubRegIndex(CodeGenSubRegIndex &) = delete;
+
+ const std::string &getName() const { return Name; }
+ const std::string &getNamespace() const { return Namespace; }
+ std::string getQualifiedName() const;
+
+ // Map of composite subreg indices.
+ typedef std::map<CodeGenSubRegIndex *, CodeGenSubRegIndex *,
+ deref<std::less<>>>
+ CompMap;
+
+ // Returns the subreg index that results from composing this with Idx.
+ // Returns NULL if this and Idx don't compose.
+ CodeGenSubRegIndex *compose(CodeGenSubRegIndex *Idx) const {
+ CompMap::const_iterator I = Composed.find(Idx);
+ return I == Composed.end() ? nullptr : I->second;
+ }
- // Add a composite subreg index: this+A = B.
- // Return a conflicting composite, or NULL
- CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A,
- CodeGenSubRegIndex *B) {
- assert(A && B);
- std::pair<CompMap::iterator, bool> Ins =
+ // Add a composite subreg index: this+A = B.
+ // Return a conflicting composite, or NULL
+ CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A,
+ CodeGenSubRegIndex *B) {
+ assert(A && B);
+ std::pair<CompMap::iterator, bool> Ins =
Composed.insert(std::make_pair(A, B));
- // Synthetic subreg indices that aren't contiguous (for instance ARM
- // register tuples) don't have a bit range, so it's OK to let
- // B->Offset == -1. For the other cases, accumulate the offset and set
- // the size here. Only do so if there is no offset yet though.
- if ((Offset != (uint16_t)-1 && A->Offset != (uint16_t)-1) &&
- (B->Offset == (uint16_t)-1)) {
- B->Offset = Offset + A->Offset;
- B->Size = A->Size;
- }
- return (Ins.second || Ins.first->second == B) ? nullptr
- : Ins.first->second;
- }
-
- // Update the composite maps of components specified in 'ComposedOf'.
- void updateComponents(CodeGenRegBank&);
-
- // Return the map of composites.
- const CompMap &getComposites() const { return Composed; }
-
- // Compute LaneMask from Composed. Return LaneMask.
- LaneBitmask computeLaneMask() const;
-
- void setConcatenationOf(ArrayRef<CodeGenSubRegIndex*> Parts);
-
- /// Replaces subregister indexes in the `ConcatenationOf` list with
- /// list of subregisters they are composed of (if any). Do this recursively.
- void computeConcatTransitiveClosure();
-
- bool operator<(const CodeGenSubRegIndex &RHS) const {
- return this->EnumValue < RHS.EnumValue;
- }
-
- private:
- CompMap Composed;
- };
-
- /// CodeGenRegister - Represents a register definition.
- class CodeGenRegister {
- public:
- Record *TheDef;
- unsigned EnumValue;
- std::vector<int64_t> CostPerUse;
- bool CoveredBySubRegs = true;
- bool HasDisjunctSubRegs = false;
- bool Artificial = true;
- bool Constant = false;
+ // Synthetic subreg indices that aren't contiguous (for instance ARM
+ // register tuples) don't have a bit range, so it's OK to let
+ // B->Offset == -1. For the other cases, accumulate the offset and set
+ // the size here. Only do so if there is no offset yet though.
+ if ((Offset != (uint16_t)-1 && A->Offset != (uint16_t)-1) &&
+ (B->Offset == (uint16_t)-1)) {
+ B->Offset = Offset + A->Offset;
+ B->Size = A->Size;
+ }
+ return (Ins.second || Ins.first->second == B) ? nullptr : Ins.first->second;
+ }
- // Map SubRegIndex -> Register.
- typedef std::map<CodeGenSubRegIndex *, CodeGenRegister *,
- deref<std::less<>>>
- SubRegMap;
+ // Update the composite maps of components specified in 'ComposedOf'.
+ void updateComponents(CodeGenRegBank &);
- CodeGenRegister(Record *R, unsigned Enum);
+ // Return the map of composites.
+ const CompMap &getComposites() const { return Composed; }
- StringRef getName() const;
+ // Compute LaneMask from Composed. Return LaneMask.
+ LaneBitmask computeLaneMask() const;
- // Extract more information from TheDef. This is used to build an object
- // graph after all CodeGenRegister objects have been created.
- void buildObjectGraph(CodeGenRegBank&);
+ void setConcatenationOf(ArrayRef<CodeGenSubRegIndex *> Parts);
- // Lazily compute a map of all sub-registers.
- // This includes unique entries for all sub-sub-registers.
- const SubRegMap &computeSubRegs(CodeGenRegBank&);
+ /// Replaces subregister indexes in the `ConcatenationOf` list with
+ /// list of subregisters they are composed of (if any). Do this recursively.
+ void computeConcatTransitiveClosure();
- // Compute extra sub-registers by combining the existing sub-registers.
- void computeSecondarySubRegs(CodeGenRegBank&);
+ bool operator<(const CodeGenSubRegIndex &RHS) const {
+ return this->EnumValue < RHS.EnumValue;
+ }
- // Add this as a super-register to all sub-registers after the sub-register
- // graph has been built.
- void computeSuperRegs(CodeGenRegBank&);
+private:
+ CompMap Composed;
+};
- const SubRegMap &getSubRegs() const {
- assert(SubRegsComplete && "Must precompute sub-registers");
- return SubRegs;
- }
+/// CodeGenRegister - Represents a register definition.
+class CodeGenRegister {
+public:
+ Record *TheDef;
+ unsigned EnumValue;
+ std::vector<int64_t> CostPerUse;
+ bool CoveredBySubRegs = true;
+ bool HasDisjunctSubRegs = false;
+ bool Artificial = true;
+ bool Constant = false;
- // Add sub-registers to OSet following a pre-order defined by the .td file.
- void addSubRegsPreOrder(SetVector<const CodeGenRegister*> &OSet,
- CodeGenRegBank&) const;
+ // Map SubRegIndex -> Register.
+ typedef std::map<CodeGenSubRegIndex *, CodeGenRegister *, deref<std::less<>>>
+ SubRegMap;
- // Return the sub-register index naming Reg as a sub-register of this
- // register. Returns NULL if Reg is not a sub-register.
- CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const {
- return SubReg2Idx.lookup(Reg);
- }
+ CodeGenRegister(Record *R, unsigned Enum);
- typedef std::vector<const CodeGenRegister*> SuperRegList;
+ StringRef getName() const;
- // Get the list of super-registers in topological order, small to large.
- // This is valid after computeSubRegs visits all registers during RegBank
- // construction.
- const SuperRegList &getSuperRegs() const {
- assert(SubRegsComplete && "Must precompute sub-registers");
- return SuperRegs;
- }
+ // Extract more information from TheDef. This is used to build an object
+ // graph after all CodeGenRegister objects have been created.
+ void buildObjectGraph(CodeGenRegBank &);
- // Get the list of ad hoc aliases. The graph is symmetric, so the list
- // contains all registers in 'Aliases', and all registers that mention this
- // register in 'Aliases'.
- ArrayRef<CodeGenRegister*> getExplicitAliases() const {
- return ExplicitAliases;
- }
+ // Lazily compute a map of all sub-registers.
+ // This includes unique entries for all sub-sub-registers.
+ const SubRegMap &computeSubRegs(CodeGenRegBank &);
- // Get the topological signature of this register. This is a small integer
- // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have
- // identical sub-register structure. That is, they support the same set of
- // sub-register indices mapping to the same kind of sub-registers
- // (TopoSig-wise).
- unsigned getTopoSig() const {
- assert(SuperRegsComplete && "TopoSigs haven't been computed yet.");
- return TopoSig;
- }
+ // Compute extra sub-registers by combining the existing sub-registers.
+ void computeSecondarySubRegs(CodeGenRegBank &);
- // List of register units in ascending order.
- typedef SparseBitVector<> RegUnitList;
- typedef SmallVector<LaneBitmask, 16> RegUnitLaneMaskList;
+ // Add this as a super-register to all sub-registers after the sub-register
+ // graph has been built.
+ void computeSuperRegs(CodeGenRegBank &);
- // How many entries in RegUnitList are native?
- RegUnitList NativeRegUnits;
+ const SubRegMap &getSubRegs() const {
+ assert(SubRegsComplete && "Must precompute sub-registers");
+ return SubRegs;
+ }
- // Get the list of register units.
- // This is only valid after computeSubRegs() completes.
- const RegUnitList &getRegUnits() const { return RegUnits; }
+ // Add sub-registers to OSet following a pre-order defined by the .td file.
+ void addSubRegsPreOrder(SetVector<const CodeGenRegister *> &OSet,
+ CodeGenRegBank &) const;
- ArrayRef<LaneBitmask> getRegUnitLaneMasks() const {
- return ArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count());
- }
+ // Return the sub-register index naming Reg as a sub-register of this
+ // register. Returns NULL if Reg is not a sub-register.
+ CodeGenSubRegIndex *getSubRegIndex(const CodeGenRegister *Reg) const {
+ return SubReg2Idx.lookup(Reg);
+ }
- // Get the native register units. This is a prefix of getRegUnits().
- RegUnitList getNativeRegUnits() const {
- return NativeRegUnits;
- }
+ typedef std::vector<const CodeGenRegister *> SuperRegList;
- void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) {
- RegUnitLaneMasks = LaneMasks;
- }
+ // Get the list of super-registers in topological order, small to large.
+ // This is valid after computeSubRegs visits all registers during RegBank
+ // construction.
+ const SuperRegList &getSuperRegs() const {
+ assert(SubRegsComplete && "Must precompute sub-registers");
+ return SuperRegs;
+ }
- // Inherit register units from subregisters.
- // Return true if the RegUnits changed.
- bool inheritRegUnits(CodeGenRegBank &RegBank);
+ // Get the list of ad hoc aliases. The graph is symmetric, so the list
+ // contains all registers in 'Aliases', and all registers that mention this
+ // register in 'Aliases'.
+ ArrayRef<CodeGenRegister *> getExplicitAliases() const {
+ return ExplicitAliases;
+ }
- // Adopt a register unit for pressure tracking.
- // A unit is adopted iff its unit number is >= NativeRegUnits.count().
- void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); }
+ // Get the topological signature of this register. This is a small integer
+ // less than RegBank.getNumTopoSigs(). Registers with the same TopoSig have
+ // identical sub-register structure. That is, they support the same set of
+ // sub-register indices mapping to the same kind of sub-registers
+ // (TopoSig-wise).
+ unsigned getTopoSig() const {
+ assert(SuperRegsComplete && "TopoSigs haven't been computed yet.");
+ return TopoSig;
+ }
- // Get the sum of this register's register unit weights.
- unsigned getWeight(const CodeGenRegBank &RegBank) const;
+ // List of register units in ascending order.
+ typedef SparseBitVector<> RegUnitList;
+ typedef SmallVector<LaneBitmask, 16> RegUnitLaneMaskList;
- // Canonically ordered set.
- typedef std::vector<const CodeGenRegister*> Vec;
+ // How many entries in RegUnitList are native?
+ RegUnitList NativeRegUnits;
- private:
- bool SubRegsComplete;
- bool SuperRegsComplete;
- unsigned TopoSig;
+ // Get the list of register units.
+ // This is only valid after computeSubRegs() completes.
+ const RegUnitList &getRegUnits() const { return RegUnits; }
- // The sub-registers explicit in the .td file form a tree.
- SmallVector<CodeGenSubRegIndex*, 8> ExplicitSubRegIndices;
- SmallVector<CodeGenRegister*, 8> ExplicitSubRegs;
+ ArrayRef<LaneBitmask> getRegUnitLaneMasks() const {
+ return ArrayRef(RegUnitLaneMasks).slice(0, NativeRegUnits.count());
+ }
- // Explicit ad hoc aliases, symmetrized to form an undirected graph.
- SmallVector<CodeGenRegister*, 8> ExplicitAliases;
+ // Get the native register units. This is a prefix of getRegUnits().
+ RegUnitList getNativeRegUnits() const { return NativeRegUnits; }
- // Super-registers where this is the first explicit sub-register.
- SuperRegList LeadingSuperRegs;
+ void setRegUnitLaneMasks(const RegUnitLaneMaskList &LaneMasks) {
+ RegUnitLaneMasks = LaneMasks;
+ }
- SubRegMap SubRegs;
- SuperRegList SuperRegs;
- DenseMap<const CodeGenRegister*, CodeGenSubRegIndex*> SubReg2Idx;
- RegUnitList RegUnits;
- RegUnitLaneMaskList RegUnitLaneMasks;
- };
+ // Inherit register units from subregisters.
+ // Return true if the RegUnits changed.
+ bool inheritRegUnits(CodeGenRegBank &RegBank);
+
+ // Adopt a register unit for pressure tracking.
+ // A unit is adopted iff its unit number is >= NativeRegUnits.count().
+ void adoptRegUnit(unsigned RUID) { RegUnits.set(RUID); }
+
+ // Get the sum of this register's register unit weights.
+ unsigned getWeight(const CodeGenRegBank &RegBank) const;
+
+ // Canonically ordered set.
+ typedef std::vector<const CodeGenRegister *> Vec;
+
+private:
+ bool SubRegsComplete;
+ bool SuperRegsComplete;
+ unsigned TopoSig;
+
+ // The sub-registers explicit in the .td file form a tree.
+ SmallVector<CodeGenSubRegIndex *, 8> ExplicitSubRegIndices;
+ SmallVector<CodeGenRegister *, 8> ExplicitSubRegs;
+
+ // Explicit ad hoc aliases, symmetrized to form an undirected graph.
+ SmallVector<CodeGenRegister *, 8> ExplicitAliases;
+
+ // Super-registers where this is the first explicit sub-register.
+ SuperRegList LeadingSuperRegs;
+
+ SubRegMap SubRegs;
+ SuperRegList SuperRegs;
+ DenseMap<const CodeGenRegister *, CodeGenSubRegIndex *> SubReg2Idx;
+ RegUnitList RegUnits;
+ RegUnitLaneMaskList RegUnitLaneMasks;
+};
+
+inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) {
+ return A.EnumValue < B.EnumValue;
+}
+
+inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) {
+ return A.EnumValue == B.EnumValue;
+}
+
+class CodeGenRegisterClass {
+ CodeGenRegister::Vec Members;
+ // Allocation orders. Order[0] always contains all registers in Members.
+ std::vector<SmallVector<Record *, 16>> Orders;
+ // Bit mask of sub-classes including this, indexed by their EnumValue.
+ BitVector SubClasses;
+ // List of super-classes, topologocally ordered to have the larger classes
+ // first. This is the same as sorting by EnumValue.
+ SmallVector<CodeGenRegisterClass *, 4> SuperClasses;
+ Record *TheDef;
+ std::string Name;
+
+ // For a synthesized class, inherit missing properties from the nearest
+ // super-class.
+ void inheritProperties(CodeGenRegBank &);
+
+ // Map SubRegIndex -> sub-class. This is the largest sub-class where all
+ // registers have a SubRegIndex sub-register.
+ DenseMap<const CodeGenSubRegIndex *, CodeGenRegisterClass *>
+ SubClassWithSubReg;
+
+ // Map SubRegIndex -> set of super-reg classes. This is all register
+ // classes SuperRC such that:
+ //
+ // R:SubRegIndex in this RC for all R in SuperRC.
+ //
+ DenseMap<const CodeGenSubRegIndex *, SmallPtrSet<CodeGenRegisterClass *, 8>>
+ SuperRegClasses;
+
+ // Bit vector of TopoSigs for the registers in this class. This will be
+ // very sparse on regular architectures.
+ BitVector TopoSigs;
+
+public:
+ unsigned EnumValue;
+ StringRef Namespace;
+ SmallVector<ValueTypeByHwMode, 4> VTs;
+ RegSizeInfoByHwMode RSI;
+ int CopyCost;
+ bool Allocatable;
+ StringRef AltOrderSelect;
+ uint8_t AllocationPriority;
+ bool GlobalPriority;
+ uint8_t TSFlags;
+ /// Contains the combination of the lane masks of all subregisters.
+ LaneBitmask LaneMask;
+ /// True if there are at least 2 subregisters which do not interfere.
+ bool HasDisjunctSubRegs;
+ bool CoveredBySubRegs;
+ /// A register class is artificial if all its members are artificial.
+ bool Artificial;
+ /// Generate register pressure set for this register class and any class
+ /// synthesized from it.
+ bool GeneratePressureSet;
+
+ // Return the Record that defined this class, or NULL if the class was
+ // created by TableGen.
+ Record *getDef() const { return TheDef; }
+
+ std::string getNamespaceQualification() const;
+ const std::string &getName() const { return Name; }
+ std::string getQualifiedName() const;
+ std::string getIdName() const;
+ std::string getQualifiedIdName() const;
+ ArrayRef<ValueTypeByHwMode> getValueTypes() const { return VTs; }
+ unsigned getNumValueTypes() const { return VTs.size(); }
+ bool hasType(const ValueTypeByHwMode &VT) const;
+
+ const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const {
+ if (VTNum < VTs.size())
+ return VTs[VTNum];
+ llvm_unreachable("VTNum greater than number of ValueTypes in RegClass!");
+ }
- inline bool operator<(const CodeGenRegister &A, const CodeGenRegister &B) {
- return A.EnumValue < B.EnumValue;
- }
-
- inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) {
- return A.EnumValue == B.EnumValue;
- }
-
- class CodeGenRegisterClass {
- CodeGenRegister::Vec Members;
- // Allocation orders. Order[0] always contains all registers in Members.
- std::vector<SmallVector<Record*, 16>> Orders;
- // Bit mask of sub-classes including this, indexed by their EnumValue.
- BitVector SubClasses;
- // List of super-classes, topologocally ordered to have the larger classes
- // first. This is the same as sorting by EnumValue.
- SmallVector<CodeGenRegisterClass*, 4> SuperClasses;
- Record *TheDef;
- std::string Name;
-
- // For a synthesized class, inherit missing properties from the nearest
- // super-class.
- void inheritProperties(CodeGenRegBank&);
-
- // Map SubRegIndex -> sub-class. This is the largest sub-class where all
- // registers have a SubRegIndex sub-register.
- DenseMap<const CodeGenSubRegIndex *, CodeGenRegisterClass *>
- SubClassWithSubReg;
-
- // Map SubRegIndex -> set of super-reg classes. This is all register
- // classes SuperRC such that:
- //
- // R:SubRegIndex in this RC for all R in SuperRC.
- //
- DenseMap<const CodeGenSubRegIndex *, SmallPtrSet<CodeGenRegisterClass *, 8>>
- SuperRegClasses;
-
- // Bit vector of TopoSigs for the registers in this class. This will be
- // very sparse on regular architectures.
- BitVector TopoSigs;
-
- public:
- unsigned EnumValue;
- StringRef Namespace;
- SmallVector<ValueTypeByHwMode, 4> VTs;
- RegSizeInfoByHwMode RSI;
- int CopyCost;
- bool Allocatable;
- StringRef AltOrderSelect;
- uint8_t AllocationPriority;
- bool GlobalPriority;
- uint8_t TSFlags;
- /// Contains the combination of the lane masks of all subregisters.
- LaneBitmask LaneMask;
- /// True if there are at least 2 subregisters which do not interfere.
- bool HasDisjunctSubRegs;
- bool CoveredBySubRegs;
- /// A register class is artificial if all its members are artificial.
- bool Artificial;
- /// Generate register pressure set for this register class and any class
- /// synthesized from it.
- bool GeneratePressureSet;
-
- // Return the Record that defined this class, or NULL if the class was
- // created by TableGen.
- Record *getDef() const { return TheDef; }
-
- std::string getNamespaceQualification() const;
- const std::string &getName() const { return Name; }
- std::string getQualifiedName() const;
- std::string getIdName() const;
- std::string getQualifiedIdName() const;
- ArrayRef<ValueTypeByHwMode> getValueTypes() const { return VTs; }
- unsigned getNumValueTypes() const { return VTs.size(); }
- bool hasType(const ValueTypeByHwMode &VT) const;
-
- const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const {
- if (VTNum < VTs.size())
- return VTs[VTNum];
- llvm_unreachable("VTNum greater than number of ValueTypes in RegClass!");
- }
+ // Return true if this class contains the register.
+ bool contains(const CodeGenRegister *) const;
- // Return true if this class contains the register.
- bool contains(const CodeGenRegister*) const;
-
- // Returns true if RC is a subclass.
- // RC is a sub-class of this class if it is a valid replacement for any
- // instruction operand where a register of this classis required. It must
- // satisfy these conditions:
- //
- // 1. All RC registers are also in this.
- // 2. The RC spill size must not be smaller than our spill size.
- // 3. RC spill alignment must be compatible with ours.
- //
- bool hasSubClass(const CodeGenRegisterClass *RC) const {
- return SubClasses.test(RC->EnumValue);
- }
+ // Returns true if RC is a subclass.
+ // RC is a sub-class of this class if it is a valid replacement for any
+ // instruction operand where a register of this classis required. It must
+ // satisfy these conditions:
+ //
+ // 1. All RC registers are also in this.
+ // 2. The RC spill size must not be smaller than our spill size.
+ // 3. RC spill alignment must be compatible with ours.
+ //
+ bool hasSubClass(const CodeGenRegisterClass *RC) const {
+ return SubClasses.test(RC->EnumValue);
+ }
- // getSubClassWithSubReg - Returns the largest sub-class where all
- // registers have a SubIdx sub-register.
- CodeGenRegisterClass *
- getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const {
- return SubClassWithSubReg.lookup(SubIdx);
- }
+ // getSubClassWithSubReg - Returns the largest sub-class where all
+ // registers have a SubIdx sub-register.
+ CodeGenRegisterClass *
+ getSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx) const {
+ return SubClassWithSubReg.lookup(SubIdx);
+ }
- /// Find largest subclass where all registers have SubIdx subregisters in
- /// SubRegClass and the largest subregister class that contains those
- /// subregisters without (as far as possible) also containing additional registers.
- ///
- /// This can be used to find a suitable pair of classes for subregister copies.
- /// \return std::pair<SubClass, SubRegClass> where SubClass is a SubClass is
- /// a class where every register has SubIdx and SubRegClass is a class where
- /// every register is covered by the SubIdx subregister of SubClass.
- std::optional<std::pair<CodeGenRegisterClass *, CodeGenRegisterClass *>>
- getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank,
- const CodeGenSubRegIndex *SubIdx) const;
-
- void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx,
- CodeGenRegisterClass *SubRC) {
- SubClassWithSubReg[SubIdx] = SubRC;
- }
+ /// Find largest subclass where all registers have SubIdx subregisters in
+ /// SubRegClass and the largest subregister class that contains those
+ /// subregisters without (as far as possible) also containing additional
+ /// registers.
+ ///
+ /// This can be used to find a suitable pair of classes for subregister
+ /// copies. \return std::pair<SubClass, SubRegClass> where SubClass is a
+ /// SubClass is a class where every register has SubIdx and SubRegClass is a
+ /// class where every register is covered by the SubIdx subregister of
+ /// SubClass.
+ std::optional<std::pair<CodeGenRegisterClass *, CodeGenRegisterClass *>>
+ getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank,
+ const CodeGenSubRegIndex *SubIdx) const;
+
+ void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx,
+ CodeGenRegisterClass *SubRC) {
+ SubClassWithSubReg[SubIdx] = SubRC;
+ }
- // getSuperRegClasses - Returns a bit vector of all register classes
- // containing only SubIdx super-registers of this class.
- void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
- BitVector &Out) const;
+ // getSuperRegClasses - Returns a bit vector of all register classes
+ // containing only SubIdx super-registers of this class.
+ void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
+ BitVector &Out) const;
- // addSuperRegClass - Add a class containing only SubIdx super-registers.
- void addSuperRegClass(CodeGenSubRegIndex *SubIdx,
- CodeGenRegisterClass *SuperRC) {
- SuperRegClasses[SubIdx].insert(SuperRC);
- }
+ // addSuperRegClass - Add a class containing only SubIdx super-registers.
+ void addSuperRegClass(CodeGenSubRegIndex *SubIdx,
+ CodeGenRegisterClass *SuperRC) {
+ SuperRegClasses[SubIdx].insert(SuperRC);
+ }
- // getSubClasses - Returns a constant BitVector of subclasses indexed by
- // EnumValue.
- // The SubClasses vector includes an entry for this class.
- const BitVector &getSubClasses() const { return SubClasses; }
+ // getSubClasses - Returns a constant BitVector of subclasses indexed by
+ // EnumValue.
+ // The SubClasses vector includes an entry for this class.
+ const BitVector &getSubClasses() const { return SubClasses; }
- // getSuperClasses - Returns a list of super classes ordered by EnumValue.
- // The array does not include an entry for this class.
- ArrayRef<CodeGenRegisterClass*> getSuperClasses() const {
- return SuperClasses;
- }
+ // getSuperClasses - Returns a list of super classes ordered by EnumValue.
+ // The array does not include an entry for this class.
+ ArrayRef<CodeGenRegisterClass *> getSuperClasses() const {
+ return SuperClasses;
+ }
- // Returns an ordered list of class members.
- // The order of registers is the same as in the .td file.
- // No = 0 is the default allocation order, No = 1 is the first alternative.
- ArrayRef<Record*> getOrder(unsigned No = 0) const {
- return Orders[No];
- }
+ // Returns an ordered list of class members.
+ // The order of registers is the same as in the .td file.
+ // No = 0 is the default allocation order, No = 1 is the first alternative.
+ ArrayRef<Record *> getOrder(unsigned No = 0) const { return Orders[No]; }
- // Return the total number of allocation orders available.
- unsigned getNumOrders() const { return Orders.size(); }
+ // Return the total number of allocation orders available.
+ unsigned getNumOrders() const { return Orders.size(); }
- // Get the set of registers. This set contains the same registers as
- // getOrder(0).
- const CodeGenRegister::Vec &getMembers() const { return Members; }
+ // Get the set of registers. This set contains the same registers as
+ // getOrder(0).
+ const CodeGenRegister::Vec &getMembers() const { return Members; }
- // Get a bit vector of TopoSigs present in this register class.
- const BitVector &getTopoSigs() const { return TopoSigs; }
+ // Get a bit vector of TopoSigs present in this register class.
+ const BitVector &getTopoSigs() const { return TopoSigs; }
- // Get a weight of this register class.
- unsigned getWeight(const CodeGenRegBank&) const;
+ // Get a weight of this register class.
+ unsigned getWeight(const CodeGenRegBank &) const;
- // Populate a unique sorted list of units from a register set.
- void buildRegUnitSet(const CodeGenRegBank &RegBank,
- std::vector<unsigned> &RegUnits) const;
+ // Populate a unique sorted list of units from a register set.
+ void buildRegUnitSet(const CodeGenRegBank &RegBank,
+ std::vector<unsigned> &RegUnits) const;
- CodeGenRegisterClass(CodeGenRegBank&, Record *R);
- CodeGenRegisterClass(CodeGenRegisterClass&) = delete;
+ CodeGenRegisterClass(CodeGenRegBank &, Record *R);
+ CodeGenRegisterClass(CodeGenRegisterClass &) = delete;
- // A key representing the parts of a register class used for forming
- // sub-classes. Note the ordering provided by this key is not the same as
- // the topological order used for the EnumValues.
- struct Key {
- const CodeGenRegister::Vec *Members;
- RegSizeInfoByHwMode RSI;
+ // A key representing the parts of a register class used for forming
+ // sub-classes. Note the ordering provided by this key is not the same as
+ // the topological order used for the EnumValues.
+ struct Key {
+ const CodeGenRegister::Vec *Members;
+ RegSizeInfoByHwMode RSI;
- Key(const CodeGenRegister::Vec *M, const RegSizeInfoByHwMode &I)
+ Key(const CodeGenRegister::Vec *M, const RegSizeInfoByHwMode &I)
: Members(M), RSI(I) {}
- Key(const CodeGenRegisterClass &RC)
+ Key(const CodeGenRegisterClass &RC)
: Members(&RC.getMembers()), RSI(RC.RSI) {}
- // Lexicographical order of (Members, RegSizeInfoByHwMode).
- bool operator<(const Key&) const;
- };
-
- // Create a non-user defined register class.
- CodeGenRegisterClass(CodeGenRegBank&, StringRef Name, Key Props);
-
- // Called by CodeGenRegBank::CodeGenRegBank().
- static void computeSubClasses(CodeGenRegBank&);
-
- // Get ordering value among register base classes.
- std::optional<int> getBaseClassOrder() const {
- if (TheDef && !TheDef->isValueUnset("BaseClassOrder"))
- return TheDef->getValueAsInt("BaseClassOrder");
- return {};
- }
+ // Lexicographical order of (Members, RegSizeInfoByHwMode).
+ bool operator<(const Key &) const;
};
- // Register categories are used when we need to deterine the category a
- // register falls into (GPR, vector, fixed, etc.) without having to know
- // specific information about the target architecture.
- class CodeGenRegisterCategory {
- Record *TheDef;
- std::string Name;
- std::list<CodeGenRegisterClass *> Classes;
+ // Create a non-user defined register class.
+ CodeGenRegisterClass(CodeGenRegBank &, StringRef Name, Key Props);
- public:
- CodeGenRegisterCategory(CodeGenRegBank &, Record *R);
- CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete;
+ // Called by CodeGenRegBank::CodeGenRegBank().
+ static void computeSubClasses(CodeGenRegBank &);
- // Return the Record that defined this class, or NULL if the class was
- // created by TableGen.
- Record *getDef() const { return TheDef; }
-
- std::string getName() const { return Name; }
- std::list<CodeGenRegisterClass *> getClasses() const { return Classes; }
- };
+ // Get ordering value among register base classes.
+ std::optional<int> getBaseClassOrder() const {
+ if (TheDef && !TheDef->isValueUnset("BaseClassOrder"))
+ return TheDef->getValueAsInt("BaseClassOrder");
+ return {};
+ }
+};
+
+// Register categories are used when we need to determine the category a
+// register falls into (GPR, vector, fixed, etc.) without having to know
+// specific information about the target architecture.
+class CodeGenRegisterCategory {
+ Record *TheDef;
+ std::string Name;
+ std::list<CodeGenRegisterClass *> Classes;
+
+public:
+ CodeGenRegisterCategory(CodeGenRegBank &, Record *R);
+ CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete;
+
+ // Return the Record that defined this class, or NULL if the class was
+ // created by TableGen.
+ Record *getDef() const { return TheDef; }
+
+ std::string getName() const { return Name; }
+ std::list<CodeGenRegisterClass *> getClasses() const { return Classes; }
+};
+
+// Register units are used to model interference and register pressure.
+// Every register is assigned one or more register units such that two
+// registers overlap if and only if they have a register unit in common.
+//
+// Normally, one register unit is created per leaf register. Non-leaf
+// registers inherit the units of their sub-registers.
+struct RegUnit {
+ // Weight assigned to this RegUnit for estimating register pressure.
+ // This is useful when equalizing weights in register classes with mixed
+ // register topologies.
+ unsigned Weight;
+
+ // Each native RegUnit corresponds to one or two root registers. The full
+ // set of registers containing this unit can be computed as the union of
+ // these two registers and their super-registers.
+ const CodeGenRegister *Roots[2];
+
+ // Index into RegClassUnitSets where we can find the list of UnitSets that
+ // contain this unit.
+ unsigned RegClassUnitSetsIdx;
+ // A register unit is artificial if at least one of its roots is
+ // artificial.
+ bool Artificial;
+
+ RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) {
+ Roots[0] = Roots[1] = nullptr;
+ }
- // Register units are used to model interference and register pressure.
- // Every register is assigned one or more register units such that two
- // registers overlap if and only if they have a register unit in common.
- //
- // Normally, one register unit is created per leaf register. Non-leaf
- // registers inherit the units of their sub-registers.
- struct RegUnit {
- // Weight assigned to this RegUnit for estimating register pressure.
- // This is useful when equalizing weights in register classes with mixed
- // register topologies.
- unsigned Weight;
-
- // Each native RegUnit corresponds to one or two root registers. The full
- // set of registers containing this unit can be computed as the union of
- // these two registers and their super-registers.
- const CodeGenRegister *Roots[2];
-
- // Index into RegClassUnitSets where we can find the list of UnitSets that
- // contain this unit.
- unsigned RegClassUnitSetsIdx;
- // A register unit is artificial if at least one of its roots is
- // artificial.
- bool Artificial;
-
- RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) {
- Roots[0] = Roots[1] = nullptr;
- }
+ ArrayRef<const CodeGenRegister *> getRoots() const {
+ assert(!(Roots[1] && !Roots[0]) && "Invalid roots array");
+ return ArrayRef(Roots, !!Roots[0] + !!Roots[1]);
+ }
+};
- ArrayRef<const CodeGenRegister*> getRoots() const {
- assert(!(Roots[1] && !Roots[0]) && "Invalid roots array");
- return ArrayRef(Roots, !!Roots[0] + !!Roots[1]);
- }
- };
+// Each RegUnitSet is a sorted vector with a name.
+struct RegUnitSet {
+ typedef std::vector<unsigned>::const_iterator iterator;
- // Each RegUnitSet is a sorted vector with a name.
- struct RegUnitSet {
- typedef std::vector<unsigned>::const_iterator iterator;
+ std::string Name;
+ std::vector<unsigned> Units;
+ unsigned Weight = 0; // Cache the sum of all unit weights.
+ unsigned Order = 0; // Cache the sort key.
- std::string Name;
- std::vector<unsigned> Units;
- unsigned Weight = 0; // Cache the sum of all unit weights.
- unsigned Order = 0; // Cache the sort key.
+ RegUnitSet() = default;
+};
- RegUnitSet() = default;
- };
+// Base vector for identifying TopoSigs. The contents uniquely identify a
+// TopoSig, only computeSuperRegs needs to know how.
+typedef SmallVector<unsigned, 16> TopoSigId;
- // Base vector for identifying TopoSigs. The contents uniquely identify a
- // TopoSig, only computeSuperRegs needs to know how.
- typedef SmallVector<unsigned, 16> TopoSigId;
+// CodeGenRegBank - Represent a target's registers and the relations between
+// them.
+class CodeGenRegBank {
+ SetTheory Sets;
- // CodeGenRegBank - Represent a target's registers and the relations between
- // them.
- class CodeGenRegBank {
- SetTheory Sets;
+ const CodeGenHwModes &CGH;
- const CodeGenHwModes &CGH;
+ std::deque<CodeGenSubRegIndex> SubRegIndices;
+ DenseMap<Record *, CodeGenSubRegIndex *> Def2SubRegIdx;
- std::deque<CodeGenSubRegIndex> SubRegIndices;
- DenseMap<Record*, CodeGenSubRegIndex*> Def2SubRegIdx;
+ CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace);
- CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace);
+ typedef std::map<SmallVector<CodeGenSubRegIndex *, 8>, CodeGenSubRegIndex *>
+ ConcatIdxMap;
+ ConcatIdxMap ConcatIdx;
- typedef std::map<SmallVector<CodeGenSubRegIndex*, 8>,
- CodeGenSubRegIndex*> ConcatIdxMap;
- ConcatIdxMap ConcatIdx;
+ // Registers.
+ std::deque<CodeGenRegister> Registers;
+ StringMap<CodeGenRegister *> RegistersByName;
+ DenseMap<Record *, CodeGenRegister *> Def2Reg;
+ unsigned NumNativeRegUnits;
- // Registers.
- std::deque<CodeGenRegister> Registers;
- StringMap<CodeGenRegister*> RegistersByName;
- DenseMap<Record*, CodeGenRegister*> Def2Reg;
- unsigned NumNativeRegUnits;
+ std::map<TopoSigId, unsigned> TopoSigs;
- std::map<TopoSigId, unsigned> TopoSigs;
+ // Includes native (0..NumNativeRegUnits-1) and adopted register units.
+ SmallVector<RegUnit, 8> RegUnits;
- // Includes native (0..NumNativeRegUnits-1) and adopted register units.
- SmallVector<RegUnit, 8> RegUnits;
+ // Register classes.
+ std::list<CodeGenRegisterClass> RegClasses;
+ DenseMap<Record *, CodeGenRegisterClass *> Def2RC;
+ typedef std::map<CodeGenRegisterClass::Key, CodeGenRegisterClass *> RCKeyMap;
+ RCKeyMap Key2RC;
- // Register classes.
- std::list<CodeGenRegisterClass> RegClasses;
- DenseMap<Record*, CodeGenRegisterClass*> Def2RC;
- typedef std::map<CodeGenRegisterClass::Key, CodeGenRegisterClass*> RCKeyMap;
- RCKeyMap Key2RC;
+ // Register categories.
+ std::list<CodeGenRegisterCategory> RegCategories;
+ DenseMap<Record *, CodeGenRegisterCategory *> Def2RCat;
+ using RCatKeyMap =
+ std::map<CodeGenRegisterClass::Key, CodeGenRegisterCategory *>;
+ RCatKeyMap Key2RCat;
- // Register categories.
- std::list<CodeGenRegisterCategory> RegCategories;
- DenseMap<Record *, CodeGenRegisterCategory *> Def2RCat;
- using RCatKeyMap =
- std::map<CodeGenRegisterClass::Key, CodeGenRegisterCategory *>;
- RCatKeyMap Key2RCat;
+ // Remember each unique set of register units. Initially, this contains a
+  // unique set for each register class. Similar sets are coalesced with
+ // pruneUnitSets and new supersets are inferred during computeRegUnitSets.
+ std::vector<RegUnitSet> RegUnitSets;
- // Remember each unique set of register units. Initially, this contains a
- // unique set for each register class. Simliar sets are coalesced with
- // pruneUnitSets and new supersets are inferred during computeRegUnitSets.
- std::vector<RegUnitSet> RegUnitSets;
+ // Map RegisterClass index to the index of the RegUnitSet that contains the
+ // class's units and any inferred RegUnit supersets.
+ //
+ // NOTE: This could grow beyond the number of register classes when we map
+ // register units to lists of unit sets. If the list of unit sets does not
+ // already exist for a register class, we create a new entry in this vector.
+ std::vector<std::vector<unsigned>> RegClassUnitSets;
- // Map RegisterClass index to the index of the RegUnitSet that contains the
- // class's units and any inferred RegUnit supersets.
- //
- // NOTE: This could grow beyond the number of register classes when we map
- // register units to lists of unit sets. If the list of unit sets does not
- // already exist for a register class, we create a new entry in this vector.
- std::vector<std::vector<unsigned>> RegClassUnitSets;
+ // Give each register unit set an order based on sorting criteria.
+ std::vector<unsigned> RegUnitSetOrder;
- // Give each register unit set an order based on sorting criteria.
- std::vector<unsigned> RegUnitSetOrder;
+ // Keep track of synthesized definitions generated in TupleExpander.
+ std::vector<std::unique_ptr<Record>> SynthDefs;
- // Keep track of synthesized definitions generated in TupleExpander.
- std::vector<std::unique_ptr<Record>> SynthDefs;
+ // Add RC to *2RC maps.
+ void addToMaps(CodeGenRegisterClass *);
- // Add RC to *2RC maps.
- void addToMaps(CodeGenRegisterClass*);
+ // Create a synthetic sub-class if it is missing.
+ CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
+ const CodeGenRegister::Vec *Membs,
+ StringRef Name);
- // Create a synthetic sub-class if it is missing.
- CodeGenRegisterClass *getOrCreateSubClass(const CodeGenRegisterClass *RC,
- const CodeGenRegister::Vec *Membs,
- StringRef Name);
+ // Infer missing register classes.
+ void computeInferredRegisterClasses();
+ void inferCommonSubClass(CodeGenRegisterClass *RC);
+ void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
- // Infer missing register classes.
- void computeInferredRegisterClasses();
- void inferCommonSubClass(CodeGenRegisterClass *RC);
- void inferSubClassWithSubReg(CodeGenRegisterClass *RC);
+ void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
+ inferMatchingSuperRegClass(RC, RegClasses.begin());
+ }
- void inferMatchingSuperRegClass(CodeGenRegisterClass *RC) {
- inferMatchingSuperRegClass(RC, RegClasses.begin());
- }
+ void inferMatchingSuperRegClass(
+ CodeGenRegisterClass *RC,
+ std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
- void inferMatchingSuperRegClass(
- CodeGenRegisterClass *RC,
- std::list<CodeGenRegisterClass>::iterator FirstSubRegRC);
+ // Iteratively prune unit sets.
+ void pruneUnitSets();
- // Iteratively prune unit sets.
- void pruneUnitSets();
+ // Compute a weight for each register unit created during getSubRegs.
+ void computeRegUnitWeights();
- // Compute a weight for each register unit created during getSubRegs.
- void computeRegUnitWeights();
+ // Create a RegUnitSet for each RegClass and infer superclasses.
+ void computeRegUnitSets();
- // Create a RegUnitSet for each RegClass and infer superclasses.
- void computeRegUnitSets();
+ // Populate the Composite map from sub-register relationships.
+ void computeComposites();
- // Populate the Composite map from sub-register relationships.
- void computeComposites();
+ // Compute a lane mask for each sub-register index.
+ void computeSubRegLaneMasks();
- // Compute a lane mask for each sub-register index.
- void computeSubRegLaneMasks();
+ /// Computes a lane mask for each register unit enumerated by a physical
+ /// register.
+ void computeRegUnitLaneMasks();
- /// Computes a lane mask for each register unit enumerated by a physical
- /// register.
- void computeRegUnitLaneMasks();
+public:
+ CodeGenRegBank(RecordKeeper &, const CodeGenHwModes &);
+ CodeGenRegBank(CodeGenRegBank &) = delete;
- public:
- CodeGenRegBank(RecordKeeper&, const CodeGenHwModes&);
- CodeGenRegBank(CodeGenRegBank&) = delete;
+ SetTheory &getSets() { return Sets; }
- SetTheory &getSets() { return Sets; }
+ const CodeGenHwModes &getHwModes() const { return CGH; }
- const CodeGenHwModes &getHwModes() const { return CGH; }
+ // Sub-register indices. The first NumNamedIndices are defined by the user
+ // in the .td files. The rest are synthesized such that all sub-registers
+ // have a unique name.
+ const std::deque<CodeGenSubRegIndex> &getSubRegIndices() const {
+ return SubRegIndices;
+ }
- // Sub-register indices. The first NumNamedIndices are defined by the user
- // in the .td files. The rest are synthesized such that all sub-registers
- // have a unique name.
- const std::deque<CodeGenSubRegIndex> &getSubRegIndices() const {
- return SubRegIndices;
- }
+ // Find a SubRegIndex from its Record def or add to the list if it does
+ // not exist there yet.
+ CodeGenSubRegIndex *getSubRegIdx(Record *);
- // Find a SubRegIndex from its Record def or add to the list if it does
- // not exist there yet.
- CodeGenSubRegIndex *getSubRegIdx(Record*);
+ // Find a SubRegIndex from its Record def.
+ const CodeGenSubRegIndex *findSubRegIdx(const Record *Def) const;
- // Find a SubRegIndex from its Record def.
- const CodeGenSubRegIndex *findSubRegIdx(const Record* Def) const;
+ // Find or create a sub-register index representing the A+B composition.
+ CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A,
+ CodeGenSubRegIndex *B);
- // Find or create a sub-register index representing the A+B composition.
- CodeGenSubRegIndex *getCompositeSubRegIndex(CodeGenSubRegIndex *A,
- CodeGenSubRegIndex *B);
+ // Find or create a sub-register index representing the concatenation of
+ // non-overlapping sibling indices.
+ CodeGenSubRegIndex *
+ getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex *, 8> &);
- // Find or create a sub-register index representing the concatenation of
- // non-overlapping sibling indices.
- CodeGenSubRegIndex *
- getConcatSubRegIndex(const SmallVector<CodeGenSubRegIndex *, 8>&);
+ const std::deque<CodeGenRegister> &getRegisters() const { return Registers; }
- const std::deque<CodeGenRegister> &getRegisters() const {
- return Registers;
- }
+ const StringMap<CodeGenRegister *> &getRegistersByName() const {
+ return RegistersByName;
+ }
- const StringMap<CodeGenRegister *> &getRegistersByName() const {
- return RegistersByName;
- }
+ // Find a register from its Record def.
+ CodeGenRegister *getReg(Record *);
- // Find a register from its Record def.
- CodeGenRegister *getReg(Record*);
+ // Get a Register's index into the Registers array.
+ unsigned getRegIndex(const CodeGenRegister *Reg) const {
+ return Reg->EnumValue - 1;
+ }
- // Get a Register's index into the Registers array.
- unsigned getRegIndex(const CodeGenRegister *Reg) const {
- return Reg->EnumValue - 1;
- }
+ // Return the number of allocated TopoSigs. The first TopoSig representing
+ // leaf registers is allocated number 0.
+ unsigned getNumTopoSigs() const { return TopoSigs.size(); }
- // Return the number of allocated TopoSigs. The first TopoSig representing
- // leaf registers is allocated number 0.
- unsigned getNumTopoSigs() const {
- return TopoSigs.size();
- }
+ // Find or create a TopoSig for the given TopoSigId.
+ // This function is only for use by CodeGenRegister::computeSuperRegs().
+ // Others should simply use Reg->getTopoSig().
+ unsigned getTopoSig(const TopoSigId &Id) {
+ return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second;
+ }
- // Find or create a TopoSig for the given TopoSigId.
- // This function is only for use by CodeGenRegister::computeSuperRegs().
- // Others should simply use Reg->getTopoSig().
- unsigned getTopoSig(const TopoSigId &Id) {
- return TopoSigs.insert(std::make_pair(Id, TopoSigs.size())).first->second;
- }
+ // Create a native register unit that is associated with one or two root
+ // registers.
+ unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) {
+ RegUnit &RU = RegUnits.emplace_back();
+ RU.Roots[0] = R0;
+ RU.Roots[1] = R1;
+ RU.Artificial = R0->Artificial;
+ if (R1)
+ RU.Artificial |= R1->Artificial;
+ return RegUnits.size() - 1;
+ }
- // Create a native register unit that is associated with one or two root
- // registers.
- unsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) {
- RegUnits.resize(RegUnits.size() + 1);
- RegUnit &RU = RegUnits.back();
- RU.Roots[0] = R0;
- RU.Roots[1] = R1;
- RU.Artificial = R0->Artificial;
- if (R1)
- RU.Artificial |= R1->Artificial;
- return RegUnits.size() - 1;
- }
+ // Create a new non-native register unit that can be adopted by a register
+ // to increase its pressure. Note that NumNativeRegUnits is not increased.
+ unsigned newRegUnit(unsigned Weight) {
+ RegUnit &RU = RegUnits.emplace_back();
+ RU.Weight = Weight;
+ return RegUnits.size() - 1;
+ }
- // Create a new non-native register unit that can be adopted by a register
- // to increase its pressure. Note that NumNativeRegUnits is not increased.
- unsigned newRegUnit(unsigned Weight) {
- RegUnits.resize(RegUnits.size() + 1);
- RegUnits.back().Weight = Weight;
- return RegUnits.size() - 1;
- }
+ // Native units are the singular unit of a leaf register. Register aliasing
+ // is completely characterized by native units. Adopted units exist to give
+ // register additional weight but don't affect aliasing.
+ bool isNativeUnit(unsigned RUID) const { return RUID < NumNativeRegUnits; }
- // Native units are the singular unit of a leaf register. Register aliasing
- // is completely characterized by native units. Adopted units exist to give
- // register additional weight but don't affect aliasing.
- bool isNativeUnit(unsigned RUID) const {
- return RUID < NumNativeRegUnits;
- }
+ unsigned getNumNativeRegUnits() const { return NumNativeRegUnits; }
- unsigned getNumNativeRegUnits() const {
- return NumNativeRegUnits;
- }
+ RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; }
+ const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; }
- RegUnit &getRegUnit(unsigned RUID) { return RegUnits[RUID]; }
- const RegUnit &getRegUnit(unsigned RUID) const { return RegUnits[RUID]; }
+ std::list<CodeGenRegisterClass> &getRegClasses() { return RegClasses; }
- std::list<CodeGenRegisterClass> &getRegClasses() { return RegClasses; }
+ const std::list<CodeGenRegisterClass> &getRegClasses() const {
+ return RegClasses;
+ }
- const std::list<CodeGenRegisterClass> &getRegClasses() const {
- return RegClasses;
- }
+ std::list<CodeGenRegisterCategory> &getRegCategories() {
+ return RegCategories;
+ }
- std::list<CodeGenRegisterCategory> &getRegCategories() {
- return RegCategories;
- }
+ const std::list<CodeGenRegisterCategory> &getRegCategories() const {
+ return RegCategories;
+ }
- const std::list<CodeGenRegisterCategory> &getRegCategories() const {
- return RegCategories;
- }
+ // Find a register class from its def.
+ CodeGenRegisterClass *getRegClass(const Record *) const;
+
+ /// getRegisterClassForRegister - Find the register class that contains the
+ /// specified physical register. If the register is not in a register
+ /// class, return null. If the register is in multiple classes, and the
+ /// classes have a superset-subset relationship and the same set of types,
+ /// return the superclass. Otherwise return null.
+ const CodeGenRegisterClass *getRegClassForRegister(Record *R);
+
+ // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike
+ // getRegClassForRegister, this tries to find the smallest class containing
+ // the physical register. If \p VT is specified, it will only find classes
+ // with a matching type
+ const CodeGenRegisterClass *
+ getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr);
+
+ // Get the sum of unit weights.
+ unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const {
+ unsigned Weight = 0;
+ for (unsigned Unit : Units)
+ Weight += getRegUnit(Unit).Weight;
+ return Weight;
+ }
- // Find a register class from its def.
- CodeGenRegisterClass *getRegClass(const Record *) const;
-
- /// getRegisterClassForRegister - Find the register class that contains the
- /// specified physical register. If the register is not in a register
- /// class, return null. If the register is in multiple classes, and the
- /// classes have a superset-subset relationship and the same set of types,
- /// return the superclass. Otherwise return null.
- const CodeGenRegisterClass* getRegClassForRegister(Record *R);
-
- // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike
- // getRegClassForRegister, this tries to find the smallest class containing
- // the physical register. If \p VT is specified, it will only find classes
- // with a matching type
- const CodeGenRegisterClass *
- getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr);
-
- // Get the sum of unit weights.
- unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const {
- unsigned Weight = 0;
- for (unsigned Unit : Units)
- Weight += getRegUnit(Unit).Weight;
- return Weight;
- }
+ unsigned getRegSetIDAt(unsigned Order) const {
+ return RegUnitSetOrder[Order];
+ }
- unsigned getRegSetIDAt(unsigned Order) const {
- return RegUnitSetOrder[Order];
- }
+ const RegUnitSet &getRegSetAt(unsigned Order) const {
+ return RegUnitSets[RegUnitSetOrder[Order]];
+ }
- const RegUnitSet &getRegSetAt(unsigned Order) const {
- return RegUnitSets[RegUnitSetOrder[Order]];
- }
+ // Increase a RegUnitWeight.
+ void increaseRegUnitWeight(unsigned RUID, unsigned Inc) {
+ getRegUnit(RUID).Weight += Inc;
+ }
- // Increase a RegUnitWeight.
- void increaseRegUnitWeight(unsigned RUID, unsigned Inc) {
- getRegUnit(RUID).Weight += Inc;
- }
+ // Get the number of register pressure dimensions.
+ unsigned getNumRegPressureSets() const { return RegUnitSets.size(); }
- // Get the number of register pressure dimensions.
- unsigned getNumRegPressureSets() const { return RegUnitSets.size(); }
+ // Get a set of register unit IDs for a given dimension of pressure.
+ const RegUnitSet &getRegPressureSet(unsigned Idx) const {
+ return RegUnitSets[Idx];
+ }
- // Get a set of register unit IDs for a given dimension of pressure.
- const RegUnitSet &getRegPressureSet(unsigned Idx) const {
- return RegUnitSets[Idx];
- }
+  // The number of pressure set lists may be larger than the number of
+ // register classes if some register units appeared in a list of sets that
+ // did not correspond to an existing register class.
+ unsigned getNumRegClassPressureSetLists() const {
+ return RegClassUnitSets.size();
+ }
- // The number of pressure set lists may be larget than the number of
- // register classes if some register units appeared in a list of sets that
- // did not correspond to an existing register class.
- unsigned getNumRegClassPressureSetLists() const {
- return RegClassUnitSets.size();
- }
+ // Get a list of pressure set IDs for a register class. Liveness of a
+ // register in this class impacts each pressure set in this list by the
+ // weight of the register. An exact solution requires all registers in a
+ // class to have the same class, but it is not strictly guaranteed.
+ ArrayRef<unsigned> getRCPressureSetIDs(unsigned RCIdx) const {
+ return RegClassUnitSets[RCIdx];
+ }
- // Get a list of pressure set IDs for a register class. Liveness of a
- // register in this class impacts each pressure set in this list by the
- // weight of the register. An exact solution requires all registers in a
- // class to have the same class, but it is not strictly guaranteed.
- ArrayRef<unsigned> getRCPressureSetIDs(unsigned RCIdx) const {
- return RegClassUnitSets[RCIdx];
- }
+ // Computed derived records such as missing sub-register indices.
+ void computeDerivedInfo();
- // Computed derived records such as missing sub-register indices.
- void computeDerivedInfo();
-
- // Compute the set of registers completely covered by the registers in Regs.
- // The returned BitVector will have a bit set for each register in Regs,
- // all sub-registers, and all super-registers that are covered by the
- // registers in Regs.
- //
- // This is used to compute the mask of call-preserved registers from a list
- // of callee-saves.
- BitVector computeCoveredRegisters(ArrayRef<Record*> Regs);
-
- // Bit mask of lanes that cover their registers. A sub-register index whose
- // LaneMask is contained in CoveringLanes will be completely covered by
- // another sub-register with the same or larger lane mask.
- LaneBitmask CoveringLanes;
-
- // Helper function for printing debug information. Handles artificial
- // (non-native) reg units.
- void printRegUnitName(unsigned Unit) const;
- };
+ // Compute the set of registers completely covered by the registers in Regs.
+ // The returned BitVector will have a bit set for each register in Regs,
+ // all sub-registers, and all super-registers that are covered by the
+ // registers in Regs.
+ //
+ // This is used to compute the mask of call-preserved registers from a list
+ // of callee-saves.
+ BitVector computeCoveredRegisters(ArrayRef<Record *> Regs);
+
+ // Bit mask of lanes that cover their registers. A sub-register index whose
+ // LaneMask is contained in CoveringLanes will be completely covered by
+ // another sub-register with the same or larger lane mask.
+ LaneBitmask CoveringLanes;
+
+ // Helper function for printing debug information. Handles artificial
+ // (non-native) reg units.
+ void printRegUnitName(unsigned Unit) const;
+};
} // end namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp
index 54463da19821..e56bf5bdee63 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -51,7 +51,7 @@ struct InstrsOp : public SetTheory::Operator {
// (instregex "OpcPat",...) Find all instructions matching an opcode pattern.
struct InstRegexOp : public SetTheory::Operator {
const CodeGenTarget &Target;
- InstRegexOp(const CodeGenTarget &t): Target(t) {}
+ InstRegexOp(const CodeGenTarget &t) : Target(t) {}
/// Remove any text inside of parentheses from S.
static std::string removeParens(llvm::StringRef S) {
@@ -182,8 +182,8 @@ struct InstRegexOp : public SetTheory::Operator {
/// CodeGenModels ctor interprets machine model records and populates maps.
CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
- const CodeGenTarget &TGT):
- Records(RK), Target(TGT) {
+ const CodeGenTarget &TGT)
+ : Records(RK), Target(TGT) {
Sets.addFieldExpander("InstRW", "Instrs");
@@ -298,9 +298,8 @@ static APInt constructOperandMask(ArrayRef<int64_t> Indices) {
return OperandMask;
}
-static void
-processSTIPredicate(STIPredicateFunction &Fn,
- const ProcModelMapTy &ProcModelMap) {
+static void processSTIPredicate(STIPredicateFunction &Fn,
+ const ProcModelMapTy &ProcModelMap) {
DenseMap<const Record *, unsigned> Opcode2Index;
using OpcodeMapPair = std::pair<const Record *, OpcodeInfo>;
std::vector<OpcodeMapPair> OpcodeMappings;
@@ -380,30 +379,29 @@ processSTIPredicate(STIPredicateFunction &Fn,
// Sort OpcodeMappings elements based on their CPU and predicate masks.
// As a last resort, order elements by opcode identifier.
- llvm::sort(OpcodeMappings,
- [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) {
- unsigned LhsIdx = Opcode2Index[Lhs.first];
- unsigned RhsIdx = Opcode2Index[Rhs.first];
- const std::pair<APInt, APInt> &LhsMasks = OpcodeMasks[LhsIdx];
- const std::pair<APInt, APInt> &RhsMasks = OpcodeMasks[RhsIdx];
-
- auto PopulationCountAndLeftBit =
- [](const APInt &Other) -> std::pair<int, int> {
- return std::pair<int, int>(Other.popcount(),
- -Other.countl_zero());
- };
- auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first);
- auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first);
- if (lhsmask_first != rhsmask_first)
- return lhsmask_first < rhsmask_first;
-
- auto lhsmask_second = PopulationCountAndLeftBit(LhsMasks.second);
- auto rhsmask_second = PopulationCountAndLeftBit(RhsMasks.second);
- if (lhsmask_second != rhsmask_second)
- return lhsmask_second < rhsmask_second;
-
- return LhsIdx < RhsIdx;
- });
+ llvm::sort(
+ OpcodeMappings, [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) {
+ unsigned LhsIdx = Opcode2Index[Lhs.first];
+ unsigned RhsIdx = Opcode2Index[Rhs.first];
+ const std::pair<APInt, APInt> &LhsMasks = OpcodeMasks[LhsIdx];
+ const std::pair<APInt, APInt> &RhsMasks = OpcodeMasks[RhsIdx];
+
+ auto PopulationCountAndLeftBit =
+ [](const APInt &Other) -> std::pair<int, int> {
+ return std::pair<int, int>(Other.popcount(), -Other.countl_zero());
+ };
+ auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first);
+ auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first);
+ if (lhsmask_first != rhsmask_first)
+ return lhsmask_first < rhsmask_first;
+
+ auto lhsmask_second = PopulationCountAndLeftBit(LhsMasks.second);
+ auto rhsmask_second = PopulationCountAndLeftBit(RhsMasks.second);
+ if (lhsmask_second != rhsmask_second)
+ return lhsmask_second < rhsmask_second;
+
+ return LhsIdx < RhsIdx;
+ });
// Now construct opcode groups. Groups are used by the SubtargetEmitter when
// expanding the body of a STIPredicate function. In particular, each opcode
@@ -498,8 +496,7 @@ void CodeGenSchedModels::collectLoadStoreQueueInfo() {
CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel"));
if (Queue->isSubClassOf("LoadQueue")) {
if (PM.LoadQueue) {
- PrintError(Queue->getLoc(),
- "Expected a single LoadQueue definition");
+ PrintError(Queue->getLoc(), "Expected a single LoadQueue definition");
PrintNote(PM.LoadQueue->getLoc(),
"Previous definition of LoadQueue was here");
}
@@ -509,8 +506,7 @@ void CodeGenSchedModels::collectLoadStoreQueueInfo() {
if (Queue->isSubClassOf("StoreQueue")) {
if (PM.StoreQueue) {
- PrintError(Queue->getLoc(),
- "Expected a single StoreQueue definition");
+ PrintError(Queue->getLoc(), "Expected a single StoreQueue definition");
PrintNote(PM.StoreQueue->getLoc(),
"Previous definition of StoreQueue was here");
}
@@ -542,14 +538,15 @@ void CodeGenSchedModels::collectProcModels() {
// Check for duplicated names.
auto I = std::adjacent_find(ProcRecords.begin(), ProcRecords.end(),
[](const Record *Rec1, const Record *Rec2) {
- return Rec1->getValueAsString("Name") == Rec2->getValueAsString("Name");
- });
+ return Rec1->getValueAsString("Name") ==
+ Rec2->getValueAsString("Name");
+ });
if (I != ProcRecords.end())
PrintFatalError((*I)->getLoc(), "Duplicate processor name " +
- (*I)->getValueAsString("Name"));
+ (*I)->getValueAsString("Name"));
// Reserve space because we can. Reallocation would be ok.
- ProcModels.reserve(ProcRecords.size()+1);
+ ProcModels.reserve(ProcRecords.size() + 1);
// Use idx=0 for NoModel/NoItineraries.
Record *NoModelDef = Records.getDef("NoSchedModel");
@@ -574,8 +571,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) {
if (ModelKey->isSubClassOf("SchedMachineModel")) {
Record *ItinsDef = ModelKey->getValueAsDef("Itineraries");
ProcModels.emplace_back(ProcModels.size(), Name, ModelKey, ItinsDef);
- }
- else {
+ } else {
// An itinerary is defined without a machine model. Infer a new model.
if (!ModelKey->getValueAsListOfDefs("IID").empty())
Name = Name + "Model";
@@ -587,7 +583,7 @@ void CodeGenSchedModels::addProcModel(Record *ProcDef) {
// Recursively find all reachable SchedReadWrite records.
static void scanSchedRW(Record *RWDef, RecVec &RWDefs,
- SmallPtrSet<Record*, 16> &RWSet) {
+ SmallPtrSet<Record *, 16> &RWSet) {
if (!RWSet.insert(RWDef).second)
return;
RWDefs.push_back(RWDef);
@@ -596,8 +592,7 @@ static void scanSchedRW(Record *RWDef, RecVec &RWDefs,
RecVec Seq = RWDef->getValueAsListOfDefs("Writes");
for (Record *WSRec : Seq)
scanSchedRW(WSRec, RWDefs, RWSet);
- }
- else if (RWDef->isSubClassOf("SchedVariant")) {
+ } else if (RWDef->isSubClassOf("SchedVariant")) {
// Visit each variant (guarded by a different predicate).
RecVec Vars = RWDef->getValueAsListOfDefs("Variants");
for (Record *Variant : Vars) {
@@ -616,7 +611,7 @@ void CodeGenSchedModels::collectSchedRW() {
SchedWrites.resize(1);
SchedReads.resize(1);
- SmallPtrSet<Record*, 16> RWSet;
+ SmallPtrSet<Record *, 16> RWSet;
// Find all SchedReadWrites referenced by instruction defs.
RecVec SWDefs, SRDefs;
@@ -673,8 +668,7 @@ void CodeGenSchedModels::collectSchedRW() {
if (!AliasDef->isSubClassOf("SchedWrite"))
PrintFatalError(ADef->getLoc(), "SchedWrite Alias must be SchedWrite");
scanSchedRW(AliasDef, SWDefs, RWSet);
- }
- else {
+ } else {
assert(MatchDef->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
if (!AliasDef->isSubClassOf("SchedRead"))
PrintFatalError(ADef->getLoc(), "SchedRead Alias must be SchedRead");
@@ -690,7 +684,7 @@ void CodeGenSchedModels::collectSchedRW() {
}
llvm::sort(SRDefs, LessRecord());
for (Record *SRDef : SRDefs) {
- assert(!getSchedRWIdx(SRDef, /*IsRead-*/true) && "duplicate SchedWrite");
+ assert(!getSchedRWIdx(SRDef, /*IsRead-*/ true) && "duplicate SchedWrite");
SchedReads.emplace_back(SchedReads.size(), SRDef);
}
// Initialize WriteSequence vectors.
@@ -753,9 +747,9 @@ unsigned CodeGenSchedModels::getSchedRWIdx(const Record *Def,
}
bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const {
- for (auto& ProcModel : ProcModels) {
+ for (auto &ProcModel : ProcModels) {
const RecVec &RADefs = ProcModel.ReadAdvanceDefs;
- for (auto& RADef : RADefs) {
+ for (auto &RADef : RADefs) {
RecVec ValidWrites = RADef->getValueAsListOfDefs("ValidWrites");
if (is_contained(ValidWrites, WriteDef))
return true;
@@ -764,8 +758,8 @@ bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const {
return false;
}
-static void splitSchedReadWrites(const RecVec &RWDefs,
- RecVec &WriteDefs, RecVec &ReadDefs) {
+static void splitSchedReadWrites(const RecVec &RWDefs, RecVec &WriteDefs,
+ RecVec &ReadDefs) {
for (Record *RWDef : RWDefs) {
if (RWDef->isSubClassOf("SchedWrite"))
WriteDefs.push_back(RWDef);
@@ -777,8 +771,8 @@ static void splitSchedReadWrites(const RecVec &RWDefs,
}
// Split the SchedReadWrites defs and call findRWs for each list.
-void CodeGenSchedModels::findRWs(const RecVec &RWDefs,
- IdxVec &Writes, IdxVec &Reads) const {
+void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &Writes,
+ IdxVec &Reads) const {
RecVec WriteDefs;
RecVec ReadDefs;
splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs);
@@ -803,8 +797,7 @@ void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq,
RWSeq.push_back(RWIdx);
return;
}
- int Repeat =
- SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1;
+ int Repeat = SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1;
for (int i = 0; i < Repeat; ++i) {
for (unsigned I : SchedRW.Sequence) {
expandRWSequence(I, RWSeq, IsRead);
@@ -815,8 +808,8 @@ void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq,
// Expand a SchedWrite as a sequence following any aliases that coincide with
// the given processor model.
void CodeGenSchedModels::expandRWSeqForProc(
- unsigned RWIdx, IdxVec &RWSeq, bool IsRead,
- const CodeGenProcModel &ProcModel) const {
+ unsigned RWIdx, IdxVec &RWSeq, bool IsRead,
+ const CodeGenProcModel &ProcModel) const {
const CodeGenSchedRW &SchedWrite = getSchedRW(RWIdx, IsRead);
Record *AliasDef = nullptr;
@@ -828,14 +821,16 @@ void CodeGenSchedModels::expandRWSeqForProc(
continue;
}
if (AliasDef)
- PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases "
- "defined for processor " + ProcModel.ModelName +
- " Ensure only one SchedAlias exists per RW.");
+ PrintFatalError(AliasRW.TheDef->getLoc(),
+ "Multiple aliases "
+ "defined for processor " +
+ ProcModel.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
AliasDef = AliasRW.TheDef;
}
if (AliasDef) {
- expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead),
- RWSeq, IsRead,ProcModel);
+ expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead), RWSeq, IsRead,
+ ProcModel);
return;
}
if (!SchedWrite.IsSequence) {
@@ -843,7 +838,7 @@ void CodeGenSchedModels::expandRWSeqForProc(
return;
}
int Repeat =
- SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1;
+ SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1;
for (int I = 0, E = Repeat; I < E; ++I) {
for (unsigned Idx : SchedWrite.Sequence) {
expandRWSeqForProc(Idx, RWSeq, IsRead, ProcModel);
@@ -888,8 +883,7 @@ void CodeGenSchedModels::collectSchedClasses() {
// NoItinerary is always the first class at Idx=0
assert(SchedClasses.empty() && "Expected empty sched class");
- SchedClasses.emplace_back(0, "NoInstrModel",
- Records.getDef("NoItinerary"));
+ SchedClasses.emplace_back(0, "NoInstrModel", Records.getDef("NoItinerary"));
SchedClasses.back().ProcIndices.push_back(0);
// Create a SchedClass for each unique combination of itinerary class and
@@ -901,7 +895,7 @@ void CodeGenSchedModels::collectSchedClasses() {
findRWs(Inst->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
// ProcIdx == 0 indicates the class applies to all processors.
- unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/{0});
+ unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/ {0});
InstrClassMap[Inst->TheDef] = SCIdx;
}
// Create classes for InstRW defs.
@@ -933,7 +927,8 @@ void CodeGenSchedModels::collectSchedClasses() {
}
CodeGenSchedClass &SC = getSchedClass(SCIdx);
if (SC.ProcIndices[0] != 0)
- PrintFatalError(Inst->TheDef->getLoc(), "Instruction's sched class "
+ PrintFatalError(Inst->TheDef->getLoc(),
+ "Instruction's sched class "
"must not be subtarget specific.");
IdxVec ProcIndices;
@@ -962,8 +957,7 @@ void CodeGenSchedModels::collectSchedClasses() {
<< InstName);
IdxVec Writes;
IdxVec Reads;
- findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"),
- Writes, Reads);
+ findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
LLVM_DEBUG({
for (unsigned WIdx : Writes)
dbgs() << " " << SchedWrites[WIdx].Name;
@@ -1032,25 +1026,23 @@ unsigned CodeGenSchedModels::addSchedClass(Record *ItinClassDef,
assert(!ProcIndices.empty() && "expect at least one ProcIdx");
auto IsKeyEqual = [=](const CodeGenSchedClass &SC) {
- return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads);
- };
+ return SC.isKeyEqual(ItinClassDef, OperWrites, OperReads);
+ };
auto I = find_if(make_range(schedClassBegin(), schedClassEnd()), IsKeyEqual);
unsigned Idx = I == schedClassEnd() ? 0 : std::distance(schedClassBegin(), I);
if (Idx || SchedClasses[0].isKeyEqual(ItinClassDef, OperWrites, OperReads)) {
IdxVec PI;
std::set_union(SchedClasses[Idx].ProcIndices.begin(),
- SchedClasses[Idx].ProcIndices.end(),
- ProcIndices.begin(), ProcIndices.end(),
- std::back_inserter(PI));
+ SchedClasses[Idx].ProcIndices.end(), ProcIndices.begin(),
+ ProcIndices.end(), std::back_inserter(PI));
SchedClasses[Idx].ProcIndices = std::move(PI);
return Idx;
}
Idx = SchedClasses.size();
- SchedClasses.emplace_back(Idx,
- createSchedClassName(ItinClassDef, OperWrites,
- OperReads),
- ItinClassDef);
+ SchedClasses.emplace_back(
+ Idx, createSchedClassName(ItinClassDef, OperWrites, OperReads),
+ ItinClassDef);
CodeGenSchedClass &SC = SchedClasses.back();
SC.Writes = OperWrites;
SC.Reads = OperReads;
@@ -1083,17 +1075,16 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
// the Instrs to it.
for (auto &Entry : ClassInstrs) {
unsigned OldSCIdx = Entry.first;
- ArrayRef<Record*> InstDefs = Entry.second;
+ ArrayRef<Record *> InstDefs = Entry.second;
// If the all instrs in the current class are accounted for, then leave
// them mapped to their old class.
if (OldSCIdx) {
const RecVec &RWDefs = SchedClasses[OldSCIdx].InstRWs;
if (!RWDefs.empty()) {
const RecVec *OrigInstDefs = Sets.expand(RWDefs[0]);
- unsigned OrigNumInstrs =
- count_if(*OrigInstDefs, [&](Record *OIDef) {
- return InstrClassMap[OIDef] == OldSCIdx;
- });
+ unsigned OrigNumInstrs = count_if(*OrigInstDefs, [&](Record *OIDef) {
+ return InstrClassMap[OIDef] == OldSCIdx;
+ });
if (OrigNumInstrs == InstDefs.size()) {
assert(SchedClasses[OldSCIdx].ProcIndices[0] == 0 &&
"expected a generic SchedClass");
@@ -1148,8 +1139,7 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
"\".");
PrintFatalNote(OldRWDef->getLoc(), "Previous match was here.");
}
- assert(OldRWDef != InstRWDef &&
- "SchedClass has duplicate InstRW def");
+ assert(OldRWDef != InstRWDef && "SchedClass has duplicate InstRW def");
SC.InstRWs.push_back(OldRWDef);
}
}
@@ -1162,7 +1152,8 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
// True if collectProcItins found anything.
bool CodeGenSchedModels::hasItineraries() const {
- for (const CodeGenProcModel &PM : make_range(procModelBegin(),procModelEnd()))
+ for (const CodeGenProcModel &PM :
+ make_range(procModelBegin(), procModelEnd()))
if (PM.hasItineraries())
return true;
return false;
@@ -1217,14 +1208,14 @@ void CodeGenSchedModels::collectProcItins() {
void CodeGenSchedModels::collectProcItinRW() {
RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW");
llvm::sort(ItinRWDefs, LessRecord());
- for (Record *RWDef : ItinRWDefs) {
+ for (Record *RWDef : ItinRWDefs) {
if (!RWDef->getValueInit("SchedModel")->isComplete())
PrintFatalError(RWDef->getLoc(), "SchedModel is undefined");
Record *ModelDef = RWDef->getValueAsDef("SchedModel");
ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
if (I == ProcModelMap.end()) {
- PrintFatalError(RWDef->getLoc(), "Undefined SchedMachineModel "
- + ModelDef->getName());
+ PrintFatalError(RWDef->getLoc(),
+ "Undefined SchedMachineModel " + ModelDef->getName());
}
ProcModels[I->second].ItinRWDefs.push_back(RWDef);
}
@@ -1254,10 +1245,10 @@ void CodeGenSchedModels::inferSchedClasses() {
if (!SchedClasses[Idx].InstRWs.empty())
inferFromInstRWs(Idx);
if (!SchedClasses[Idx].Writes.empty()) {
- inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads,
- Idx, SchedClasses[Idx].ProcIndices);
+ inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads, Idx,
+ SchedClasses[Idx].ProcIndices);
}
- assert(SchedClasses.size() < (NumInstrSchedClasses*6) &&
+ assert(SchedClasses.size() < (NumInstrSchedClasses * 6) &&
"too many SchedVariants");
}
}
@@ -1274,9 +1265,9 @@ void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef,
if (!llvm::is_contained(Matched, ItinClassDef))
continue;
if (HasMatch)
- PrintFatalError(Rec->getLoc(), "Duplicate itinerary class "
- + ItinClassDef->getName()
- + " in ItinResources for " + PM.ModelName);
+ PrintFatalError(Rec->getLoc(),
+ "Duplicate itinerary class " + ItinClassDef->getName() +
+ " in ItinResources for " + PM.ModelName);
HasMatch = true;
IdxVec Writes, Reads;
findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
@@ -1317,8 +1308,8 @@ struct TransVariant {
unsigned ProcIdx; // Processor model index or zero for any.
unsigned TransVecIdx; // Index into PredTransitions::TransVec.
- TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti):
- VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {}
+ TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti)
+ : VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {}
};
// Associate a predicate with the SchedReadWrite that it guards.
@@ -1328,15 +1319,16 @@ struct PredCheck {
unsigned RWIdx;
Record *Predicate;
- PredCheck(bool r, unsigned w, Record *p): IsRead(r), RWIdx(w), Predicate(p) {}
+ PredCheck(bool r, unsigned w, Record *p)
+ : IsRead(r), RWIdx(w), Predicate(p) {}
};
// A Predicate transition is a list of RW sequences guarded by a PredTerm.
struct PredTransition {
// A predicate term is a conjunction of PredChecks.
SmallVector<PredCheck, 4> PredTerm;
- SmallVector<SmallVector<unsigned,4>, 16> WriteSequences;
- SmallVector<SmallVector<unsigned,4>, 16> ReadSequences;
+ SmallVector<SmallVector<unsigned, 4>, 16> WriteSequences;
+ SmallVector<SmallVector<unsigned, 4>, 16> ReadSequences;
unsigned ProcIndex = 0;
PredTransition() = default;
@@ -1354,7 +1346,7 @@ class PredTransitions {
public:
std::vector<PredTransition> TransVec;
- PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {}
+ PredTransitions(CodeGenSchedModels &sm) : SchedModels(sm) {}
bool substituteVariantOperand(const SmallVectorImpl<unsigned> &RWSeq,
bool IsRead, unsigned StartIdx);
@@ -1368,9 +1360,8 @@ public:
private:
bool mutuallyExclusive(Record *PredDef, ArrayRef<Record *> Preds,
ArrayRef<PredCheck> Term);
- void getIntersectingVariants(
- const CodeGenSchedRW &SchedRW, unsigned TransIdx,
- std::vector<TransVariant> &IntersectingVariants);
+ void getIntersectingVariants(const CodeGenSchedRW &SchedRW, unsigned TransIdx,
+ std::vector<TransVariant> &IntersectingVariants);
void pushVariant(const TransVariant &VInfo, bool IsRead);
};
@@ -1388,7 +1379,7 @@ private:
bool PredTransitions::mutuallyExclusive(Record *PredDef,
ArrayRef<Record *> Preds,
ArrayRef<PredCheck> Term) {
- for (const PredCheck &PC: Term) {
+ for (const PredCheck &PC : Term) {
if (PC.Predicate == PredDef)
return false;
@@ -1446,8 +1437,8 @@ static std::vector<Record *> getAllPredicates(ArrayRef<TransVariant> Variants,
// given SchedRW whose processor indices and predicates are not mutually
// exclusive with the given transition.
void PredTransitions::getIntersectingVariants(
- const CodeGenSchedRW &SchedRW, unsigned TransIdx,
- std::vector<TransVariant> &IntersectingVariants) {
+ const CodeGenSchedRW &SchedRW, unsigned TransIdx,
+ std::vector<TransVariant> &IntersectingVariants) {
bool GenericRW = false;
@@ -1489,7 +1480,7 @@ void PredTransitions::getIntersectingVariants(
}
const CodeGenSchedRW &AliasRW =
- SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
if (AliasRW.HasVariants) {
const RecVec VarDefs = AliasRW.TheDef->getValueAsListOfDefs("Variants");
@@ -1516,8 +1507,7 @@ void PredTransitions::getIntersectingVariants(
// The first variant builds on the existing transition.
Variant.TransVecIdx = TransIdx;
IntersectingVariants.push_back(Variant);
- }
- else {
+ } else {
// Push another copy of the current transition for more variants.
Variant.TransVecIdx = TransVec.size();
IntersectingVariants.push_back(Variant);
@@ -1525,15 +1515,15 @@ void PredTransitions::getIntersectingVariants(
}
}
if (GenericRW && IntersectingVariants.empty()) {
- PrintFatalError(SchedRW.TheDef->getLoc(), "No variant of this type has "
+ PrintFatalError(SchedRW.TheDef->getLoc(),
+ "No variant of this type has "
"a matching predicate on any processor");
}
}
// Push the Reads/Writes selected by this variant onto the PredTransition
// specified by VInfo.
-void PredTransitions::
-pushVariant(const TransVariant &VInfo, bool IsRead) {
+void PredTransitions::pushVariant(const TransVariant &VInfo, bool IsRead) {
PredTransition &Trans = TransVec[VInfo.TransVecIdx];
// If this operand transition is reached through a processor-specific alias,
@@ -1541,11 +1531,10 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
IdxVec SelectedRWs;
if (VInfo.VarOrSeqDef->isSubClassOf("SchedVar")) {
Record *PredDef = VInfo.VarOrSeqDef->getValueAsDef("Predicate");
- Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx,PredDef);
+ Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx, PredDef);
RecVec SelectedDefs = VInfo.VarOrSeqDef->getValueAsListOfDefs("Selected");
SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead);
- }
- else {
+ } else {
assert(VInfo.VarOrSeqDef->isSubClassOf("WriteSequence") &&
"variant must be a SchedVariant or aliased WriteSequence");
SelectedRWs.push_back(SchedModels.getSchedRWIdx(VInfo.VarOrSeqDef, IsRead));
@@ -1553,10 +1542,10 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(VInfo.RWIdx, IsRead);
- SmallVectorImpl<SmallVector<unsigned,4>> &RWSequences = IsRead
- ? Trans.ReadSequences : Trans.WriteSequences;
+ SmallVectorImpl<SmallVector<unsigned, 4>> &RWSequences =
+ IsRead ? Trans.ReadSequences : Trans.WriteSequences;
if (SchedRW.IsVariadic) {
- unsigned OperIdx = RWSequences.size()-1;
+ unsigned OperIdx = RWSequences.size() - 1;
// Make N-1 copies of this transition's last sequence.
RWSequences.reserve(RWSequences.size() + SelectedRWs.size() - 1);
RWSequences.insert(RWSequences.end(), SelectedRWs.size() - 1,
@@ -1565,8 +1554,8 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
// sequence (split the current operand into N operands).
// Note that write sequences should be expanded within this loop--the entire
// sequence belongs to a single operand.
- for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end();
- RWI != RWE; ++RWI, ++OperIdx) {
+ for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end(); RWI != RWE;
+ ++RWI, ++OperIdx) {
IdxVec ExpandedRWs;
if (IsRead)
ExpandedRWs.push_back(*RWI);
@@ -1575,8 +1564,7 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
llvm::append_range(RWSequences[OperIdx], ExpandedRWs);
}
assert(OperIdx == RWSequences.size() && "missed a sequence");
- }
- else {
+ } else {
// Push this transition's expanded sequence onto this transition's last
// sequence (add to the current operand's sequence).
SmallVectorImpl<unsigned> &Seq = RWSequences.back();
@@ -1644,8 +1632,9 @@ bool PredTransitions::substituteVariants(const PredTransition &Trans) {
// Visit each original write sequence.
for (const auto &WriteSequence : Trans.WriteSequences) {
// Push a new (empty) write sequence onto all partial Transitions.
- for (std::vector<PredTransition>::iterator I =
- TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
+ for (std::vector<PredTransition>::iterator I = TransVec.begin() + StartIdx,
+ E = TransVec.end();
+ I != E; ++I) {
I->WriteSequences.emplace_back();
}
Subst |=
@@ -1654,8 +1643,9 @@ bool PredTransitions::substituteVariants(const PredTransition &Trans) {
// Visit each original read sequence.
for (const auto &ReadSequence : Trans.ReadSequences) {
// Push a new (empty) read sequence onto all partial Transitions.
- for (std::vector<PredTransition>::iterator I =
- TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
+ for (std::vector<PredTransition>::iterator I = TransVec.begin() + StartIdx,
+ E = TransVec.end();
+ I != E; ++I) {
I->ReadSequences.emplace_back();
}
Subst |= substituteVariantOperand(ReadSequence, /*IsRead=*/true, StartIdx);
@@ -1798,7 +1788,7 @@ void CodeGenSchedModels::inferFromRW(ArrayRef<unsigned> OperWrites,
for (const PredTransition &Trans : LastTransitions)
SubstitutedAny |= Transitions.substituteVariants(Trans);
LLVM_DEBUG(Transitions.dump());
- LastTransitions.swap(Transitions.TransVec);
+ LastTransitions = std::move(Transitions.TransVec);
} while (SubstitutedAny);
// WARNING: We are about to mutate the SchedClasses vector. Do not refer to
@@ -1814,7 +1804,7 @@ bool CodeGenSchedModels::hasSuperGroup(RecVec &SubUnits, CodeGenProcModel &PM) {
continue;
RecVec SuperUnits = ProcResourceDef->getValueAsListOfDefs("Resources");
RecIter RI = SubUnits.begin(), RE = SubUnits.end();
- for ( ; RI != RE; ++RI) {
+ for (; RI != RE; ++RI) {
if (!is_contained(SuperUnits, *RI)) {
break;
}
@@ -1831,22 +1821,22 @@ void CodeGenSchedModels::verifyProcResourceGroups(CodeGenProcModel &PM) {
if (!PM.ProcResourceDefs[i]->isSubClassOf("ProcResGroup"))
continue;
RecVec CheckUnits =
- PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources");
- for (unsigned j = i+1; j < e; ++j) {
+ PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources");
+ for (unsigned j = i + 1; j < e; ++j) {
if (!PM.ProcResourceDefs[j]->isSubClassOf("ProcResGroup"))
continue;
RecVec OtherUnits =
- PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources");
+ PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources");
if (std::find_first_of(CheckUnits.begin(), CheckUnits.end(),
- OtherUnits.begin(), OtherUnits.end())
- != CheckUnits.end()) {
+ OtherUnits.begin(),
+ OtherUnits.end()) != CheckUnits.end()) {
// CheckUnits and OtherUnits overlap
llvm::append_range(OtherUnits, CheckUnits);
if (!hasSuperGroup(OtherUnits, PM)) {
PrintFatalError((PM.ProcResourceDefs[i])->getLoc(),
- "proc resource group overlaps with "
- + PM.ProcResourceDefs[j]->getName()
- + " but no supergroup contains both.");
+ "proc resource group overlaps with " +
+ PM.ProcResourceDefs[j]->getName() +
+ " but no supergroup contains both.");
}
}
}
@@ -1862,7 +1852,7 @@ void CodeGenSchedModels::collectRegisterFiles() {
// For each register file definition, construct a CodeGenRegisterFile object
// and add it to the appropriate scheduling model.
CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel"));
- PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF));
+ PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(), RF));
CodeGenRegisterFile &CGRF = PM.RegisterFiles.back();
CGRF.MaxMovesEliminatedPerCycle =
RF->getValueAsInt("MaxMovesEliminatedPerCycle");
@@ -2013,7 +2003,7 @@ void CodeGenSchedModels::checkCompleteness() {
PrintError(Inst->TheDef->getLoc(),
"No schedule information for instruction '" +
Inst->TheDef->getName() + "' in SchedMachineModel '" +
- ProcModel.ModelDef->getName() + "'");
+ ProcModel.ModelDef->getName() + "'");
Complete = false;
}
continue;
@@ -2039,14 +2029,18 @@ void CodeGenSchedModels::checkCompleteness() {
}
}
if (!Complete) {
- errs() << "\n\nIncomplete schedule models found.\n"
- << "- Consider setting 'CompleteModel = 0' while developing new models.\n"
- << "- Pseudo instructions can be marked with 'hasNoSchedulingInfo = 1'.\n"
- << "- Instructions should usually have Sched<[...]> as a superclass, "
- "you may temporarily use an empty list.\n"
- << "- Instructions related to unsupported features can be excluded with "
- "list<Predicate> UnsupportedFeatures = [HasA,..,HasY]; in the "
- "processor model.\n\n";
+ errs()
+ << "\n\nIncomplete schedule models found.\n"
+ << "- Consider setting 'CompleteModel = 0' while developing new "
+ "models.\n"
+ << "- Pseudo instructions can be marked with 'hasNoSchedulingInfo = "
+ "1'.\n"
+ << "- Instructions should usually have Sched<[...]> as a superclass, "
+ "you may temporarily use an empty list.\n"
+ << "- Instructions related to unsupported features can be excluded "
+ "with "
+ "list<Predicate> UnsupportedFeatures = [HasA,..,HasY]; in the "
+ "processor model.\n\n";
PrintFatalError("Incomplete schedule model");
}
}
@@ -2057,15 +2051,15 @@ void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) {
const CodeGenProcModel &PM = ProcModels[PIdx];
// For all ItinRW entries.
bool HasMatch = false;
- for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end();
- II != IE; ++II) {
+ for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end(); II != IE;
+ ++II) {
RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
if (!llvm::is_contained(Matched, ItinClassDef))
continue;
if (HasMatch)
- PrintFatalError((*II)->getLoc(), "Duplicate itinerary class "
- + ItinClassDef->getName()
- + " in ItinResources for " + PM.ModelName);
+ PrintFatalError((*II)->getLoc(),
+ "Duplicate itinerary class " + ItinClassDef->getName() +
+ " in ItinResources for " + PM.ModelName);
HasMatch = true;
IdxVec Writes, Reads;
findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
@@ -2081,8 +2075,7 @@ void CodeGenSchedModels::collectRWResources(unsigned RWIdx, bool IsRead,
if (!IsRead && SchedRW.TheDef->isSubClassOf("SchedWriteRes")) {
for (unsigned Idx : ProcIndices)
addWriteRes(SchedRW.TheDef, Idx);
- }
- else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) {
+ } else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) {
for (unsigned Idx : ProcIndices)
addReadAdvance(SchedRW.TheDef, Idx);
}
@@ -2128,31 +2121,30 @@ Record *CodeGenSchedModels::findProcResUnits(Record *ProcResKind,
assert(!ProcResGroups.empty());
for (Record *ProcResDef : ProcResourceDefs) {
- if (ProcResDef->getValueAsDef("Kind") == ProcResKind
- && ProcResDef->getValueAsDef("SchedModel") == PM.ModelDef) {
+ if (ProcResDef->getValueAsDef("Kind") == ProcResKind &&
+ ProcResDef->getValueAsDef("SchedModel") == PM.ModelDef) {
if (ProcUnitDef) {
PrintFatalError(Loc,
- "Multiple ProcessorResourceUnits associated with "
- + ProcResKind->getName());
+ "Multiple ProcessorResourceUnits associated with " +
+ ProcResKind->getName());
}
ProcUnitDef = ProcResDef;
}
}
for (Record *ProcResGroup : ProcResGroups) {
- if (ProcResGroup == ProcResKind
- && ProcResGroup->getValueAsDef("SchedModel") == PM.ModelDef) {
+ if (ProcResGroup == ProcResKind &&
+ ProcResGroup->getValueAsDef("SchedModel") == PM.ModelDef) {
if (ProcUnitDef) {
PrintFatalError(Loc,
- "Multiple ProcessorResourceUnits associated with "
- + ProcResKind->getName());
+ "Multiple ProcessorResourceUnits associated with " +
+ ProcResKind->getName());
}
ProcUnitDef = ProcResGroup;
}
}
if (!ProcUnitDef) {
- PrintFatalError(Loc,
- "No ProcessorResources associated with "
- + ProcResKind->getName());
+ PrintFatalError(Loc, "No ProcessorResources associated with " +
+ ProcResKind->getName());
}
return ProcUnitDef;
}
@@ -2208,14 +2200,16 @@ unsigned CodeGenProcModel::getProcResourceIdx(Record *PRDef) const {
RecIter PRPos = find(ProcResourceDefs, PRDef);
if (PRPos == ProcResourceDefs.end())
PrintFatalError(PRDef->getLoc(), "ProcResource def is not included in "
- "the ProcResources list for " + ModelName);
+ "the ProcResources list for " +
+ ModelName);
// Idx=0 is reserved for invalid.
return 1 + (PRPos - ProcResourceDefs.begin());
}
bool CodeGenProcModel::isUnsupported(const CodeGenInstruction &Inst) const {
for (const Record *TheDef : UnsupportedFeaturesDefs) {
- for (const Record *PredDef : Inst.TheDef->getValueAsListOfDefs("Predicates")) {
+ for (const Record *PredDef :
+ Inst.TheDef->getValueAsListOfDefs("Predicates")) {
if (TheDef->getName() == PredDef->getName())
return true;
}
@@ -2239,12 +2233,11 @@ void CodeGenSchedRW::dump() const {
}
}
-void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const {
- dbgs() << "SCHEDCLASS " << Index << ":" << Name << '\n'
- << " Writes: ";
+void CodeGenSchedClass::dump(const CodeGenSchedModels *SchedModels) const {
+ dbgs() << "SCHEDCLASS " << Index << ":" << Name << '\n' << " Writes: ";
for (unsigned i = 0, N = Writes.size(); i < N; ++i) {
SchedModels->getSchedWrite(Writes[i]).dump();
- if (i < N-1) {
+ if (i < N - 1) {
dbgs() << '\n';
dbgs().indent(10);
}
@@ -2252,12 +2245,13 @@ void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const {
dbgs() << "\n Reads: ";
for (unsigned i = 0, N = Reads.size(); i < N; ++i) {
SchedModels->getSchedRead(Reads[i]).dump();
- if (i < N-1) {
+ if (i < N - 1) {
dbgs() << '\n';
dbgs().indent(10);
}
}
- dbgs() << "\n ProcIdx: "; dumpIdxVec(ProcIndices);
+ dbgs() << "\n ProcIdx: ";
+ dumpIdxVec(ProcIndices);
if (!Transitions.empty()) {
dbgs() << "\n Transitions for Proc ";
for (const CodeGenSchedTransition &Transition : Transitions) {
diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h
index 76ef1e439530..61980e7e196e 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.h
+++ b/llvm/utils/TableGen/CodeGenSchedule.h
@@ -33,8 +33,8 @@ class CodeGenTarget;
class CodeGenSchedModels;
class CodeGenInstruction;
-using RecVec = std::vector<Record*>;
-using RecIter = std::vector<Record*>::const_iterator;
+using RecVec = std::vector<Record *>;
+using RecIter = std::vector<Record *>::const_iterator;
using IdxVec = std::vector<unsigned>;
using IdxIter = std::vector<unsigned>::const_iterator;
@@ -59,10 +59,10 @@ struct CodeGenSchedRW {
RecVec Aliases;
CodeGenSchedRW()
- : Index(0), TheDef(nullptr), IsRead(false), IsAlias(false),
- HasVariants(false), IsVariadic(false), IsSequence(false) {}
+ : Index(0), TheDef(nullptr), IsRead(false), IsAlias(false),
+ HasVariants(false), IsVariadic(false), IsSequence(false) {}
CodeGenSchedRW(unsigned Idx, Record *Def)
- : Index(Idx), TheDef(Def), IsAlias(false), IsVariadic(false) {
+ : Index(Idx), TheDef(Def), IsAlias(false), IsVariadic(false) {
Name = std::string(Def->getName());
IsRead = Def->isSubClassOf("SchedRead");
HasVariants = Def->isSubClassOf("SchedVariant");
@@ -148,7 +148,7 @@ struct CodeGenSchedClass {
DenseSet<unsigned> InstRWProcIndices;
CodeGenSchedClass(unsigned Index, std::string Name, Record *ItinClassDef)
- : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {}
+ : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {}
bool isKeyEqual(Record *IC, ArrayRef<unsigned> W,
ArrayRef<unsigned> R) const {
@@ -173,7 +173,8 @@ struct CodeGenRegisterCost {
Record *RCDef;
unsigned Cost;
bool AllowMoveElimination;
- CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false)
+ CodeGenRegisterCost(Record *RC, unsigned RegisterCost,
+ bool AllowMoveElim = false)
: RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {}
CodeGenRegisterCost(const CodeGenRegisterCost &) = default;
CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete;
@@ -193,12 +194,12 @@ struct CodeGenRegisterFile {
unsigned NumPhysRegs;
std::vector<CodeGenRegisterCost> Costs;
- CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0,
+ CodeGenRegisterFile(StringRef name, Record *def,
+ unsigned MaxMoveElimPerCy = 0,
bool AllowZeroMoveElimOnly = false)
: Name(name), RegisterFileDef(def),
MaxMovesEliminatedPerCycle(MaxMoveElimPerCy),
- AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly),
- NumPhysRegs(0) {}
+ AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), NumPhysRegs(0) {}
bool hasDefaultCosts() const { return Costs.empty(); }
};
@@ -255,10 +256,9 @@ struct CodeGenProcModel {
Record *LoadQueue;
Record *StoreQueue;
- CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
- Record *IDef) :
- Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
- RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
+ CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef, Record *IDef)
+ : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
+ RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
bool hasItineraries() const {
return !ItinsDef->getValueAsListOfDefs("IID").empty();
@@ -443,14 +443,14 @@ class CodeGenSchedModels {
// Map each instruction to its unique SchedClass index considering the
// combination of it's itinerary class, SchedRW list, and InstRW records.
- using InstClassMapTy = DenseMap<Record*, unsigned>;
+ using InstClassMapTy = DenseMap<Record *, unsigned>;
InstClassMapTy InstrClassMap;
std::vector<STIPredicateFunction> STIPredicates;
std::vector<unsigned> getAllProcIndices() const;
public:
- CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT);
+ CodeGenSchedModels(RecordKeeper &RK, const CodeGenTarget &TGT);
// iterator access to the scheduling classes.
using class_iterator = std::vector<CodeGenSchedClass>::iterator;
@@ -460,10 +460,10 @@ public:
class_iterator classes_end() { return SchedClasses.end(); }
const_class_iterator classes_end() const { return SchedClasses.end(); }
iterator_range<class_iterator> classes() {
- return make_range(classes_begin(), classes_end());
+ return make_range(classes_begin(), classes_end());
}
iterator_range<const_class_iterator> classes() const {
- return make_range(classes_begin(), classes_end());
+ return make_range(classes_begin(), classes_end());
}
iterator_range<class_iterator> explicit_classes() {
return make_range(classes_begin(), classes_begin() + NumInstrSchedClasses);
@@ -476,8 +476,8 @@ public:
Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
if (!ItinsDef->getValueAsListOfDefs("IID").empty()) {
- assert(ModelDef->getValueAsBit("NoModel")
- && "Itineraries must be defined within SchedMachineModel");
+ assert(ModelDef->getValueAsBit("NoModel") &&
+ "Itineraries must be defined within SchedMachineModel");
return ItinsDef;
}
return ModelDef;
@@ -496,7 +496,7 @@ public:
return ProcModels[I->second];
}
const CodeGenProcModel &getProcModel(Record *ModelDef) const {
- return const_cast<CodeGenSchedModels*>(this)->getProcModel(ModelDef);
+ return const_cast<CodeGenSchedModels *>(this)->getProcModel(ModelDef);
}
// Iterate over the unique processor models.
@@ -527,11 +527,11 @@ public:
CodeGenSchedRW &getSchedRW(Record *Def) {
bool IsRead = Def->isSubClassOf("SchedRead");
unsigned Idx = getSchedRWIdx(Def, IsRead);
- return const_cast<CodeGenSchedRW&>(
- IsRead ? getSchedRead(Idx) : getSchedWrite(Idx));
+ return const_cast<CodeGenSchedRW &>(IsRead ? getSchedRead(Idx)
+ : getSchedWrite(Idx));
}
const CodeGenSchedRW &getSchedRW(Record *Def) const {
- return const_cast<CodeGenSchedModels&>(*this).getSchedRW(Def);
+ return const_cast<CodeGenSchedModels &>(*this).getSchedRW(Def);
}
unsigned getSchedRWIdx(const Record *Def, bool IsRead) const;
@@ -579,6 +579,7 @@ public:
ArrayRef<STIPredicateFunction> getSTIPredicates() const {
return STIPredicates;
}
+
private:
void collectProcModels();
diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index ceaa51b1dff3..8e2957efe854 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -49,10 +49,14 @@ MVT::SimpleValueType llvm::getValueType(const Record *Rec) {
StringRef llvm::getName(MVT::SimpleValueType T) {
switch (T) {
- case MVT::Other: return "UNKNOWN";
- case MVT::iPTR: return "TLI.getPointerTy()";
- case MVT::iPTRAny: return "TLI.getPointerTy()";
- default: return getEnumName(T);
+ case MVT::Other:
+ return "UNKNOWN";
+ case MVT::iPTR:
+ return "TLI.getPointerTy()";
+ case MVT::iPTRAny:
+ return "TLI.getPointerTy()";
+ default:
+ return getEnumName(T);
}
}
@@ -280,12 +284,11 @@ std::string llvm::getQualifiedName(const Record *R) {
return Namespace + "::" + R->getName().str();
}
-
/// getTarget - Return the current instance of the Target class.
///
CodeGenTarget::CodeGenTarget(RecordKeeper &records)
- : Records(records), CGH(records) {
- std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target");
+ : Records(records), CGH(records) {
+ std::vector<Record *> Targets = Records.getAllDerivedDefinitions("Target");
if (Targets.size() == 0)
PrintFatalError("No 'Target' subclasses defined!");
if (Targets.size() != 1)
@@ -294,8 +297,7 @@ CodeGenTarget::CodeGenTarget(RecordKeeper &records)
MacroFusions = Records.getAllDerivedDefinitions("Fusion");
}
-CodeGenTarget::~CodeGenTarget() {
-}
+CodeGenTarget::~CodeGenTarget() {}
StringRef CodeGenTarget::getName() const { return TargetRec->getName(); }
@@ -331,7 +333,7 @@ bool CodeGenTarget::getAllowRegisterRenaming() const {
/// getAsmParser - Return the AssemblyParser definition for this target.
///
Record *CodeGenTarget::getAsmParser() const {
- std::vector<Record*> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers");
+ std::vector<Record *> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers");
if (AsmParserNum >= LI.size())
PrintFatalError("Target does not have an AsmParser #" +
Twine(AsmParserNum) + "!");
@@ -342,8 +344,8 @@ Record *CodeGenTarget::getAsmParser() const {
/// this target.
///
Record *CodeGenTarget::getAsmParserVariant(unsigned i) const {
- std::vector<Record*> LI =
- TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
+ std::vector<Record *> LI =
+ TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
if (i >= LI.size())
PrintFatalError("Target does not have an AsmParserVariant #" + Twine(i) +
"!");
@@ -354,15 +356,15 @@ Record *CodeGenTarget::getAsmParserVariant(unsigned i) const {
/// available for this target.
///
unsigned CodeGenTarget::getAsmParserVariantCount() const {
- std::vector<Record*> LI =
- TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
+ std::vector<Record *> LI =
+ TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
return LI.size();
}
/// getAsmWriter - Return the AssemblyWriter definition for this target.
///
Record *CodeGenTarget::getAsmWriter() const {
- std::vector<Record*> LI = TargetRec->getValueAsListOfDefs("AssemblyWriters");
+ std::vector<Record *> LI = TargetRec->getValueAsListOfDefs("AssemblyWriters");
if (AsmWriterNum >= LI.size())
PrintFatalError("Target does not have an AsmWriter #" +
Twine(AsmWriterNum) + "!");
@@ -437,8 +439,7 @@ const CodeGenRegisterClass &CodeGenTarget::getRegisterClass(Record *R) const {
return *getRegBank().getRegClass(R);
}
-std::vector<ValueTypeByHwMode> CodeGenTarget::getRegisterVTs(Record *R)
- const {
+std::vector<ValueTypeByHwMode> CodeGenTarget::getRegisterVTs(Record *R) const {
const CodeGenRegister *Reg = getRegBank().getReg(R);
std::vector<ValueTypeByHwMode> Result;
for (const auto &RC : getRegBank().getRegClasses()) {
@@ -454,16 +455,15 @@ std::vector<ValueTypeByHwMode> CodeGenTarget::getRegisterVTs(Record *R)
return Result;
}
-
void CodeGenTarget::ReadLegalValueTypes() const {
for (const auto &RC : getRegBank().getRegClasses())
llvm::append_range(LegalValueTypes, RC.VTs);
// Remove duplicates.
llvm::sort(LegalValueTypes);
- LegalValueTypes.erase(std::unique(LegalValueTypes.begin(),
- LegalValueTypes.end()),
- LegalValueTypes.end());
+ LegalValueTypes.erase(
+ std::unique(LegalValueTypes.begin(), LegalValueTypes.end()),
+ LegalValueTypes.end());
}
CodeGenSchedModels &CodeGenTarget::getSchedModels() const {
@@ -473,7 +473,7 @@ CodeGenSchedModels &CodeGenTarget::getSchedModels() const {
}
void CodeGenTarget::ReadInstructions() const {
- std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction");
+ std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Instruction");
if (Insts.size() <= 2)
PrintFatalError("No 'Instruction' subclasses defined!");
@@ -482,11 +482,10 @@ void CodeGenTarget::ReadInstructions() const {
Instructions[Insts[i]] = std::make_unique<CodeGenInstruction>(Insts[i]);
}
-static const CodeGenInstruction *
-GetInstByName(const char *Name,
- const DenseMap<const Record*,
- std::unique_ptr<CodeGenInstruction>> &Insts,
- RecordKeeper &Records) {
+static const CodeGenInstruction *GetInstByName(
+ const char *Name,
+ const DenseMap<const Record *, std::unique_ptr<CodeGenInstruction>> &Insts,
+ RecordKeeper &Records) {
const Record *Rec = Records.getDef(Name);
const auto I = Insts.find(Rec);
@@ -545,7 +544,6 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
Inst->EnumVal = Num++;
}
-
/// isLittleEndianEncoding - Return whether this target encodes its instruction
/// in little-endian format, i.e. bits laid out in the order [0..n]
///
@@ -576,7 +574,7 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() {
unsigned bitSwapIdx = numBits - bit - 1;
Init *OrigBit = BI->getBit(bit);
Init *BitSwap = BI->getBit(bitSwapIdx);
- NewBits[bit] = BitSwap;
+ NewBits[bit] = BitSwap;
NewBits[bitSwapIdx] = OrigBit;
}
if (numBits % 2) {
@@ -605,10 +603,10 @@ bool CodeGenTarget::guessInstructionProperties() const {
// ComplexPattern implementation
//
ComplexPattern::ComplexPattern(Record *R) {
- Ty = R->getValueAsDef("Ty");
+ Ty = R->getValueAsDef("Ty");
NumOperands = R->getValueAsInt("NumOperands");
SelectFunc = std::string(R->getValueAsString("SelectFunc"));
- RootNodes = R->getValueAsListOfDefs("RootNodes");
+ RootNodes = R->getValueAsListOfDefs("RootNodes");
// FIXME: This is a hack to statically increase the priority of patterns which
// maps a sub-dag to a complex pattern. e.g. favors LEA over ADD. To get best
@@ -623,7 +621,7 @@ ComplexPattern::ComplexPattern(Record *R) {
// FIXME: Why is this different from parseSDPatternOperatorProperties?
// Parse the properties.
Properties = 0;
- std::vector<Record*> PropList = R->getValueAsListOfDefs("Properties");
+ std::vector<Record *> PropList = R->getValueAsListOfDefs("Properties");
for (unsigned i = 0, e = PropList.size(); i != e; ++i)
if (PropList[i]->getName() == "SDNPHasChain") {
Properties |= 1 << SDNPHasChain;
diff --git a/llvm/utils/TableGen/CodeGenTarget.h b/llvm/utils/TableGen/CodeGenTarget.h
index 29f10243e1ef..2ae3a3a2204d 100644
--- a/llvm/utils/TableGen/CodeGenTarget.h
+++ b/llvm/utils/TableGen/CodeGenTarget.h
@@ -58,10 +58,10 @@ class CodeGenTarget {
RecordKeeper &Records;
Record *TargetRec;
- mutable DenseMap<const Record*,
- std::unique_ptr<CodeGenInstruction>> Instructions;
+ mutable DenseMap<const Record *, std::unique_ptr<CodeGenInstruction>>
+ Instructions;
mutable std::unique_ptr<CodeGenRegBank> RegBank;
- mutable std::vector<Record*> RegAltNameIndices;
+ mutable std::vector<Record *> RegAltNameIndices;
mutable SmallVector<ValueTypeByHwMode, 8> LegalValueTypes;
CodeGenHwModes CGH;
std::vector<Record *> MacroFusions;
@@ -75,6 +75,7 @@ class CodeGenTarget {
mutable StringRef InstNamespace;
mutable std::vector<const CodeGenInstruction *> InstrsByEnum;
mutable unsigned NumPseudoInstructions = 0;
+
public:
CodeGenTarget(RecordKeeper &Records);
~CodeGenTarget();
@@ -130,8 +131,9 @@ public:
/// return it.
const CodeGenRegister *getRegisterByName(StringRef Name) const;
- const std::vector<Record*> &getRegAltNameIndices() const {
- if (RegAltNameIndices.empty()) ReadRegAltNameIndices();
+ const std::vector<Record *> &getRegAltNameIndices() const {
+ if (RegAltNameIndices.empty())
+ ReadRegAltNameIndices();
return RegAltNameIndices;
}
@@ -156,15 +158,17 @@ public:
const std::vector<Record *> getMacroFusions() const { return MacroFusions; }
private:
- DenseMap<const Record*, std::unique_ptr<CodeGenInstruction>> &
+ DenseMap<const Record *, std::unique_ptr<CodeGenInstruction>> &
getInstructions() const {
- if (Instructions.empty()) ReadInstructions();
+ if (Instructions.empty())
+ ReadInstructions();
return Instructions;
}
-public:
+public:
CodeGenInstruction &getInstruction(const Record *InstRec) const {
- if (Instructions.empty()) ReadInstructions();
+ if (Instructions.empty())
+ ReadInstructions();
auto I = Instructions.find(InstRec);
assert(I != Instructions.end() && "Not an instruction");
return *I->second;
@@ -200,10 +204,11 @@ public:
}
typedef ArrayRef<const CodeGenInstruction *>::const_iterator inst_iterator;
- inst_iterator inst_begin() const{return getInstructionsByEnumValue().begin();}
+ inst_iterator inst_begin() const {
+ return getInstructionsByEnumValue().begin();
+ }
inst_iterator inst_end() const { return getInstructionsByEnumValue().end(); }
-
/// isLittleEndianEncoding - are instruction bit patterns defined as [0..n]?
///
bool isLittleEndianEncoding() const;
@@ -226,22 +231,21 @@ class ComplexPattern {
Record *Ty;
unsigned NumOperands;
std::string SelectFunc;
- std::vector<Record*> RootNodes;
+ std::vector<Record *> RootNodes;
unsigned Properties; // Node properties
unsigned Complexity;
+
public:
ComplexPattern(Record *R);
Record *getValueType() const { return Ty; }
unsigned getNumOperands() const { return NumOperands; }
const std::string &getSelectFunc() const { return SelectFunc; }
- const std::vector<Record*> &getRootNodes() const {
- return RootNodes;
- }
+ const std::vector<Record *> &getRootNodes() const { return RootNodes; }
bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); }
unsigned getComplexity() const { return Complexity; }
};
-} // End llvm namespace
+} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp
index eaf7f7f9f0a3..336cee09b90c 100644
--- a/llvm/utils/TableGen/DAGISelEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelEmitter.cpp
@@ -27,6 +27,7 @@ namespace {
class DAGISelEmitter {
RecordKeeper &Records; // Just so we can get at the timing functions.
CodeGenDAGPatterns CGP;
+
public:
explicit DAGISelEmitter(RecordKeeper &R) : Records(R), CGP(R) {}
void run(raw_ostream &OS);
@@ -37,39 +38,41 @@ public:
// DAGISelEmitter Helper methods
//
-/// getResultPatternCost - Compute the number of instructions for this pattern.
+/// Compute the number of instructions for this pattern.
/// This is a temporary hack. We should really include the instruction
/// latencies in this calculation.
-static unsigned getResultPatternCost(TreePatternNode *P,
- CodeGenDAGPatterns &CGP) {
- if (P->isLeaf()) return 0;
+static unsigned getResultPatternCost(TreePatternNode &P,
+ const CodeGenDAGPatterns &CGP) {
+ if (P.isLeaf())
+ return 0;
unsigned Cost = 0;
- Record *Op = P->getOperator();
+ Record *Op = P.getOperator();
if (Op->isSubClassOf("Instruction")) {
Cost++;
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
if (II.usesCustomInserter)
Cost += 10;
}
- for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i)
- Cost += getResultPatternCost(P->getChild(i), CGP);
+ for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i)
+ Cost += getResultPatternCost(P.getChild(i), CGP);
return Cost;
}
/// getResultPatternCodeSize - Compute the code size of instructions for this
/// pattern.
-static unsigned getResultPatternSize(TreePatternNode *P,
- CodeGenDAGPatterns &CGP) {
- if (P->isLeaf()) return 0;
+static unsigned getResultPatternSize(TreePatternNode &P,
+ const CodeGenDAGPatterns &CGP) {
+ if (P.isLeaf())
+ return 0;
unsigned Cost = 0;
- Record *Op = P->getOperator();
+ Record *Op = P.getOperator();
if (Op->isSubClassOf("Instruction")) {
Cost += Op->getValueAsInt("CodeSize");
}
- for (unsigned i = 0, e = P->getNumChildren(); i != e; ++i)
- Cost += getResultPatternSize(P->getChild(i), CGP);
+ for (unsigned i = 0, e = P.getNumChildren(); i != e; ++i)
+ Cost += getResultPatternSize(P.getChild(i), CGP);
return Cost;
}
@@ -82,11 +85,11 @@ struct PatternSortingPredicate {
CodeGenDAGPatterns &CGP;
bool operator()(const PatternToMatch *LHS, const PatternToMatch *RHS) {
- const TreePatternNode *LT = LHS->getSrcPattern();
- const TreePatternNode *RT = RHS->getSrcPattern();
+ const TreePatternNode &LT = LHS->getSrcPattern();
+ const TreePatternNode &RT = RHS->getSrcPattern();
- MVT LHSVT = LT->getNumTypes() != 0 ? LT->getSimpleType(0) : MVT::Other;
- MVT RHSVT = RT->getNumTypes() != 0 ? RT->getSimpleType(0) : MVT::Other;
+ MVT LHSVT = LT.getNumTypes() != 0 ? LT.getSimpleType(0) : MVT::Other;
+ MVT RHSVT = RT.getNumTypes() != 0 ? RT.getSimpleType(0) : MVT::Other;
if (LHSVT.isVector() != RHSVT.isVector())
return RHSVT.isVector();
@@ -98,19 +101,25 @@ struct PatternSortingPredicate {
// input over nodes that cover fewer.
int LHSSize = LHS->getPatternComplexity(CGP);
int RHSSize = RHS->getPatternComplexity(CGP);
- if (LHSSize > RHSSize) return true; // LHS -> bigger -> less cost
- if (LHSSize < RHSSize) return false;
+ if (LHSSize > RHSSize)
+ return true; // LHS -> bigger -> less cost
+ if (LHSSize < RHSSize)
+ return false;
// If the patterns have equal complexity, compare generated instruction cost
unsigned LHSCost = getResultPatternCost(LHS->getDstPattern(), CGP);
unsigned RHSCost = getResultPatternCost(RHS->getDstPattern(), CGP);
- if (LHSCost < RHSCost) return true;
- if (LHSCost > RHSCost) return false;
+ if (LHSCost < RHSCost)
+ return true;
+ if (LHSCost > RHSCost)
+ return false;
unsigned LHSPatSize = getResultPatternSize(LHS->getDstPattern(), CGP);
unsigned RHSPatSize = getResultPatternSize(RHS->getDstPattern(), CGP);
- if (LHSPatSize < RHSPatSize) return true;
- if (LHSPatSize > RHSPatSize) return false;
+ if (LHSPatSize < RHSPatSize)
+ return true;
+ if (LHSPatSize > RHSPatSize)
+ return false;
// Sort based on the UID of the pattern, to reflect source order.
// Note that this is not guaranteed to be unique, since a single source
@@ -122,11 +131,11 @@ struct PatternSortingPredicate {
};
} // End anonymous namespace
-
void DAGISelEmitter::run(raw_ostream &OS) {
Records.startTimer("Parse patterns");
emitSourceFileHeader("DAG Instruction Selector for the " +
- CGP.getTargetInfo().getName().str() + " target", OS);
+ CGP.getTargetInfo().getName().str() + " target",
+ OS);
OS << "// *** NOTE: This file is #included into the middle of the target\n"
<< "// *** instruction selector class. These functions are really "
@@ -147,15 +156,15 @@ void DAGISelEmitter::run(raw_ostream &OS) {
E = CGP.ptm_end();
I != E; ++I) {
errs() << "PATTERN: ";
- I->getSrcPattern()->dump();
+ I->getSrcPattern().dump();
errs() << "\nRESULT: ";
- I->getDstPattern()->dump();
+ I->getDstPattern().dump();
errs() << "\n";
});
// Add all the patterns to a temporary list so we can sort them.
Records.startTimer("Sort patterns");
- std::vector<const PatternToMatch*> Patterns;
+ std::vector<const PatternToMatch *> Patterns;
for (const PatternToMatch &PTM : CGP.ptms())
Patterns.push_back(&PTM);
@@ -167,7 +176,7 @@ void DAGISelEmitter::run(raw_ostream &OS) {
Records.startTimer("Convert to matchers");
SmallVector<Matcher *, 0> PatternMatchers;
for (const PatternToMatch *PTM : Patterns) {
- for (unsigned Variant = 0; ; ++Variant) {
+ for (unsigned Variant = 0;; ++Variant) {
if (Matcher *M = ConvertPatternToMatcher(*PTM, Variant, CGP))
PatternMatchers.push_back(M);
else
@@ -181,7 +190,7 @@ void DAGISelEmitter::run(raw_ostream &OS) {
Records.startTimer("Optimize matchers");
OptimizeMatcher(TheMatcher, CGP);
- //Matcher->dump();
+ // Matcher->dump();
Records.startTimer("Emit matcher table");
EmitMatcherTable(TheMatcher.get(), CGP, OS);
diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp
index 1a5c728fafd9..3298965ab41d 100644
--- a/llvm/utils/TableGen/DAGISelMatcher.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcher.cpp
@@ -15,11 +15,9 @@
#include "llvm/TableGen/Record.h"
using namespace llvm;
-void Matcher::anchor() { }
+void Matcher::anchor() {}
-void Matcher::dump() const {
- print(errs(), 0);
-}
+void Matcher::dump() const { print(errs(), 0); }
void Matcher::print(raw_ostream &OS, unsigned indent) const {
printImpl(OS, indent);
@@ -27,9 +25,7 @@ void Matcher::print(raw_ostream &OS, unsigned indent) const {
return Next->print(OS, indent);
}
-void Matcher::printOne(raw_ostream &OS) const {
- printImpl(OS, 0);
-}
+void Matcher::printOne(raw_ostream &OS) const { printImpl(OS, 0); }
/// unlinkNode - Unlink the specified node from this chain. If Other == this,
/// we unlink the next pointer and return it. Otherwise we unlink Other from
@@ -43,7 +39,8 @@ Matcher *Matcher::unlinkNode(Matcher *Other) {
for (; Cur && Cur->getNext() != Other; Cur = Cur->getNext())
/*empty*/;
- if (!Cur) return nullptr;
+ if (!Cur)
+ return nullptr;
Cur->takeNext();
Cur->setNext(Other->takeNext());
return this;
@@ -55,7 +52,8 @@ Matcher *Matcher::unlinkNode(Matcher *Other) {
bool Matcher::canMoveBefore(const Matcher *Other) const {
for (;; Other = Other->getNext()) {
assert(Other && "Other didn't come before 'this'?");
- if (this == Other) return true;
+ if (this == Other)
+ return true;
// We have to be able to move this node across the Other node.
if (!canMoveBeforeNode(Other))
@@ -78,7 +76,6 @@ bool Matcher::canMoveBeforeNode(const Matcher *Other) const {
return false;
}
-
ScopeMatcher::~ScopeMatcher() {
for (Matcher *C : Children)
delete C;
@@ -96,8 +93,8 @@ SwitchTypeMatcher::~SwitchTypeMatcher() {
CheckPredicateMatcher::CheckPredicateMatcher(
const TreePredicateFn &pred, const SmallVectorImpl<unsigned> &Ops)
- : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()),
- Operands(Ops.begin(), Ops.end()) {}
+ : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()),
+ Operands(Ops.begin(), Ops.end()) {}
TreePredicateFn CheckPredicateMatcher::getPredicate() const {
return TreePredicateFn(Pred);
@@ -112,16 +109,15 @@ unsigned CheckPredicateMatcher::getOperandNo(unsigned i) const {
return Operands[i];
}
-
// printImpl methods.
void ScopeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "Scope\n";
for (const Matcher *C : Children) {
if (!C)
- OS.indent(indent+1) << "NULL POINTER\n";
+ OS.indent(indent + 1) << "NULL POINTER\n";
else
- C->print(OS, indent+2);
+ C->print(OS, indent + 2);
}
}
@@ -137,7 +133,8 @@ void RecordMemRefMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "RecordMemRef\n";
}
-void CaptureGlueInputMatcher::printImpl(raw_ostream &OS, unsigned indent) const{
+void CaptureGlueInputMatcher::printImpl(raw_ostream &OS,
+ unsigned indent) const {
OS.indent(indent) << "CaptureGlueInput\n";
}
@@ -161,8 +158,8 @@ void CheckChildSameMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "CheckChild" << ChildNo << "Same\n";
}
-void CheckPatternPredicateMatcher::
-printImpl(raw_ostream &OS, unsigned indent) const {
+void CheckPatternPredicateMatcher::printImpl(raw_ostream &OS,
+ unsigned indent) const {
OS.indent(indent) << "CheckPatternPredicate " << Predicate << '\n';
}
@@ -178,32 +175,30 @@ void SwitchOpcodeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "SwitchOpcode: {\n";
for (const auto &C : Cases) {
OS.indent(indent) << "case " << C.first->getEnumName() << ":\n";
- C.second->print(OS, indent+2);
+ C.second->print(OS, indent + 2);
}
OS.indent(indent) << "}\n";
}
-
void CheckTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
- OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo="
- << ResNo << '\n';
+ OS.indent(indent) << "CheckType " << getEnumName(Type) << ", ResNo=" << ResNo
+ << '\n';
}
void SwitchTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "SwitchType: {\n";
for (const auto &C : Cases) {
OS.indent(indent) << "case " << getEnumName(C.first) << ":\n";
- C.second->print(OS, indent+2);
+ C.second->print(OS, indent + 2);
}
OS.indent(indent) << "}\n";
}
void CheckChildTypeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
- OS.indent(indent) << "CheckChildType " << ChildNo << " "
- << getEnumName(Type) << '\n';
+ OS.indent(indent) << "CheckChildType " << ChildNo << " " << getEnumName(Type)
+ << '\n';
}
-
void CheckIntegerMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "CheckInteger " << Value << '\n';
}
@@ -258,8 +253,8 @@ void EmitIntegerMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
<< '\n';
}
-void EmitStringIntegerMatcher::
-printImpl(raw_ostream &OS, unsigned indent) const {
+void EmitStringIntegerMatcher::printImpl(raw_ostream &OS,
+ unsigned indent) const {
OS.indent(indent) << "EmitStringInteger " << Val << " VT=" << getEnumName(VT)
<< '\n';
}
@@ -273,13 +268,13 @@ void EmitRegisterMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS << " VT=" << getEnumName(VT) << '\n';
}
-void EmitConvertToTargetMatcher::
-printImpl(raw_ostream &OS, unsigned indent) const {
+void EmitConvertToTargetMatcher::printImpl(raw_ostream &OS,
+ unsigned indent) const {
OS.indent(indent) << "EmitConvertToTarget " << Slot << '\n';
}
-void EmitMergeInputChainsMatcher::
-printImpl(raw_ostream &OS, unsigned indent) const {
+void EmitMergeInputChainsMatcher::printImpl(raw_ostream &OS,
+ unsigned indent) const {
OS.indent(indent) << "EmitMergeInputChains <todo: args>\n";
}
@@ -289,10 +284,9 @@ void EmitCopyToRegMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
void EmitNodeXFormMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "EmitNodeXForm " << NodeXForm->getName()
- << " Slot=" << Slot << '\n';
+ << " Slot=" << Slot << '\n';
}
-
void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent);
OS << (isa<MorphNodeToMatcher>(this) ? "MorphNodeTo: " : "EmitNode: ")
@@ -308,15 +302,15 @@ void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, unsigned indent) const {
void CompleteMatchMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
OS.indent(indent) << "CompleteMatch <todo args>\n";
- OS.indent(indent) << "Src = " << *Pattern.getSrcPattern() << "\n";
- OS.indent(indent) << "Dst = " << *Pattern.getDstPattern() << "\n";
+ OS.indent(indent) << "Src = " << Pattern.getSrcPattern() << "\n";
+ OS.indent(indent) << "Dst = " << Pattern.getDstPattern() << "\n";
}
bool CheckOpcodeMatcher::isEqualImpl(const Matcher *M) const {
// Note: pointer equality isn't enough here, we have to check the enum names
// to ensure that the nodes are for the same opcode.
return cast<CheckOpcodeMatcher>(M)->Opcode.getEnumName() ==
- Opcode.getEnumName();
+ Opcode.getEnumName();
}
bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const {
@@ -327,9 +321,9 @@ bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const {
M->NumFixedArityOperands == NumFixedArityOperands;
}
-void EmitNodeMatcher::anchor() { }
+void EmitNodeMatcher::anchor() {}
-void MorphNodeToMatcher::anchor() { }
+void MorphNodeToMatcher::anchor() {}
// isContradictoryImpl Implementations.
@@ -337,7 +331,8 @@ static bool TypesAreContradictory(MVT::SimpleValueType T1,
MVT::SimpleValueType T2) {
// If the two types are the same, then they are the same, so they don't
// contradict.
- if (T1 == T2) return false;
+ if (T1 == T2)
+ return false;
// If either type is about iPtr, then they don't conflict unless the other
// one is not a scalar integer type.
@@ -400,7 +395,8 @@ bool CheckIntegerMatcher::isContradictoryImpl(const Matcher *M) const {
}
bool CheckChildIntegerMatcher::isContradictoryImpl(const Matcher *M) const {
- if (const CheckChildIntegerMatcher *CCIM = dyn_cast<CheckChildIntegerMatcher>(M)) {
+ if (const CheckChildIntegerMatcher *CCIM =
+ dyn_cast<CheckChildIntegerMatcher>(M)) {
// If the two checks are about different nodes, we don't know if they
// conflict!
if (CCIM->getChildNo() != getChildNo())
diff --git a/llvm/utils/TableGen/DAGISelMatcher.h b/llvm/utils/TableGen/DAGISelMatcher.h
index 6615a15e467e..d4fe513e2e96 100644
--- a/llvm/utils/TableGen/DAGISelMatcher.h
+++ b/llvm/utils/TableGen/DAGISelMatcher.h
@@ -21,185 +21,186 @@
#include <utility>
namespace llvm {
- class CodeGenRegister;
- class CodeGenDAGPatterns;
- class CodeGenInstruction;
- class Matcher;
- class PatternToMatch;
- class raw_ostream;
- class ComplexPattern;
- class Record;
- class SDNodeInfo;
- class TreePredicateFn;
- class TreePattern;
-
- Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern,
- unsigned Variant,
- const CodeGenDAGPatterns &CGP);
- void OptimizeMatcher(std::unique_ptr<Matcher> &Matcher,
- const CodeGenDAGPatterns &CGP);
- void EmitMatcherTable(Matcher *Matcher, const CodeGenDAGPatterns &CGP,
- raw_ostream &OS);
-
- /// Matcher - Base class for all the DAG ISel Matcher representation
- /// nodes.
- class Matcher {
- // The next matcher node that is executed after this one. Null if this is
- // the last stage of a match.
- std::unique_ptr<Matcher> Next;
- size_t Size = 0; // Size in bytes of matcher and all its children (if any).
- virtual void anchor();
-
- public:
- enum KindTy {
- // Matcher state manipulation.
- Scope, // Push a checking scope.
- RecordNode, // Record the current node.
- RecordChild, // Record a child of the current node.
- RecordMemRef, // Record the memref in the current node.
- CaptureGlueInput, // If the current node has an input glue, save it.
- MoveChild, // Move current node to specified child.
- MoveSibling, // Move current node to specified sibling.
- MoveParent, // Move current node to parent.
-
- // Predicate checking.
- CheckSame, // Fail if not same as prev match.
- CheckChildSame, // Fail if child not same as prev match.
- CheckPatternPredicate,
- CheckPredicate, // Fail if node predicate fails.
- CheckOpcode, // Fail if not opcode.
- SwitchOpcode, // Dispatch based on opcode.
- CheckType, // Fail if not correct type.
- SwitchType, // Dispatch based on type.
- CheckChildType, // Fail if child has wrong type.
- CheckInteger, // Fail if wrong val.
- CheckChildInteger, // Fail if child is wrong val.
- CheckCondCode, // Fail if not condcode.
- CheckChild2CondCode, // Fail if child is wrong condcode.
- CheckValueType,
- CheckComplexPat,
- CheckAndImm,
- CheckOrImm,
- CheckImmAllOnesV,
- CheckImmAllZerosV,
- CheckFoldableChainNode,
-
- // Node creation/emisssion.
- EmitInteger, // Create a TargetConstant
- EmitStringInteger, // Create a TargetConstant from a string.
- EmitRegister, // Create a register.
- EmitConvertToTarget, // Convert a imm/fpimm to target imm/fpimm
- EmitMergeInputChains, // Merge together a chains for an input.
- EmitCopyToReg, // Emit a copytoreg into a physreg.
- EmitNode, // Create a DAG node
- EmitNodeXForm, // Run a SDNodeXForm
- CompleteMatch, // Finish a match and update the results.
- MorphNodeTo, // Build a node, finish a match and update results.
-
- // Highest enum value; watch out when adding more.
- HighestKind = MorphNodeTo
- };
- const KindTy Kind;
-
- protected:
- Matcher(KindTy K) : Kind(K) {}
-
- public:
- virtual ~Matcher() {}
-
- unsigned getSize() const { return Size; }
- void setSize(unsigned sz) { Size = sz; }
- KindTy getKind() const { return Kind; }
-
- Matcher *getNext() { return Next.get(); }
- const Matcher *getNext() const { return Next.get(); }
- void setNext(Matcher *C) { Next.reset(C); }
- Matcher *takeNext() { return Next.release(); }
-
- std::unique_ptr<Matcher> &getNextPtr() { return Next; }
-
- bool isEqual(const Matcher *M) const {
- if (getKind() != M->getKind())
- return false;
- return isEqualImpl(M);
- }
+class CodeGenRegister;
+class CodeGenDAGPatterns;
+class CodeGenInstruction;
+class Matcher;
+class PatternToMatch;
+class raw_ostream;
+class ComplexPattern;
+class Record;
+class SDNodeInfo;
+class TreePredicateFn;
+class TreePattern;
+
+Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern,
+ unsigned Variant,
+ const CodeGenDAGPatterns &CGP);
+void OptimizeMatcher(std::unique_ptr<Matcher> &Matcher,
+ const CodeGenDAGPatterns &CGP);
+void EmitMatcherTable(Matcher *Matcher, const CodeGenDAGPatterns &CGP,
+ raw_ostream &OS);
+
+/// Matcher - Base class for all the DAG ISel Matcher representation
+/// nodes.
+class Matcher {
+ // The next matcher node that is executed after this one. Null if this is
+ // the last stage of a match.
+ std::unique_ptr<Matcher> Next;
+ size_t Size = 0; // Size in bytes of matcher and all its children (if any).
+ virtual void anchor();
- /// isSimplePredicateNode - Return true if this is a simple predicate that
- /// operates on the node or its children without potential side effects or a
- /// change of the current node.
- bool isSimplePredicateNode() const {
- switch (getKind()) {
- default:
- return false;
- case CheckSame:
- case CheckChildSame:
- case CheckPatternPredicate:
- case CheckPredicate:
- case CheckOpcode:
- case CheckType:
- case CheckChildType:
- case CheckInteger:
- case CheckChildInteger:
- case CheckCondCode:
- case CheckChild2CondCode:
- case CheckValueType:
- case CheckAndImm:
- case CheckOrImm:
- case CheckImmAllOnesV:
- case CheckImmAllZerosV:
- case CheckFoldableChainNode:
- return true;
- }
- }
+public:
+ enum KindTy {
+ // Matcher state manipulation.
+ Scope, // Push a checking scope.
+ RecordNode, // Record the current node.
+ RecordChild, // Record a child of the current node.
+ RecordMemRef, // Record the memref in the current node.
+ CaptureGlueInput, // If the current node has an input glue, save it.
+ MoveChild, // Move current node to specified child.
+ MoveSibling, // Move current node to specified sibling.
+ MoveParent, // Move current node to parent.
+
+ // Predicate checking.
+ CheckSame, // Fail if not same as prev match.
+ CheckChildSame, // Fail if child not same as prev match.
+ CheckPatternPredicate,
+ CheckPredicate, // Fail if node predicate fails.
+ CheckOpcode, // Fail if not opcode.
+ SwitchOpcode, // Dispatch based on opcode.
+ CheckType, // Fail if not correct type.
+ SwitchType, // Dispatch based on type.
+ CheckChildType, // Fail if child has wrong type.
+ CheckInteger, // Fail if wrong val.
+ CheckChildInteger, // Fail if child is wrong val.
+ CheckCondCode, // Fail if not condcode.
+ CheckChild2CondCode, // Fail if child is wrong condcode.
+ CheckValueType,
+ CheckComplexPat,
+ CheckAndImm,
+ CheckOrImm,
+ CheckImmAllOnesV,
+ CheckImmAllZerosV,
+ CheckFoldableChainNode,
+
+ // Node creation/emission.
+ EmitInteger, // Create a TargetConstant
+ EmitStringInteger, // Create a TargetConstant from a string.
+ EmitRegister, // Create a register.
+ EmitConvertToTarget, // Convert an imm/fpimm to target imm/fpimm
+ EmitMergeInputChains, // Merge together chains for an input.
+ EmitCopyToReg, // Emit a copytoreg into a physreg.
+ EmitNode, // Create a DAG node
+ EmitNodeXForm, // Run a SDNodeXForm
+ CompleteMatch, // Finish a match and update the results.
+ MorphNodeTo, // Build a node, finish a match and update results.
+
+ // Highest enum value; watch out when adding more.
+ HighestKind = MorphNodeTo
+ };
+ const KindTy Kind;
- /// isSimplePredicateOrRecordNode - Return true if this is a record node or
- /// a simple predicate.
- bool isSimplePredicateOrRecordNode() const {
- return isSimplePredicateNode() || getKind() == RecordNode ||
- getKind() == RecordChild;
- }
+protected:
+ Matcher(KindTy K) : Kind(K) {}
- /// unlinkNode - Unlink the specified node from this chain. If Other ==
- /// this, we unlink the next pointer and return it. Otherwise we unlink
- /// Other from the list and return this.
- Matcher *unlinkNode(Matcher *Other);
-
- /// canMoveBefore - Return true if this matcher is the same as Other, or if
- /// we can move this matcher past all of the nodes in-between Other and this
- /// node. Other must be equal to or before this.
- bool canMoveBefore(const Matcher *Other) const;
-
- /// canMoveBeforeNode - Return true if it is safe to move the current
- /// matcher across the specified one.
- bool canMoveBeforeNode(const Matcher *Other) const;
-
- /// isContradictory - Return true of these two matchers could never match on
- /// the same node.
- bool isContradictory(const Matcher *Other) const {
- // Since this predicate is reflexive, we canonicalize the ordering so that
- // we always match a node against nodes with kinds that are greater or
- // equal to them. For example, we'll pass in a CheckType node as an
- // argument to the CheckOpcode method, not the other way around.
- if (getKind() < Other->getKind())
- return isContradictoryImpl(Other);
- return Other->isContradictoryImpl(this);
+public:
+ virtual ~Matcher() {}
+
+ unsigned getSize() const { return Size; }
+ void setSize(unsigned sz) { Size = sz; }
+ KindTy getKind() const { return Kind; }
+
+ Matcher *getNext() { return Next.get(); }
+ const Matcher *getNext() const { return Next.get(); }
+ void setNext(Matcher *C) { Next.reset(C); }
+ Matcher *takeNext() { return Next.release(); }
+
+ std::unique_ptr<Matcher> &getNextPtr() { return Next; }
+
+ bool isEqual(const Matcher *M) const {
+ if (getKind() != M->getKind())
+ return false;
+ return isEqualImpl(M);
+ }
+
+ /// isSimplePredicateNode - Return true if this is a simple predicate that
+ /// operates on the node or its children without potential side effects or a
+ /// change of the current node.
+ bool isSimplePredicateNode() const {
+ switch (getKind()) {
+ default:
+ return false;
+ case CheckSame:
+ case CheckChildSame:
+ case CheckPatternPredicate:
+ case CheckPredicate:
+ case CheckOpcode:
+ case CheckType:
+ case CheckChildType:
+ case CheckInteger:
+ case CheckChildInteger:
+ case CheckCondCode:
+ case CheckChild2CondCode:
+ case CheckValueType:
+ case CheckAndImm:
+ case CheckOrImm:
+ case CheckImmAllOnesV:
+ case CheckImmAllZerosV:
+ case CheckFoldableChainNode:
+ return true;
}
+ }
- void print(raw_ostream &OS, unsigned indent = 0) const;
- void printOne(raw_ostream &OS) const;
- void dump() const;
+ /// isSimplePredicateOrRecordNode - Return true if this is a record node or
+ /// a simple predicate.
+ bool isSimplePredicateOrRecordNode() const {
+ return isSimplePredicateNode() || getKind() == RecordNode ||
+ getKind() == RecordChild;
+ }
- protected:
- virtual void printImpl(raw_ostream &OS, unsigned indent) const = 0;
- virtual bool isEqualImpl(const Matcher *M) const = 0;
- virtual bool isContradictoryImpl(const Matcher *M) const { return false; }
- };
+ /// unlinkNode - Unlink the specified node from this chain. If Other ==
+ /// this, we unlink the next pointer and return it. Otherwise we unlink
+ /// Other from the list and return this.
+ Matcher *unlinkNode(Matcher *Other);
+
+ /// canMoveBefore - Return true if this matcher is the same as Other, or if
+ /// we can move this matcher past all of the nodes in-between Other and this
+ /// node. Other must be equal to or before this.
+ bool canMoveBefore(const Matcher *Other) const;
+
+ /// canMoveBeforeNode - Return true if it is safe to move the current
+ /// matcher across the specified one.
+ bool canMoveBeforeNode(const Matcher *Other) const;
+
+ /// isContradictory - Return true if these two matchers could never match on
+ /// the same node.
+ bool isContradictory(const Matcher *Other) const {
+ // Since this predicate is reflexive, we canonicalize the ordering so that
+ // we always match a node against nodes with kinds that are greater or
+ // equal to them. For example, we'll pass in a CheckType node as an
+ // argument to the CheckOpcode method, not the other way around.
+ if (getKind() < Other->getKind())
+ return isContradictoryImpl(Other);
+ return Other->isContradictoryImpl(this);
+ }
+
+ void print(raw_ostream &OS, unsigned indent = 0) const;
+ void printOne(raw_ostream &OS) const;
+ void dump() const;
+
+protected:
+ virtual void printImpl(raw_ostream &OS, unsigned indent) const = 0;
+ virtual bool isEqualImpl(const Matcher *M) const = 0;
+ virtual bool isContradictoryImpl(const Matcher *M) const { return false; }
+};
/// ScopeMatcher - This attempts to match each of its children to find the first
/// one that successfully matches. If one child fails, it tries the next child.
/// If none of the children match then this check fails. It never has a 'next'.
class ScopeMatcher : public Matcher {
- SmallVector<Matcher*, 4> Children;
+ SmallVector<Matcher *, 4> Children;
+
public:
ScopeMatcher(SmallVectorImpl<Matcher *> &&children)
: Matcher(Scope), Children(std::move(children)) {}
@@ -230,9 +231,7 @@ public:
Children.resize(NC);
}
- static bool classof(const Matcher *N) {
- return N->getKind() == Scope;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == Scope; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -248,16 +247,15 @@ class RecordMatcher : public Matcher {
/// ResultNo - The slot number in the RecordedNodes vector that this will be,
/// just printed as a comment.
unsigned ResultNo;
+
public:
RecordMatcher(const std::string &whatfor, unsigned resultNo)
- : Matcher(RecordNode), WhatFor(whatfor), ResultNo(resultNo) {}
+ : Matcher(RecordNode), WhatFor(whatfor), ResultNo(resultNo) {}
const std::string &getWhatFor() const { return WhatFor; }
unsigned getResultNo() const { return ResultNo; }
- static bool classof(const Matcher *N) {
- return N->getKind() == RecordNode;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == RecordNode; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -277,19 +275,18 @@ class RecordChildMatcher : public Matcher {
/// ResultNo - The slot number in the RecordedNodes vector that this will be,
/// just printed as a comment.
unsigned ResultNo;
+
public:
RecordChildMatcher(unsigned childno, const std::string &whatfor,
unsigned resultNo)
- : Matcher(RecordChild), ChildNo(childno), WhatFor(whatfor),
- ResultNo(resultNo) {}
+ : Matcher(RecordChild), ChildNo(childno), WhatFor(whatfor),
+ ResultNo(resultNo) {}
unsigned getChildNo() const { return ChildNo; }
const std::string &getWhatFor() const { return WhatFor; }
unsigned getResultNo() const { return ResultNo; }
- static bool classof(const Matcher *N) {
- return N->getKind() == RecordChild;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == RecordChild; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -303,16 +300,13 @@ class RecordMemRefMatcher : public Matcher {
public:
RecordMemRefMatcher() : Matcher(RecordMemRef) {}
- static bool classof(const Matcher *N) {
- return N->getKind() == RecordMemRef;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == RecordMemRef; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
bool isEqualImpl(const Matcher *M) const override { return true; }
};
-
/// CaptureGlueInputMatcher - If the current record has a glue input, record
/// it so that it is used as an input to the generated code.
class CaptureGlueInputMatcher : public Matcher {
@@ -332,14 +326,13 @@ private:
/// specified child node.
class MoveChildMatcher : public Matcher {
unsigned ChildNo;
+
public:
MoveChildMatcher(unsigned childNo) : Matcher(MoveChild), ChildNo(childNo) {}
unsigned getChildNo() const { return ChildNo; }
- static bool classof(const Matcher *N) {
- return N->getKind() == MoveChild;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == MoveChild; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -374,9 +367,7 @@ class MoveParentMatcher : public Matcher {
public:
MoveParentMatcher() : Matcher(MoveParent) {}
- static bool classof(const Matcher *N) {
- return N->getKind() == MoveParent;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == MoveParent; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -388,15 +379,14 @@ private:
/// when patterns have the same name in them, like '(mul GPR:$in, GPR:$in)'.
class CheckSameMatcher : public Matcher {
unsigned MatchNumber;
+
public:
CheckSameMatcher(unsigned matchnumber)
- : Matcher(CheckSame), MatchNumber(matchnumber) {}
+ : Matcher(CheckSame), MatchNumber(matchnumber) {}
unsigned getMatchNumber() const { return MatchNumber; }
- static bool classof(const Matcher *N) {
- return N->getKind() == CheckSame;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == CheckSame; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -411,9 +401,10 @@ private:
class CheckChildSameMatcher : public Matcher {
unsigned ChildNo;
unsigned MatchNumber;
+
public:
CheckChildSameMatcher(unsigned childno, unsigned matchnumber)
- : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {}
+ : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {}
unsigned getChildNo() const { return ChildNo; }
unsigned getMatchNumber() const { return MatchNumber; }
@@ -435,9 +426,10 @@ private:
/// not take a node as input. This is used for subtarget feature checks etc.
class CheckPatternPredicateMatcher : public Matcher {
std::string Predicate;
+
public:
CheckPatternPredicateMatcher(StringRef predicate)
- : Matcher(CheckPatternPredicate), Predicate(predicate) {}
+ : Matcher(CheckPatternPredicate), Predicate(predicate) {}
StringRef getPredicate() const { return Predicate; }
@@ -457,6 +449,7 @@ private:
class CheckPredicateMatcher : public Matcher {
TreePattern *Pred;
const SmallVector<unsigned, 4> Operands;
+
public:
CheckPredicateMatcher(const TreePredicateFn &pred,
const SmallVectorImpl<unsigned> &Operands);
@@ -476,20 +469,18 @@ private:
}
};
-
/// CheckOpcodeMatcher - This checks to see if the current node has the
/// specified opcode, if not it fails to match.
class CheckOpcodeMatcher : public Matcher {
const SDNodeInfo &Opcode;
+
public:
CheckOpcodeMatcher(const SDNodeInfo &opcode)
- : Matcher(CheckOpcode), Opcode(opcode) {}
+ : Matcher(CheckOpcode), Opcode(opcode) {}
const SDNodeInfo &getOpcode() const { return Opcode; }
- static bool classof(const Matcher *N) {
- return N->getKind() == CheckOpcode;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == CheckOpcode; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -502,16 +493,15 @@ private:
/// then the match fails. This is semantically equivalent to a Scope node where
/// every child does a CheckOpcode, but is much faster.
class SwitchOpcodeMatcher : public Matcher {
- SmallVector<std::pair<const SDNodeInfo*, Matcher*>, 8> Cases;
+ SmallVector<std::pair<const SDNodeInfo *, Matcher *>, 8> Cases;
+
public:
SwitchOpcodeMatcher(
SmallVectorImpl<std::pair<const SDNodeInfo *, Matcher *>> &&cases)
: Matcher(SwitchOpcode), Cases(std::move(cases)) {}
~SwitchOpcodeMatcher() override;
- static bool classof(const Matcher *N) {
- return N->getKind() == SwitchOpcode;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == SwitchOpcode; }
unsigned getNumCases() const { return Cases.size(); }
@@ -529,16 +519,15 @@ private:
class CheckTypeMatcher : public Matcher {
MVT::SimpleValueType Type;
unsigned ResNo;
+
public:
CheckTypeMatcher(MVT::SimpleValueType type, unsigned resno)
- : Matcher(CheckType), Type(type), ResNo(resno) {}
+ : Matcher(CheckType), Type(type), ResNo(resno) {}
MVT::SimpleValueType getType() const { return Type; }
unsigned getResNo() const { return ResNo; }
- static bool classof(const Matcher *N) {
- return N->getKind() == CheckType;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == CheckType; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -553,16 +542,15 @@ private:
/// then the match fails. This is semantically equivalent to a Scope node where
/// every child does a CheckType, but is much faster.
class SwitchTypeMatcher : public Matcher {
- SmallVector<std::pair<MVT::SimpleValueType, Matcher*>, 8> Cases;
+ SmallVector<std::pair<MVT::SimpleValueType, Matcher *>, 8> Cases;
+
public:
SwitchTypeMatcher(
SmallVectorImpl<std::pair<MVT::SimpleValueType, Matcher *>> &&cases)
: Matcher(SwitchType), Cases(std::move(cases)) {}
~SwitchTypeMatcher() override;
- static bool classof(const Matcher *N) {
- return N->getKind() == SwitchType;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == SwitchType; }
unsigned getNumCases() const { return Cases.size(); }
@@ -575,15 +563,15 @@ private:
bool isEqualImpl(const Matcher *M) const override { return false; }
};
-
/// CheckChildTypeMatcher - This checks to see if a child node has the
/// specified type, if not it fails to match.
class CheckChildTypeMatcher : public Matcher {
unsigned ChildNo;
MVT::SimpleValueType Type;
+
public:
CheckChildTypeMatcher(unsigned childno, MVT::SimpleValueType type)
- : Matcher(CheckChildType), ChildNo(childno), Type(type) {}
+ : Matcher(CheckChildType), ChildNo(childno), Type(type) {}
unsigned getChildNo() const { return ChildNo; }
MVT::SimpleValueType getType() const { return Type; }
@@ -601,20 +589,17 @@ private:
bool isContradictoryImpl(const Matcher *M) const override;
};
-
/// CheckIntegerMatcher - This checks to see if the current node is a
/// ConstantSDNode with the specified integer value, if not it fails to match.
class CheckIntegerMatcher : public Matcher {
int64_t Value;
+
public:
- CheckIntegerMatcher(int64_t value)
- : Matcher(CheckInteger), Value(value) {}
+ CheckIntegerMatcher(int64_t value) : Matcher(CheckInteger), Value(value) {}
int64_t getValue() const { return Value; }
- static bool classof(const Matcher *N) {
- return N->getKind() == CheckInteger;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == CheckInteger; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -629,9 +614,10 @@ private:
class CheckChildIntegerMatcher : public Matcher {
unsigned ChildNo;
int64_t Value;
+
public:
CheckChildIntegerMatcher(unsigned childno, int64_t value)
- : Matcher(CheckChildInteger), ChildNo(childno), Value(value) {}
+ : Matcher(CheckChildInteger), ChildNo(childno), Value(value) {}
unsigned getChildNo() const { return ChildNo; }
int64_t getValue() const { return Value; }
@@ -653,9 +639,10 @@ private:
/// CondCodeSDNode with the specified condition, if not it fails to match.
class CheckCondCodeMatcher : public Matcher {
StringRef CondCodeName;
+
public:
CheckCondCodeMatcher(StringRef condcodename)
- : Matcher(CheckCondCode), CondCodeName(condcodename) {}
+ : Matcher(CheckCondCode), CondCodeName(condcodename) {}
StringRef getCondCodeName() const { return CondCodeName; }
@@ -675,9 +662,10 @@ private:
/// CondCodeSDNode with the specified condition, if not it fails to match.
class CheckChild2CondCodeMatcher : public Matcher {
StringRef CondCodeName;
+
public:
CheckChild2CondCodeMatcher(StringRef condcodename)
- : Matcher(CheckChild2CondCode), CondCodeName(condcodename) {}
+ : Matcher(CheckChild2CondCode), CondCodeName(condcodename) {}
StringRef getCondCodeName() const { return CondCodeName; }
@@ -697,9 +685,10 @@ private:
/// VTSDNode with the specified type, if not it fails to match.
class CheckValueTypeMatcher : public Matcher {
StringRef TypeName;
+
public:
CheckValueTypeMatcher(StringRef type_name)
- : Matcher(CheckValueType), TypeName(type_name) {}
+ : Matcher(CheckValueType), TypeName(type_name) {}
StringRef getTypeName() const { return TypeName; }
@@ -715,8 +704,6 @@ private:
bool isContradictoryImpl(const Matcher *M) const override;
};
-
-
/// CheckComplexPatMatcher - This node runs the specified ComplexPattern on
/// the current node.
class CheckComplexPatMatcher : public Matcher {
@@ -732,11 +719,12 @@ class CheckComplexPatMatcher : public Matcher {
/// FirstResult - This is the first slot in the RecordedNodes list that the
/// result of the match populates.
unsigned FirstResult;
+
public:
CheckComplexPatMatcher(const ComplexPattern &pattern, unsigned matchnumber,
const std::string &name, unsigned firstresult)
- : Matcher(CheckComplexPat), Pattern(pattern), MatchNumber(matchnumber),
- Name(name), FirstResult(firstresult) {}
+ : Matcher(CheckComplexPat), Pattern(pattern), MatchNumber(matchnumber),
+ Name(name), FirstResult(firstresult) {}
const ComplexPattern &getPattern() const { return Pattern; }
unsigned getMatchNumber() const { return MatchNumber; }
@@ -760,15 +748,13 @@ private:
/// with something equivalent to the specified immediate.
class CheckAndImmMatcher : public Matcher {
int64_t Value;
+
public:
- CheckAndImmMatcher(int64_t value)
- : Matcher(CheckAndImm), Value(value) {}
+ CheckAndImmMatcher(int64_t value) : Matcher(CheckAndImm), Value(value) {}
int64_t getValue() const { return Value; }
- static bool classof(const Matcher *N) {
- return N->getKind() == CheckAndImm;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == CheckAndImm; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -781,15 +767,13 @@ private:
/// with something equivalent to the specified immediate.
class CheckOrImmMatcher : public Matcher {
int64_t Value;
+
public:
- CheckOrImmMatcher(int64_t value)
- : Matcher(CheckOrImm), Value(value) {}
+ CheckOrImmMatcher(int64_t value) : Matcher(CheckOrImm), Value(value) {}
int64_t getValue() const { return Value; }
- static bool classof(const Matcher *N) {
- return N->getKind() == CheckOrImm;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == CheckOrImm; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -834,8 +818,7 @@ private:
/// (which defines a chain operand) is safe to fold into a larger pattern.
class CheckFoldableChainNodeMatcher : public Matcher {
public:
- CheckFoldableChainNodeMatcher()
- : Matcher(CheckFoldableChainNode) {}
+ CheckFoldableChainNodeMatcher() : Matcher(CheckFoldableChainNode) {}
static bool classof(const Matcher *N) {
return N->getKind() == CheckFoldableChainNode;
@@ -850,16 +833,15 @@ private:
class EmitIntegerMatcher : public Matcher {
int64_t Val;
MVT::SimpleValueType VT;
+
public:
EmitIntegerMatcher(int64_t val, MVT::SimpleValueType vt)
- : Matcher(EmitInteger), Val(val), VT(vt) {}
+ : Matcher(EmitInteger), Val(val), VT(vt) {}
int64_t getValue() const { return Val; }
MVT::SimpleValueType getVT() const { return VT; }
- static bool classof(const Matcher *N) {
- return N->getKind() == EmitInteger;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == EmitInteger; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -874,9 +856,10 @@ private:
class EmitStringIntegerMatcher : public Matcher {
std::string Val;
MVT::SimpleValueType VT;
+
public:
EmitStringIntegerMatcher(const std::string &val, MVT::SimpleValueType vt)
- : Matcher(EmitStringInteger), Val(val), VT(vt) {}
+ : Matcher(EmitStringInteger), Val(val), VT(vt) {}
const std::string &getValue() const { return Val; }
MVT::SimpleValueType getVT() const { return VT; }
@@ -899,16 +882,15 @@ class EmitRegisterMatcher : public Matcher {
/// this is a reference to zero_reg.
const CodeGenRegister *Reg;
MVT::SimpleValueType VT;
+
public:
EmitRegisterMatcher(const CodeGenRegister *reg, MVT::SimpleValueType vt)
- : Matcher(EmitRegister), Reg(reg), VT(vt) {}
+ : Matcher(EmitRegister), Reg(reg), VT(vt) {}
const CodeGenRegister *getReg() const { return Reg; }
MVT::SimpleValueType getVT() const { return VT; }
- static bool classof(const Matcher *N) {
- return N->getKind() == EmitRegister;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == EmitRegister; }
private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
@@ -923,9 +905,10 @@ private:
/// ISD::TargetConstant, likewise for ConstantFP.
class EmitConvertToTargetMatcher : public Matcher {
unsigned Slot;
+
public:
EmitConvertToTargetMatcher(unsigned slot)
- : Matcher(EmitConvertToTarget), Slot(slot) {}
+ : Matcher(EmitConvertToTarget), Slot(slot) {}
unsigned getSlot() const { return Slot; }
@@ -946,9 +929,10 @@ private:
/// chains of these nodes if they are not themselves a node in the pattern.
class EmitMergeInputChainsMatcher : public Matcher {
SmallVector<unsigned, 3> ChainNodes;
+
public:
EmitMergeInputChainsMatcher(ArrayRef<unsigned> nodes)
- : Matcher(EmitMergeInputChains), ChainNodes(nodes.begin(), nodes.end()) {}
+ : Matcher(EmitMergeInputChains), ChainNodes(nodes.begin(), nodes.end()) {}
unsigned getNumNodes() const { return ChainNodes.size(); }
@@ -976,9 +960,8 @@ class EmitCopyToRegMatcher : public Matcher {
const CodeGenRegister *DestPhysReg;
public:
- EmitCopyToRegMatcher(unsigned srcSlot,
- const CodeGenRegister *destPhysReg)
- : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {}
+ EmitCopyToRegMatcher(unsigned srcSlot, const CodeGenRegister *destPhysReg)
+ : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {}
unsigned getSrcSlot() const { return SrcSlot; }
const CodeGenRegister *getDestPhysReg() const { return DestPhysReg; }
@@ -995,16 +978,15 @@ private:
}
};
-
-
/// EmitNodeXFormMatcher - Emit an operation that runs an SDNodeXForm on a
/// recorded node and records the result.
class EmitNodeXFormMatcher : public Matcher {
unsigned Slot;
Record *NodeXForm;
+
public:
EmitNodeXFormMatcher(unsigned slot, Record *nodeXForm)
- : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {}
+ : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {}
unsigned getSlot() const { return Slot; }
Record *getNodeXForm() const { return NodeXForm; }
@@ -1033,6 +1015,7 @@ class EmitNodeMatcherCommon : public Matcher {
/// If this is a varidic node, this is set to the number of fixed arity
/// operands in the root of the pattern. The rest are appended to this node.
int NumFixedArityOperands;
+
public:
EmitNodeMatcherCommon(const CodeGenInstruction &cgi,
ArrayRef<MVT::SimpleValueType> vts,
@@ -1061,7 +1044,6 @@ public:
const SmallVectorImpl<MVT::SimpleValueType> &getVTList() const { return VTs; }
const SmallVectorImpl<unsigned> &getOperandList() const { return Operands; }
-
bool hasChain() const { return HasChain; }
bool hasInGlue() const { return HasInGlue; }
bool hasOutGlue() const { return HasOutGlue; }
@@ -1081,6 +1063,7 @@ private:
class EmitNodeMatcher : public EmitNodeMatcherCommon {
void anchor() override;
unsigned FirstResultSlot;
+
public:
EmitNodeMatcher(const CodeGenInstruction &cgi,
ArrayRef<MVT::SimpleValueType> vts,
@@ -1094,15 +1077,13 @@ public:
unsigned getFirstResultSlot() const { return FirstResultSlot; }
- static bool classof(const Matcher *N) {
- return N->getKind() == EmitNode;
- }
-
+ static bool classof(const Matcher *N) { return N->getKind() == EmitNode; }
};
class MorphNodeToMatcher : public EmitNodeMatcherCommon {
void anchor() override;
const PatternToMatch &Pattern;
+
public:
MorphNodeToMatcher(const CodeGenInstruction &cgi,
ArrayRef<MVT::SimpleValueType> vts,
@@ -1116,9 +1097,7 @@ public:
const PatternToMatch &getPattern() const { return Pattern; }
- static bool classof(const Matcher *N) {
- return N->getKind() == MorphNodeTo;
- }
+ static bool classof(const Matcher *N) { return N->getKind() == MorphNodeTo; }
};
/// CompleteMatchMatcher - Complete a match by replacing the results of the
@@ -1127,11 +1106,12 @@ public:
class CompleteMatchMatcher : public Matcher {
SmallVector<unsigned, 2> Results;
const PatternToMatch &Pattern;
+
public:
CompleteMatchMatcher(ArrayRef<unsigned> results,
const PatternToMatch &pattern)
- : Matcher(CompleteMatch), Results(results.begin(), results.end()),
- Pattern(pattern) {}
+ : Matcher(CompleteMatch), Results(results.begin(), results.end()),
+ Pattern(pattern) {}
unsigned getNumResults() const { return Results.size(); }
unsigned getResult(unsigned R) const { return Results[R]; }
@@ -1145,7 +1125,7 @@ private:
void printImpl(raw_ostream &OS, unsigned indent) const override;
bool isEqualImpl(const Matcher *M) const override {
return cast<CompleteMatchMatcher>(M)->Results == Results &&
- &cast<CompleteMatchMatcher>(M)->Pattern == &Pattern;
+ &cast<CompleteMatchMatcher>(M)->Pattern == &Pattern;
}
};
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index 50156d34528c..b475c98d30a6 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -50,7 +50,7 @@ namespace {
class MatcherTableEmitter {
const CodeGenDAGPatterns &CGP;
- SmallVector<unsigned, Matcher::HighestKind+1> OpcodeCounts;
+ SmallVector<unsigned, Matcher::HighestKind + 1> OpcodeCounts;
std::vector<TreePattern *> NodePredicates;
std::vector<TreePattern *> NodePredicatesWithOperands;
@@ -62,14 +62,13 @@ class MatcherTableEmitter {
std::vector<std::string> PatternPredicates;
- std::vector<const ComplexPattern*> ComplexPatterns;
+ std::vector<const ComplexPattern *> ComplexPatterns;
-
- DenseMap<Record*, unsigned> NodeXFormMap;
- std::vector<Record*> NodeXForms;
+ DenseMap<Record *, unsigned> NodeXFormMap;
+ std::vector<Record *> NodeXForms;
std::vector<std::string> VecIncludeStrings;
- MapVector<std::string, unsigned, StringMap<unsigned> > VecPatterns;
+ MapVector<std::string, unsigned, StringMap<unsigned>> VecPatterns;
unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) {
const auto It = VecPatterns.find(P);
@@ -184,8 +183,8 @@ private:
unsigned SizeMatcher(Matcher *N, raw_ostream &OS);
- unsigned EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
- raw_ostream &OS);
+ unsigned EmitMatcher(const Matcher *N, const unsigned Indent,
+ unsigned CurrentIdx, raw_ostream &OS);
unsigned getNodePredicate(TreePredicateFn Pred) {
// We use the first predicate.
@@ -210,28 +209,28 @@ private:
NodeXForms.push_back(Rec);
Entry = NodeXForms.size();
}
- return Entry-1;
+ return Entry - 1;
}
-
};
} // end anonymous namespace.
-static std::string GetPatFromTreePatternNode(const TreePatternNode *N) {
+static std::string GetPatFromTreePatternNode(const TreePatternNode &N) {
std::string str;
raw_string_ostream Stream(str);
- Stream << *N;
+ Stream << N;
return str;
}
static unsigned GetVBRSize(unsigned Val) {
- if (Val <= 127) return 1;
+ if (Val <= 127)
+ return 1;
unsigned NumBytes = 0;
while (Val >= 128) {
Val >>= 7;
++NumBytes;
}
- return NumBytes+1;
+ return NumBytes + 1;
}
/// EmitVBRValue - Emit the specified value as a VBR, returning the number of
@@ -245,7 +244,7 @@ static unsigned EmitVBRValue(uint64_t Val, raw_ostream &OS) {
uint64_t InVal = Val;
unsigned NumBytes = 0;
while (Val >= 128) {
- OS << (Val&127) << "|128,";
+ OS << (Val & 127) << "|128,";
Val >>= 7;
++NumBytes;
}
@@ -253,7 +252,7 @@ static unsigned EmitVBRValue(uint64_t Val, raw_ostream &OS) {
if (!OmitComments)
OS << "/*" << InVal << "*/";
OS << ", ";
- return NumBytes+1;
+ return NumBytes + 1;
}
/// Emit the specified signed value as a VBR. To improve compression we encode
@@ -290,8 +289,7 @@ static std::string getIncludePath(const Record *R) {
/// This function traverses the matcher tree and sizes all the nodes
/// that are children of the three kinds of nodes that have them.
-unsigned MatcherTableEmitter::
-SizeMatcherList(Matcher *N, raw_ostream &OS) {
+unsigned MatcherTableEmitter::SizeMatcherList(Matcher *N, raw_ostream &OS) {
unsigned Size = 0;
while (N) {
Size += SizeMatcher(N, OS);
@@ -303,8 +301,7 @@ SizeMatcherList(Matcher *N, raw_ostream &OS) {
/// This function sizes the children of the three kinds of nodes that
/// have them. It does so by using special cases for those three
/// nodes, but sharing the code in EmitMatcher() for the other kinds.
-unsigned MatcherTableEmitter::
-SizeMatcher(Matcher *N, raw_ostream &OS) {
+unsigned MatcherTableEmitter::SizeMatcher(Matcher *N, raw_ostream &OS) {
unsigned Idx = 0;
++OpcodeCounts[N->getKind()];
@@ -389,7 +386,7 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) {
"The sizes of Pattern and include vectors should be the same");
BeginEmitFunction(OS, "StringRef", "getPatternForIndex(unsigned Index)",
- true/*AddOverride*/);
+ true /*AddOverride*/);
OS << "{\n";
OS << "static const char *PATTERN_MATCH_TABLE[] = {\n";
@@ -403,7 +400,7 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) {
EndEmitFunction(OS);
BeginEmitFunction(OS, "StringRef", "getIncludePathForIndex(unsigned Index)",
- true/*AddOverride*/);
+ true /*AddOverride*/);
OS << "{\n";
OS << "static const char *INCLUDE_PATH_TABLE[] = {\n";
@@ -419,9 +416,10 @@ void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) {
/// EmitMatcher - Emit bytes for the specified matcher and return
/// the number of bytes emitted.
-unsigned MatcherTableEmitter::
-EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
- raw_ostream &OS) {
+unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
+ const unsigned Indent,
+ unsigned CurrentIdx,
+ raw_ostream &OS) {
OS.indent(Indent);
switch (N->getKind()) {
@@ -434,7 +432,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
if (i == 0) {
OS << "OPC_Scope, ";
++CurrentIdx;
- } else {
+ } else {
if (!OmitComments) {
OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/";
OS.indent(Indent) << "/*Scope*/ ";
@@ -451,7 +449,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
}
OS << '\n';
- ChildSize = EmitMatcherList(SM->getChild(i), Indent+1,
+ ChildSize = EmitMatcherList(SM->getChild(i), Indent + 1,
CurrentIdx + VBRSize, OS);
assert(ChildSize == SM->getChild(i)->getSize() &&
"Emitted child size does not match calculated size");
@@ -471,18 +469,15 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
case Matcher::RecordNode:
OS << "OPC_RecordNode,";
if (!OmitComments)
- OS << " // #"
- << cast<RecordMatcher>(N)->getResultNo() << " = "
+ OS << " // #" << cast<RecordMatcher>(N)->getResultNo() << " = "
<< cast<RecordMatcher>(N)->getWhatFor();
OS << '\n';
return 1;
case Matcher::RecordChild:
- OS << "OPC_RecordChild" << cast<RecordChildMatcher>(N)->getChildNo()
- << ',';
+ OS << "OPC_RecordChild" << cast<RecordChildMatcher>(N)->getChildNo() << ',';
if (!OmitComments)
- OS << " // #"
- << cast<RecordChildMatcher>(N)->getResultNo() << " = "
+ OS << " // #" << cast<RecordChildMatcher>(N)->getResultNo() << " = "
<< cast<RecordChildMatcher>(N)->getWhatFor();
OS << '\n';
return 1;
@@ -522,14 +517,13 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
return 1;
case Matcher::CheckSame:
- OS << "OPC_CheckSame, "
- << cast<CheckSameMatcher>(N)->getMatchNumber() << ",\n";
+ OS << "OPC_CheckSame, " << cast<CheckSameMatcher>(N)->getMatchNumber()
+ << ",\n";
return 2;
case Matcher::CheckChildSame:
- OS << "OPC_CheckChild"
- << cast<CheckChildSameMatcher>(N)->getChildNo() << "Same, "
- << cast<CheckChildSameMatcher>(N)->getMatchNumber() << ",\n";
+ OS << "OPC_CheckChild" << cast<CheckChildSameMatcher>(N)->getChildNo()
+ << "Same, " << cast<CheckChildSameMatcher>(N)->getMatchNumber() << ",\n";
return 2;
case Matcher::CheckPatternPredicate: {
@@ -602,10 +596,10 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
unsigned IdxSize;
if (const SwitchOpcodeMatcher *SOM = dyn_cast<SwitchOpcodeMatcher>(N)) {
Child = SOM->getCaseMatcher(i);
- IdxSize = 2; // size of opcode in table is 2 bytes.
+ IdxSize = 2; // size of opcode in table is 2 bytes.
} else {
Child = cast<SwitchTypeMatcher>(N)->getCaseMatcher(i);
- IdxSize = 1; // size of type in table is 1 byte.
+ IdxSize = 1; // size of type in table is 1 byte.
}
if (i != 0) {
@@ -613,8 +607,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/";
OS.indent(Indent);
if (!OmitComments)
- OS << (isa<SwitchOpcodeMatcher>(N) ?
- "/*SwitchOpcode*/ " : "/*SwitchType*/ ");
+ OS << (isa<SwitchOpcodeMatcher>(N) ? "/*SwitchOpcode*/ "
+ : "/*SwitchType*/ ");
}
unsigned ChildSize = Child->getSize();
@@ -627,7 +621,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
OS << "// ->" << CurrentIdx + ChildSize;
OS << '\n';
- ChildSize = EmitMatcherList(Child, Indent+1, CurrentIdx, OS);
+ ChildSize = EmitMatcherList(Child, Indent + 1, CurrentIdx, OS);
assert(ChildSize == Child->getSize() &&
"Emitted child size does not match calculated size");
CurrentIdx += ChildSize;
@@ -638,8 +632,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
OS << "/*" << format_decimal(CurrentIdx, IndexWidth) << "*/";
OS.indent(Indent) << "0,";
if (!OmitComments)
- OS << (isa<SwitchOpcodeMatcher>(N) ?
- " // EndSwitchOpcode" : " // EndSwitchType");
+ OS << (isa<SwitchOpcodeMatcher>(N) ? " // EndSwitchOpcode"
+ : " // EndSwitchType");
OS << '\n';
return CurrentIdx - StartIdx + 1;
@@ -722,7 +716,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
OS << " // " << Pattern.getSelectFunc();
OS << ":$" << CCPM->getName();
for (unsigned i = 0, e = Pattern.getNumOperands(); i != e; ++i)
- OS << " #" << CCPM->getFirstResult()+i;
+ OS << " #" << CCPM->getFirstResult() + i;
if (Pattern.hasProperty(SDNPHasChain))
OS << " + chain result";
@@ -733,14 +727,16 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
case Matcher::CheckAndImm: {
OS << "OPC_CheckAndImm, ";
- unsigned Bytes=1+EmitVBRValue(cast<CheckAndImmMatcher>(N)->getValue(), OS);
+ unsigned Bytes =
+ 1 + EmitVBRValue(cast<CheckAndImmMatcher>(N)->getValue(), OS);
OS << '\n';
return Bytes;
}
case Matcher::CheckOrImm: {
OS << "OPC_CheckOrImm, ";
- unsigned Bytes = 1+EmitVBRValue(cast<CheckOrImmMatcher>(N)->getValue(), OS);
+ unsigned Bytes =
+ 1 + EmitVBRValue(cast<CheckOrImmMatcher>(N)->getValue(), OS);
OS << '\n';
return Bytes;
}
@@ -843,7 +839,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
case Matcher::EmitMergeInputChains: {
const EmitMergeInputChainsMatcher *MN =
- cast<EmitMergeInputChainsMatcher>(N);
+ cast<EmitMergeInputChainsMatcher>(N);
// Handle the specialized forms OPC_EmitMergeInputChains1_0, 1_1, and 1_2.
if (MN->getNumNodes() == 1 && MN->getNode(0) < 3) {
@@ -855,7 +851,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
for (unsigned i = 0, e = MN->getNumNodes(); i != e; ++i)
OS << MN->getNode(i) << ", ";
OS << '\n';
- return 2+MN->getNumNodes();
+ return 2 + MN->getNumNodes();
}
case Matcher::EmitCopyToReg: {
const auto *C2RMatcher = cast<EmitCopyToRegMatcher>(N);
@@ -884,8 +880,8 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
OS << "OPC_EmitNodeXForm, " << getNodeXFormID(XF->getNodeXForm()) << ", "
<< XF->getSlot() << ',';
if (!OmitComments)
- OS << " // "<<XF->getNodeXForm()->getName();
- OS <<'\n';
+ OS << " // " << XF->getNodeXForm()->getName();
+ OS << '\n';
return 3;
}
@@ -955,7 +951,7 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
}
OS << ",\n";
- OS.indent(FullIndexWidth + Indent+4);
+ OS.indent(FullIndexWidth + Indent + 4);
if (!CompressVTs) {
OS << EN->getNumVTs();
if (!OmitComments)
@@ -980,17 +976,18 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
OS << " // Results =";
unsigned First = E->getFirstResultSlot();
for (unsigned i = 0; i != NumResults; ++i)
- OS << " #" << First+i;
+ OS << " #" << First + i;
}
}
OS << '\n';
if (const MorphNodeToMatcher *SNT = dyn_cast<MorphNodeToMatcher>(N)) {
- OS.indent(FullIndexWidth + Indent) << "// Src: "
- << *SNT->getPattern().getSrcPattern() << " - Complexity = "
- << SNT->getPattern().getPatternComplexity(CGP) << '\n';
- OS.indent(FullIndexWidth + Indent) << "// Dst: "
- << *SNT->getPattern().getDstPattern() << '\n';
+ OS.indent(FullIndexWidth + Indent)
+ << "// Src: " << SNT->getPattern().getSrcPattern()
+ << " - Complexity = " << SNT->getPattern().getPatternComplexity(CGP)
+ << '\n';
+ OS.indent(FullIndexWidth + Indent)
+ << "// Dst: " << SNT->getPattern().getDstPattern() << '\n';
}
} else
OS << '\n';
@@ -1021,11 +1018,12 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
NumResultBytes += EmitVBRValue(CM->getResult(i), OS);
OS << '\n';
if (!OmitComments) {
- OS.indent(FullIndexWidth + Indent) << " // Src: "
- << *CM->getPattern().getSrcPattern() << " - Complexity = "
- << CM->getPattern().getPatternComplexity(CGP) << '\n';
- OS.indent(FullIndexWidth + Indent) << " // Dst: "
- << *CM->getPattern().getDstPattern();
+ OS.indent(FullIndexWidth + Indent)
+ << " // Src: " << CM->getPattern().getSrcPattern()
+ << " - Complexity = " << CM->getPattern().getPatternComplexity(CGP)
+ << '\n';
+ OS.indent(FullIndexWidth + Indent)
+ << " // Dst: " << CM->getPattern().getDstPattern();
}
OS << '\n';
return 2 + NumResultBytes + NumCoveredBytes;
@@ -1036,9 +1034,10 @@ EmitMatcher(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
/// This function traverses the matcher tree and emits all the nodes.
/// The nodes have already been sized.
-unsigned MatcherTableEmitter::
-EmitMatcherList(const Matcher *N, const unsigned Indent, unsigned CurrentIdx,
- raw_ostream &OS) {
+unsigned MatcherTableEmitter::EmitMatcherList(const Matcher *N,
+ const unsigned Indent,
+ unsigned CurrentIdx,
+ raw_ostream &OS) {
unsigned Size = 0;
while (N) {
if (!OmitComments)
@@ -1059,7 +1058,7 @@ void MatcherTableEmitter::EmitNodePredicatesFunction(
if (Preds.empty())
return;
- BeginEmitFunction(OS, "bool", Decl, true/*AddOverride*/);
+ BeginEmitFunction(OS, "bool", Decl, true /*AddOverride*/);
OS << "{\n";
OS << " switch (PredNo) {\n";
OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n";
@@ -1083,12 +1082,13 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
// Emit pattern predicates.
if (!PatternPredicates.empty()) {
BeginEmitFunction(OS, "bool",
- "CheckPatternPredicate(unsigned PredNo) const", true/*AddOverride*/);
+ "CheckPatternPredicate(unsigned PredNo) const",
+ true /*AddOverride*/);
OS << "{\n";
OS << " switch (PredNo) {\n";
OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n";
for (unsigned i = 0, e = PatternPredicates.size(); i != e; ++i)
- OS << " case " << i << ": return " << PatternPredicates[i] << ";\n";
+ OS << " case " << i << ": return " << PatternPredicates[i] << ";\n";
OS << " }\n";
OS << "}\n";
EndEmitFunction(OS);
@@ -1107,11 +1107,12 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
// Emit CompletePattern matchers.
// FIXME: This should be const.
if (!ComplexPatterns.empty()) {
- BeginEmitFunction(OS, "bool",
- "CheckComplexPattern(SDNode *Root, SDNode *Parent,\n"
- " SDValue N, unsigned PatternNo,\n"
- " SmallVectorImpl<std::pair<SDValue, SDNode *>> &Result)",
- true/*AddOverride*/);
+ BeginEmitFunction(
+ OS, "bool",
+ "CheckComplexPattern(SDNode *Root, SDNode *Parent,\n"
+ " SDValue N, unsigned PatternNo,\n"
+ " SmallVectorImpl<std::pair<SDValue, SDNode *>> &Result)",
+ true /*AddOverride*/);
OS << "{\n";
OS << " unsigned NextRes = Result.size();\n";
OS << " switch (PatternNo) {\n";
@@ -1121,7 +1122,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
unsigned NumOps = P.getNumOperands();
if (P.hasProperty(SDNPHasChain))
- ++NumOps; // Get the chained node too.
+ ++NumOps; // Get the chained node too.
OS << " case " << i << ":\n";
if (InstrumentCoverage)
@@ -1160,12 +1161,12 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
EndEmitFunction(OS);
}
-
// Emit SDNodeXForm handlers.
// FIXME: This should be const.
if (!NodeXForms.empty()) {
BeginEmitFunction(OS, "SDValue",
- "RunSDNodeXForm(SDValue V, unsigned XFormNo)", true/*AddOverride*/);
+ "RunSDNodeXForm(SDValue V, unsigned XFormNo)",
+ true /*AddOverride*/);
OS << "{\n";
OS << " switch (XFormNo) {\n";
OS << " default: llvm_unreachable(\"Invalid xform # in table?\");\n";
@@ -1173,7 +1174,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(raw_ostream &OS) {
// FIXME: The node xform could take SDValue's instead of SDNode*'s.
for (unsigned i = 0, e = NodeXForms.size(); i != e; ++i) {
const CodeGenDAGPatterns::NodeXForm &Entry =
- CGP.getSDNodeTransform(NodeXForms[i]);
+ CGP.getSDNodeTransform(NodeXForms[i]);
Record *SDNode = Entry.first;
const std::string &Code = Entry.second;
@@ -1281,8 +1282,7 @@ static StringRef getOpcodeString(Matcher::KindTy Kind) {
llvm_unreachable("Unhandled opcode?");
}
-void MatcherTableEmitter::EmitHistogram(const Matcher *M,
- raw_ostream &OS) {
+void MatcherTableEmitter::EmitHistogram(const Matcher *M, raw_ostream &OS) {
if (OmitComments)
return;
@@ -1295,9 +1295,7 @@ void MatcherTableEmitter::EmitHistogram(const Matcher *M,
OS << '\n';
}
-
-void llvm::EmitMatcherTable(Matcher *TheMatcher,
- const CodeGenDAGPatterns &CGP,
+void llvm::EmitMatcherTable(Matcher *TheMatcher, const CodeGenDAGPatterns &CGP,
raw_ostream &OS) {
OS << "#if defined(GET_DAGISEL_DECL) && defined(GET_DAGISEL_BODY)\n";
OS << "#error GET_DAGISEL_DECL and GET_DAGISEL_BODY cannot be both defined, ";
@@ -1328,7 +1326,7 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher,
OS << "#define DAGISEL_CLASS_COLONCOLON\n";
OS << "#endif\n\n";
- BeginEmitFunction(OS, "void", "SelectCode(SDNode *N)", false/*AddOverride*/);
+ BeginEmitFunction(OS, "void", "SelectCode(SDNode *N)", false /*AddOverride*/);
MatcherTableEmitter MatcherEmitter(TheMatcher, CGP);
// First we size all the children of the three kinds of matchers that have
@@ -1348,7 +1346,8 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher,
OS << " #define TARGET_VAL(X) X & 255, unsigned(X) >> 8\n";
OS << " static const unsigned char MatcherTable[] = {\n";
TotalSize = MatcherEmitter.EmitMatcherList(TheMatcher, 1, 0, OS);
- OS << " 0\n }; // Total Array size is " << (TotalSize+1) << " bytes\n\n";
+ OS << " 0\n }; // Total Array size is " << (TotalSize + 1)
+ << " bytes\n\n";
MatcherEmitter.EmitHistogram(TheMatcher, OS);
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 3526e97c8e08..956cb5e4a65a 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -20,7 +20,6 @@
#include <utility>
using namespace llvm;
-
/// getRegisterValueType - Look up and return the ValueType of the specified
/// register. If the register is a member of multiple register classes, they
/// must all have the same type.
@@ -52,96 +51,97 @@ static MVT::SimpleValueType getRegisterValueType(Record *R,
return VT;
}
-
namespace {
- class MatcherGen {
- const PatternToMatch &Pattern;
- const CodeGenDAGPatterns &CGP;
-
- /// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts
- /// out with all of the types removed. This allows us to insert type checks
- /// as we scan the tree.
- TreePatternNodePtr PatWithNoTypes;
-
- /// VariableMap - A map from variable names ('$dst') to the recorded operand
- /// number that they were captured as. These are biased by 1 to make
- /// insertion easier.
- StringMap<unsigned> VariableMap;
-
- /// This maintains the recorded operand number that OPC_CheckComplexPattern
- /// drops each sub-operand into. We don't want to insert these into
- /// VariableMap because that leads to identity checking if they are
- /// encountered multiple times. Biased by 1 like VariableMap for
- /// consistency.
- StringMap<unsigned> NamedComplexPatternOperands;
-
- /// NextRecordedOperandNo - As we emit opcodes to record matched values in
- /// the RecordedNodes array, this keeps track of which slot will be next to
- /// record into.
- unsigned NextRecordedOperandNo;
-
- /// MatchedChainNodes - This maintains the position in the recorded nodes
- /// array of all of the recorded input nodes that have chains.
- SmallVector<unsigned, 2> MatchedChainNodes;
-
- /// MatchedComplexPatterns - This maintains a list of all of the
- /// ComplexPatterns that we need to check. The second element of each pair
- /// is the recorded operand number of the input node.
- SmallVector<std::pair<const TreePatternNode*,
- unsigned>, 2> MatchedComplexPatterns;
-
- /// PhysRegInputs - List list has an entry for each explicitly specified
- /// physreg input to the pattern. The first elt is the Register node, the
- /// second is the recorded slot number the input pattern match saved it in.
- SmallVector<std::pair<Record*, unsigned>, 2> PhysRegInputs;
-
- /// Matcher - This is the top level of the generated matcher, the result.
- Matcher *TheMatcher;
-
- /// CurPredicate - As we emit matcher nodes, this points to the latest check
- /// which should have future checks stuck into its Next position.
- Matcher *CurPredicate;
- public:
- MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp);
-
- bool EmitMatcherCode(unsigned Variant);
- void EmitResultCode();
-
- Matcher *GetMatcher() const { return TheMatcher; }
- private:
- void AddMatcher(Matcher *NewNode);
- void InferPossibleTypes();
-
- // Matcher Generation.
- void EmitMatchCode(const TreePatternNode *N, TreePatternNode *NodeNoTypes);
- void EmitLeafMatchCode(const TreePatternNode *N);
- void EmitOperatorMatchCode(const TreePatternNode *N,
- TreePatternNode *NodeNoTypes);
-
- /// If this is the first time a node with unique identifier Name has been
- /// seen, record it. Otherwise, emit a check to make sure this is the same
- /// node. Returns true if this is the first encounter.
- bool recordUniqueNode(ArrayRef<std::string> Names);
-
- // Result Code Generation.
- unsigned getNamedArgumentSlot(StringRef Name) {
- unsigned VarMapEntry = VariableMap[Name];
- assert(VarMapEntry != 0 &&
- "Variable referenced but not defined and not caught earlier!");
- return VarMapEntry-1;
- }
+class MatcherGen {
+ const PatternToMatch &Pattern;
+ const CodeGenDAGPatterns &CGP;
+
+ /// PatWithNoTypes - This is a clone of Pattern.getSrcPattern() that starts
+ /// out with all of the types removed. This allows us to insert type checks
+ /// as we scan the tree.
+ TreePatternNodePtr PatWithNoTypes;
+
+ /// VariableMap - A map from variable names ('$dst') to the recorded operand
+ /// number that they were captured as. These are biased by 1 to make
+ /// insertion easier.
+ StringMap<unsigned> VariableMap;
+
+ /// This maintains the recorded operand number that OPC_CheckComplexPattern
+ /// drops each sub-operand into. We don't want to insert these into
+ /// VariableMap because that leads to identity checking if they are
+ /// encountered multiple times. Biased by 1 like VariableMap for
+ /// consistency.
+ StringMap<unsigned> NamedComplexPatternOperands;
+
+ /// NextRecordedOperandNo - As we emit opcodes to record matched values in
+ /// the RecordedNodes array, this keeps track of which slot will be next to
+ /// record into.
+ unsigned NextRecordedOperandNo;
+
+ /// MatchedChainNodes - This maintains the position in the recorded nodes
+ /// array of all of the recorded input nodes that have chains.
+ SmallVector<unsigned, 2> MatchedChainNodes;
+
+ /// MatchedComplexPatterns - This maintains a list of all of the
+ /// ComplexPatterns that we need to check. The second element of each pair
+ /// is the recorded operand number of the input node.
+ SmallVector<std::pair<const TreePatternNode *, unsigned>, 2>
+ MatchedComplexPatterns;
+
+ /// PhysRegInputs - List list has an entry for each explicitly specified
+ /// physreg input to the pattern. The first elt is the Register node, the
+ /// second is the recorded slot number the input pattern match saved it in.
+ SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs;
+
+ /// Matcher - This is the top level of the generated matcher, the result.
+ Matcher *TheMatcher;
+
+ /// CurPredicate - As we emit matcher nodes, this points to the latest check
+ /// which should have future checks stuck into its Next position.
+ Matcher *CurPredicate;
+
+public:
+ MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp);
+
+ bool EmitMatcherCode(unsigned Variant);
+ void EmitResultCode();
+
+ Matcher *GetMatcher() const { return TheMatcher; }
+
+private:
+ void AddMatcher(Matcher *NewNode);
+ void InferPossibleTypes();
+
+ // Matcher Generation.
+ void EmitMatchCode(const TreePatternNode &N, TreePatternNode &NodeNoTypes);
+ void EmitLeafMatchCode(const TreePatternNode &N);
+ void EmitOperatorMatchCode(const TreePatternNode &N,
+ TreePatternNode &NodeNoTypes);
+
+ /// If this is the first time a node with unique identifier Name has been
+ /// seen, record it. Otherwise, emit a check to make sure this is the same
+ /// node. Returns true if this is the first encounter.
+ bool recordUniqueNode(ArrayRef<std::string> Names);
+
+ // Result Code Generation.
+ unsigned getNamedArgumentSlot(StringRef Name) {
+ unsigned VarMapEntry = VariableMap[Name];
+ assert(VarMapEntry != 0 &&
+ "Variable referenced but not defined and not caught earlier!");
+ return VarMapEntry - 1;
+ }
- void EmitResultOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps);
- void EmitResultOfNamedOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps);
- void EmitResultLeafAsOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps);
- void EmitResultInstructionAsOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps);
- void EmitResultSDNodeXFormAsOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps);
- };
+ void EmitResultOperand(const TreePatternNode &N,
+ SmallVectorImpl<unsigned> &ResultOps);
+ void EmitResultOfNamedOperand(const TreePatternNode &N,
+ SmallVectorImpl<unsigned> &ResultOps);
+ void EmitResultLeafAsOperand(const TreePatternNode &N,
+ SmallVectorImpl<unsigned> &ResultOps);
+ void EmitResultInstructionAsOperand(const TreePatternNode &N,
+ SmallVectorImpl<unsigned> &ResultOps);
+ void EmitResultSDNodeXFormAsOperand(const TreePatternNode &N,
+ SmallVectorImpl<unsigned> &ResultOps);
+};
} // end anonymous namespace
@@ -162,7 +162,7 @@ MatcherGen::MatcherGen(const PatternToMatch &pattern,
// apply the type to the tree, then rerun type inference. Iterate until all
// types are resolved.
//
- PatWithNoTypes = Pattern.getSrcPattern()->clone();
+ PatWithNoTypes = Pattern.getSrcPattern().clone();
PatWithNoTypes->RemoveAllTypes();
// If there are types that are manifestly known, infer them.
@@ -180,11 +180,10 @@ void MatcherGen::InferPossibleTypes() {
bool MadeChange = true;
while (MadeChange)
- MadeChange = PatWithNoTypes->ApplyTypeConstraints(TP,
- true/*Ignore reg constraints*/);
+ MadeChange = PatWithNoTypes->ApplyTypeConstraints(
+ TP, true /*Ignore reg constraints*/);
}
-
/// AddMatcher - Add a matcher node to the current graph we're building.
void MatcherGen::AddMatcher(Matcher *NewNode) {
if (CurPredicate)
@@ -194,21 +193,20 @@ void MatcherGen::AddMatcher(Matcher *NewNode) {
CurPredicate = NewNode;
}
-
//===----------------------------------------------------------------------===//
// Pattern Match Generation
//===----------------------------------------------------------------------===//
/// EmitLeafMatchCode - Generate matching code for leaf nodes.
-void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
- assert(N->isLeaf() && "Not a leaf?");
+void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) {
+ assert(N.isLeaf() && "Not a leaf?");
// Direct match against an integer constant.
- if (IntInit *II = dyn_cast<IntInit>(N->getLeafValue())) {
+ if (IntInit *II = dyn_cast<IntInit>(N.getLeafValue())) {
// If this is the root of the dag we're matching, we emit a redundant opcode
// check to ensure that this gets folded into the normal top-level
// OpcodeSwitch.
- if (N == Pattern.getSrcPattern()) {
+ if (&N == &Pattern.getSrcPattern()) {
const SDNodeInfo &NI = CGP.getSDNodeInfo(CGP.getSDNodeNamed("imm"));
AddMatcher(new CheckOpcodeMatcher(NI));
}
@@ -217,14 +215,14 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
}
// An UnsetInit represents a named node without any constraints.
- if (isa<UnsetInit>(N->getLeafValue())) {
- assert(N->hasName() && "Unnamed ? leaf");
+ if (isa<UnsetInit>(N.getLeafValue())) {
+ assert(N.hasName() && "Unnamed ? leaf");
return;
}
- DefInit *DI = dyn_cast<DefInit>(N->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(N.getLeafValue());
if (!DI) {
- errs() << "Unknown leaf kind: " << *N << "\n";
+ errs() << "Unknown leaf kind: " << N << "\n";
abort();
}
@@ -234,13 +232,13 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
// unnamed.
if (LeafRec->isSubClassOf("ValueType")) {
// A named ValueType leaf always matches: (add i32:$a, i32:$b).
- if (N->hasName())
+ if (N.hasName())
return;
// An unnamed ValueType as in (sext_inreg GPR:$foo, i8).
return AddMatcher(new CheckValueTypeMatcher(LeafRec->getName()));
}
- if (// Handle register references. Nothing to do here, they always match.
+ if ( // Handle register references. Nothing to do here, they always match.
LeafRec->isSubClassOf("RegisterClass") ||
LeafRec->isSubClassOf("RegisterOperand") ||
LeafRec->isSubClassOf("PointerLikeRegClass") ||
@@ -252,7 +250,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
// If we have a physreg reference like (mul gpr:$src, EAX) then we need to
// record the register
if (LeafRec->isSubClassOf("Register")) {
- AddMatcher(new RecordMatcher("physreg input "+LeafRec->getName().str(),
+ AddMatcher(new RecordMatcher("physreg input " + LeafRec->getName().str(),
NextRecordedOperandNo));
PhysRegInputs.push_back(std::make_pair(LeafRec, NextRecordedOperandNo++));
return;
@@ -264,17 +262,17 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
if (LeafRec->isSubClassOf("ComplexPattern")) {
// We can't model ComplexPattern uses that don't have their name taken yet.
// The OPC_CheckComplexPattern operation implicitly records the results.
- if (N->getName().empty()) {
+ if (N.getName().empty()) {
std::string S;
raw_string_ostream OS(S);
- OS << "We expect complex pattern uses to have names: " << *N;
+ OS << "We expect complex pattern uses to have names: " << N;
PrintFatalError(S);
}
// Remember this ComplexPattern so that we can emit it after all the other
// structural matches are done.
- unsigned InputOperand = VariableMap[N->getName()] - 1;
- MatchedComplexPatterns.push_back(std::make_pair(N, InputOperand));
+ unsigned InputOperand = VariableMap[N.getName()] - 1;
+ MatchedComplexPatterns.push_back(std::make_pair(&N, InputOperand));
return;
}
@@ -283,8 +281,8 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
// If this is the root of the dag we're matching, we emit a redundant opcode
// check to ensure that this gets folded into the normal top-level
// OpcodeSwitch.
- if (N == Pattern.getSrcPattern()) {
- MVT VT = N->getSimpleType(0);
+ if (&N == &Pattern.getSrcPattern()) {
+ MVT VT = N.getSimpleType(0);
StringRef Name = VT.isScalableVector() ? "splat_vector" : "build_vector";
const SDNodeInfo &NI = CGP.getSDNodeInfo(CGP.getSDNodeNamed(Name));
AddMatcher(new CheckOpcodeMatcher(NI));
@@ -296,33 +294,33 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
return;
}
- errs() << "Unknown leaf kind: " << *N << "\n";
+ errs() << "Unknown leaf kind: " << N << "\n";
abort();
}
-void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
- TreePatternNode *NodeNoTypes) {
- assert(!N->isLeaf() && "Not an operator?");
+void MatcherGen::EmitOperatorMatchCode(const TreePatternNode &N,
+ TreePatternNode &NodeNoTypes) {
+ assert(!N.isLeaf() && "Not an operator?");
- if (N->getOperator()->isSubClassOf("ComplexPattern")) {
+ if (N.getOperator()->isSubClassOf("ComplexPattern")) {
// The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is
// "MY_PAT:op1:op2". We should already have validated that the uses are
// consistent.
- std::string PatternName = std::string(N->getOperator()->getName());
- for (unsigned i = 0; i < N->getNumChildren(); ++i) {
+ std::string PatternName = std::string(N.getOperator()->getName());
+ for (unsigned i = 0; i < N.getNumChildren(); ++i) {
PatternName += ":";
- PatternName += N->getChild(i)->getName();
+ PatternName += N.getChild(i).getName();
}
if (recordUniqueNode(PatternName)) {
- auto NodeAndOpNum = std::make_pair(N, NextRecordedOperandNo - 1);
+ auto NodeAndOpNum = std::make_pair(&N, NextRecordedOperandNo - 1);
MatchedComplexPatterns.push_back(NodeAndOpNum);
}
return;
}
- const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator());
+ const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N.getOperator());
// If this is an 'and R, 1234' where the operation is AND/OR and the RHS is
// a constant without a predicate fn that has more than one bit set, handle
@@ -334,28 +332,28 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
// them from the mask in the dag. For example, it might turn 'AND X, 255'
// into 'AND X, 254' if it knows the low bit is set. Emit code that checks
// to handle this.
- if ((N->getOperator()->getName() == "and" ||
- N->getOperator()->getName() == "or") &&
- N->getChild(1)->isLeaf() && N->getChild(1)->getPredicateCalls().empty() &&
- N->getPredicateCalls().empty()) {
- if (IntInit *II = dyn_cast<IntInit>(N->getChild(1)->getLeafValue())) {
+ if ((N.getOperator()->getName() == "and" ||
+ N.getOperator()->getName() == "or") &&
+ N.getChild(1).isLeaf() && N.getChild(1).getPredicateCalls().empty() &&
+ N.getPredicateCalls().empty()) {
+ if (IntInit *II = dyn_cast<IntInit>(N.getChild(1).getLeafValue())) {
if (!llvm::has_single_bit<uint32_t>(
II->getValue())) { // Don't bother with single bits.
// If this is at the root of the pattern, we emit a redundant
// CheckOpcode so that the following checks get factored properly under
// a single opcode check.
- if (N == Pattern.getSrcPattern())
+ if (&N == &Pattern.getSrcPattern())
AddMatcher(new CheckOpcodeMatcher(CInfo));
// Emit the CheckAndImm/CheckOrImm node.
- if (N->getOperator()->getName() == "and")
+ if (N.getOperator()->getName() == "and")
AddMatcher(new CheckAndImmMatcher(II->getValue()));
else
AddMatcher(new CheckOrImmMatcher(II->getValue()));
// Match the LHS of the AND as appropriate.
AddMatcher(new MoveChildMatcher(0));
- EmitMatchCode(N->getChild(0), NodeNoTypes->getChild(0));
+ EmitMatchCode(N.getChild(0), NodeNoTypes.getChild(0));
AddMatcher(new MoveParentMatcher());
return;
}
@@ -367,16 +365,16 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
// If this node has memory references (i.e. is a load or store), tell the
// interpreter to capture them in the memref array.
- if (N->NodeHasProperty(SDNPMemOperand, CGP))
+ if (N.NodeHasProperty(SDNPMemOperand, CGP))
AddMatcher(new RecordMemRefMatcher());
// If this node has a chain, then the chain is operand #0 is the SDNode, and
// the child numbers of the node are all offset by one.
unsigned OpNo = 0;
- if (N->NodeHasProperty(SDNPHasChain, CGP)) {
+ if (N.NodeHasProperty(SDNPHasChain, CGP)) {
// Record the node and remember it in our chained nodes list.
- AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() +
- "' chained node",
+ AddMatcher(new RecordMatcher("'" + N.getOperator()->getName().str() +
+ "' chained node",
NextRecordedOperandNo));
// Remember all of the input chains our pattern will match.
MatchedChainNodes.push_back(NextRecordedOperandNo++);
@@ -406,26 +404,24 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
// To prevent this, we emit a dynamic check for legality before allowing
// this to be folded.
//
- const TreePatternNode *Root = Pattern.getSrcPattern();
- if (N != Root) { // Not the root of the pattern.
+ const TreePatternNode &Root = Pattern.getSrcPattern();
+ if (&N != &Root) { // Not the root of the pattern.
// If there is a node between the root and this node, then we definitely
// need to emit the check.
- bool NeedCheck = !Root->hasChild(N);
+ bool NeedCheck = !Root.hasChild(&N);
// If it *is* an immediate child of the root, we can still need a check if
// the root SDNode has multiple inputs. For us, this means that it is an
// intrinsic, has multiple operands, or has other inputs like chain or
// glue).
if (!NeedCheck) {
- const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root->getOperator());
+ const SDNodeInfo &PInfo = CGP.getSDNodeInfo(Root.getOperator());
NeedCheck =
- Root->getOperator() == CGP.get_intrinsic_void_sdnode() ||
- Root->getOperator() == CGP.get_intrinsic_w_chain_sdnode() ||
- Root->getOperator() == CGP.get_intrinsic_wo_chain_sdnode() ||
- PInfo.getNumOperands() > 1 ||
- PInfo.hasProperty(SDNPHasChain) ||
- PInfo.hasProperty(SDNPInGlue) ||
- PInfo.hasProperty(SDNPOptInGlue);
+ Root.getOperator() == CGP.get_intrinsic_void_sdnode() ||
+ Root.getOperator() == CGP.get_intrinsic_w_chain_sdnode() ||
+ Root.getOperator() == CGP.get_intrinsic_wo_chain_sdnode() ||
+ PInfo.getNumOperands() > 1 || PInfo.hasProperty(SDNPHasChain) ||
+ PInfo.hasProperty(SDNPInGlue) || PInfo.hasProperty(SDNPOptInGlue);
}
if (NeedCheck)
@@ -434,27 +430,26 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
}
// If this node has an output glue and isn't the root, remember it.
- if (N->NodeHasProperty(SDNPOutGlue, CGP) &&
- N != Pattern.getSrcPattern()) {
+ if (N.NodeHasProperty(SDNPOutGlue, CGP) && &N != &Pattern.getSrcPattern()) {
// TODO: This redundantly records nodes with both glues and chains.
// Record the node and remember it in our chained nodes list.
- AddMatcher(new RecordMatcher("'" + N->getOperator()->getName().str() +
- "' glue output node",
+ AddMatcher(new RecordMatcher("'" + N.getOperator()->getName().str() +
+ "' glue output node",
NextRecordedOperandNo));
}
// If this node is known to have an input glue or if it *might* have an input
// glue, capture it as the glue input of the pattern.
- if (N->NodeHasProperty(SDNPOptInGlue, CGP) ||
- N->NodeHasProperty(SDNPInGlue, CGP))
+ if (N.NodeHasProperty(SDNPOptInGlue, CGP) ||
+ N.NodeHasProperty(SDNPInGlue, CGP))
AddMatcher(new CaptureGlueInputMatcher());
- for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i, ++OpNo) {
+ for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i, ++OpNo) {
// Get the code suitable for matching this child. Move to the child, check
// it then move back to the parent.
AddMatcher(new MoveChildMatcher(OpNo));
- EmitMatchCode(N->getChild(i), NodeNoTypes->getChild(i));
+ EmitMatchCode(N.getChild(i), NodeNoTypes.getChild(i));
AddMatcher(new MoveParentMatcher());
}
}
@@ -485,7 +480,7 @@ bool MatcherGen::recordUniqueNode(ArrayRef<std::string> Names) {
// we already have checked that the first reference is valid, we don't
// have to recursively match it, just check that it's the same as the
// previously named thing.
- AddMatcher(new CheckSameMatcher(Entry-1));
+ AddMatcher(new CheckSameMatcher(Entry - 1));
}
for (const std::string &Name : Names)
@@ -494,16 +489,17 @@ bool MatcherGen::recordUniqueNode(ArrayRef<std::string> Names) {
return NewRecord;
}
-void MatcherGen::EmitMatchCode(const TreePatternNode *N,
- TreePatternNode *NodeNoTypes) {
+void MatcherGen::EmitMatchCode(const TreePatternNode &N,
+ TreePatternNode &NodeNoTypes) {
// If N and NodeNoTypes don't agree on a type, then this is a case where we
// need to do a type check. Emit the check, apply the type to NodeNoTypes and
// reinfer any correlated types.
SmallVector<unsigned, 2> ResultsToTypeCheck;
- for (unsigned i = 0, e = NodeNoTypes->getNumTypes(); i != e; ++i) {
- if (NodeNoTypes->getExtType(i) == N->getExtType(i)) continue;
- NodeNoTypes->setType(i, N->getExtType(i));
+ for (unsigned i = 0, e = NodeNoTypes.getNumTypes(); i != e; ++i) {
+ if (NodeNoTypes.getExtType(i) == N.getExtType(i))
+ continue;
+ NodeNoTypes.setType(i, N.getExtType(i));
InferPossibleTypes();
ResultsToTypeCheck.push_back(i);
}
@@ -511,11 +507,12 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N,
// If this node has a name associated with it, capture it in VariableMap. If
// we already saw this in the pattern, emit code to verify dagness.
SmallVector<std::string, 4> Names;
- if (!N->getName().empty())
- Names.push_back(N->getName());
+ if (!N.getName().empty())
+ Names.push_back(N.getName());
- for (const ScopedName &Name : N->getNamesAsPredicateArg()) {
- Names.push_back(("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str());
+ for (const ScopedName &Name : N.getNamesAsPredicateArg()) {
+ Names.push_back(
+ ("pred:" + Twine(Name.getScope()) + ":" + Name.getIdentifier()).str());
}
if (!Names.empty()) {
@@ -523,14 +520,14 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N,
return;
}
- if (N->isLeaf())
+ if (N.isLeaf())
EmitLeafMatchCode(N);
else
EmitOperatorMatchCode(N, NodeNoTypes);
// If there are node predicates for this node, generate their checks.
- for (unsigned i = 0, e = N->getPredicateCalls().size(); i != e; ++i) {
- const TreePredicateCall &Pred = N->getPredicateCalls()[i];
+ for (unsigned i = 0, e = N.getPredicateCalls().size(); i != e; ++i) {
+ const TreePredicateCall &Pred = N.getPredicateCalls()[i];
SmallVector<unsigned, 4> Operands;
if (Pred.Fn.usesOperands()) {
TreePattern *TP = Pred.Fn.getOrigPatFragRecord();
@@ -544,7 +541,7 @@ void MatcherGen::EmitMatchCode(const TreePatternNode *N,
}
for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i)
- AddMatcher(new CheckTypeMatcher(N->getSimpleType(ResultsToTypeCheck[i]),
+ AddMatcher(new CheckTypeMatcher(N.getSimpleType(ResultsToTypeCheck[i]),
ResultsToTypeCheck[i]));
}
@@ -557,18 +554,21 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) {
// Depending on which variant we're generating code for, emit the root opcode
// check.
if (const ComplexPattern *CP =
- Pattern.getSrcPattern()->getComplexPatternInfo(CGP)) {
- const std::vector<Record*> &OpNodes = CP->getRootNodes();
- assert(!OpNodes.empty() &&"Complex Pattern must specify what it can match");
- if (Variant >= OpNodes.size()) return true;
+ Pattern.getSrcPattern().getComplexPatternInfo(CGP)) {
+ const std::vector<Record *> &OpNodes = CP->getRootNodes();
+ assert(!OpNodes.empty() &&
+ "Complex Pattern must specify what it can match");
+ if (Variant >= OpNodes.size())
+ return true;
AddMatcher(new CheckOpcodeMatcher(CGP.getSDNodeInfo(OpNodes[Variant])));
} else {
- if (Variant != 0) return true;
+ if (Variant != 0)
+ return true;
}
// Emit the matcher for the pattern structure and types.
- EmitMatchCode(Pattern.getSrcPattern(), PatWithNoTypes.get());
+ EmitMatchCode(Pattern.getSrcPattern(), *PatWithNoTypes);
// If the pattern has a predicate on it (e.g. only enabled when a subtarget
// feature is around, do the check).
@@ -581,28 +581,28 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) {
// because they are generally more expensive to evaluate and more difficult to
// factor.
for (unsigned i = 0, e = MatchedComplexPatterns.size(); i != e; ++i) {
- auto N = MatchedComplexPatterns[i].first;
+ auto &N = *MatchedComplexPatterns[i].first;
// Remember where the results of this match get stuck.
- if (N->isLeaf()) {
- NamedComplexPatternOperands[N->getName()] = NextRecordedOperandNo + 1;
+ if (N.isLeaf()) {
+ NamedComplexPatternOperands[N.getName()] = NextRecordedOperandNo + 1;
} else {
unsigned CurOp = NextRecordedOperandNo;
- for (unsigned i = 0; i < N->getNumChildren(); ++i) {
- NamedComplexPatternOperands[N->getChild(i)->getName()] = CurOp + 1;
- CurOp += N->getChild(i)->getNumMIResults(CGP);
+ for (unsigned i = 0; i < N.getNumChildren(); ++i) {
+ NamedComplexPatternOperands[N.getChild(i).getName()] = CurOp + 1;
+ CurOp += N.getChild(i).getNumMIResults(CGP);
}
}
// Get the slot we recorded the value in from the name on the node.
unsigned RecNodeEntry = MatchedComplexPatterns[i].second;
- const ComplexPattern *CP = N->getComplexPatternInfo(CGP);
+ const ComplexPattern *CP = N.getComplexPatternInfo(CGP);
assert(CP && "Not a valid ComplexPattern!");
// Emit a CheckComplexPat operation, which does the match (aborting if it
// fails) and pushes the matched operands onto the recorded nodes list.
- AddMatcher(new CheckComplexPatMatcher(*CP, RecNodeEntry, N->getName(),
+ AddMatcher(new CheckComplexPatMatcher(*CP, RecNodeEntry, N.getName(),
NextRecordedOperandNo));
// Record the right number of operands.
@@ -616,7 +616,7 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) {
// It is the last operand recorded.
assert(NextRecordedOperandNo > 1 &&
"Should have recorded input/result chains at least!");
- MatchedChainNodes.push_back(NextRecordedOperandNo-1);
+ MatchedChainNodes.push_back(NextRecordedOperandNo - 1);
}
// TODO: Complex patterns can't have output glues, if they did, we'd want
@@ -626,31 +626,30 @@ bool MatcherGen::EmitMatcherCode(unsigned Variant) {
return false;
}
-
//===----------------------------------------------------------------------===//
// Node Result Generation
//===----------------------------------------------------------------------===//
-void MatcherGen::EmitResultOfNamedOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps){
- assert(!N->getName().empty() && "Operand not named!");
+void MatcherGen::EmitResultOfNamedOperand(
+ const TreePatternNode &N, SmallVectorImpl<unsigned> &ResultOps) {
+ assert(!N.getName().empty() && "Operand not named!");
- if (unsigned SlotNo = NamedComplexPatternOperands[N->getName()]) {
+ if (unsigned SlotNo = NamedComplexPatternOperands[N.getName()]) {
// Complex operands have already been completely selected, just find the
// right slot ant add the arguments directly.
- for (unsigned i = 0; i < N->getNumMIResults(CGP); ++i)
+ for (unsigned i = 0; i < N.getNumMIResults(CGP); ++i)
ResultOps.push_back(SlotNo - 1 + i);
return;
}
- unsigned SlotNo = getNamedArgumentSlot(N->getName());
+ unsigned SlotNo = getNamedArgumentSlot(N.getName());
// If this is an 'imm' or 'fpimm' node, make sure to convert it to the target
// version of the immediate so that it doesn't get selected due to some other
// node use.
- if (!N->isLeaf()) {
- StringRef OperatorName = N->getOperator()->getName();
+ if (!N.isLeaf()) {
+ StringRef OperatorName = N.getOperator()->getName();
if (OperatorName == "imm" || OperatorName == "fpimm") {
AddMatcher(new EmitConvertToTargetMatcher(SlotNo));
ResultOps.push_back(NextRecordedOperandNo++);
@@ -658,39 +657,38 @@ void MatcherGen::EmitResultOfNamedOperand(const TreePatternNode *N,
}
}
- for (unsigned i = 0; i < N->getNumMIResults(CGP); ++i)
+ for (unsigned i = 0; i < N.getNumMIResults(CGP); ++i)
ResultOps.push_back(SlotNo + i);
}
-void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N,
+void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N,
SmallVectorImpl<unsigned> &ResultOps) {
- assert(N->isLeaf() && "Must be a leaf");
+ assert(N.isLeaf() && "Must be a leaf");
- if (IntInit *II = dyn_cast<IntInit>(N->getLeafValue())) {
- AddMatcher(new EmitIntegerMatcher(II->getValue(), N->getSimpleType(0)));
+ if (IntInit *II = dyn_cast<IntInit>(N.getLeafValue())) {
+ AddMatcher(new EmitIntegerMatcher(II->getValue(), N.getSimpleType(0)));
ResultOps.push_back(NextRecordedOperandNo++);
return;
}
// If this is an explicit register reference, handle it.
- if (DefInit *DI = dyn_cast<DefInit>(N->getLeafValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(N.getLeafValue())) {
Record *Def = DI->getDef();
if (Def->isSubClassOf("Register")) {
- const CodeGenRegister *Reg =
- CGP.getTargetInfo().getRegBank().getReg(Def);
- AddMatcher(new EmitRegisterMatcher(Reg, N->getSimpleType(0)));
+ const CodeGenRegister *Reg = CGP.getTargetInfo().getRegBank().getReg(Def);
+ AddMatcher(new EmitRegisterMatcher(Reg, N.getSimpleType(0)));
ResultOps.push_back(NextRecordedOperandNo++);
return;
}
if (Def->getName() == "zero_reg") {
- AddMatcher(new EmitRegisterMatcher(nullptr, N->getSimpleType(0)));
+ AddMatcher(new EmitRegisterMatcher(nullptr, N.getSimpleType(0)));
ResultOps.push_back(NextRecordedOperandNo++);
return;
}
if (Def->getName() == "undef_tied_input") {
- MVT::SimpleValueType ResultVT = N->getSimpleType(0);
+ MVT::SimpleValueType ResultVT = N.getSimpleType(0);
auto IDOperandNo = NextRecordedOperandNo++;
Record *ImpDef = Def->getRecords().getDef("IMPLICIT_DEF");
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(ImpDef);
@@ -743,25 +741,23 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N,
}
errs() << "unhandled leaf node:\n";
- N->dump();
+ N.dump();
}
-static bool
-mayInstNodeLoadOrStore(const TreePatternNode *N,
- const CodeGenDAGPatterns &CGP) {
- Record *Op = N->getOperator();
+static bool mayInstNodeLoadOrStore(const TreePatternNode &N,
+ const CodeGenDAGPatterns &CGP) {
+ Record *Op = N.getOperator();
const CodeGenTarget &CGT = CGP.getTargetInfo();
CodeGenInstruction &II = CGT.getInstruction(Op);
return II.mayLoad || II.mayStore;
}
-static unsigned
-numNodesThatMayLoadOrStore(const TreePatternNode *N,
- const CodeGenDAGPatterns &CGP) {
- if (N->isLeaf())
+static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N,
+ const CodeGenDAGPatterns &CGP) {
+ if (N.isLeaf())
return 0;
- Record *OpRec = N->getOperator();
+ Record *OpRec = N.getOperator();
if (!OpRec->isSubClassOf("Instruction"))
return 0;
@@ -769,32 +765,31 @@ numNodesThatMayLoadOrStore(const TreePatternNode *N,
if (mayInstNodeLoadOrStore(N, CGP))
++Count;
- for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
- Count += numNodesThatMayLoadOrStore(N->getChild(i), CGP);
+ for (unsigned i = 0, e = N.getNumChildren(); i != e; ++i)
+ Count += numNodesThatMayLoadOrStore(N.getChild(i), CGP);
return Count;
}
-void MatcherGen::
-EmitResultInstructionAsOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &OutputOps) {
- Record *Op = N->getOperator();
+void MatcherGen::EmitResultInstructionAsOperand(
+ const TreePatternNode &N, SmallVectorImpl<unsigned> &OutputOps) {
+ Record *Op = N.getOperator();
const CodeGenTarget &CGT = CGP.getTargetInfo();
CodeGenInstruction &II = CGT.getInstruction(Op);
const DAGInstruction &Inst = CGP.getInstruction(Op);
- bool isRoot = N == Pattern.getDstPattern();
+ bool isRoot = &N == &Pattern.getDstPattern();
// TreeHasOutGlue - True if this tree has glue.
bool TreeHasInGlue = false, TreeHasOutGlue = false;
if (isRoot) {
- const TreePatternNode *SrcPat = Pattern.getSrcPattern();
- TreeHasInGlue = SrcPat->TreeHasProperty(SDNPOptInGlue, CGP) ||
- SrcPat->TreeHasProperty(SDNPInGlue, CGP);
+ const TreePatternNode &SrcPat = Pattern.getSrcPattern();
+ TreeHasInGlue = SrcPat.TreeHasProperty(SDNPOptInGlue, CGP) ||
+ SrcPat.TreeHasProperty(SDNPInGlue, CGP);
// FIXME2: this is checking the entire pattern, not just the node in
// question, doing this just for the root seems like a total hack.
- TreeHasOutGlue = SrcPat->TreeHasProperty(SDNPOutGlue, CGP);
+ TreeHasOutGlue = SrcPat.TreeHasProperty(SDNPOutGlue, CGP);
}
// NumResults - This is the number of results produced by the instruction in
@@ -823,22 +818,21 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// filled in with their defaults unconditionally.
unsigned NonOverridableOperands = NumFixedOperands;
while (NonOverridableOperands > NumResults &&
- CGP.operandHasDefault(II.Operands[NonOverridableOperands-1].Rec))
+ CGP.operandHasDefault(II.Operands[NonOverridableOperands - 1].Rec))
--NonOverridableOperands;
- for (unsigned InstOpNo = NumResults, e = NumFixedOperands;
- InstOpNo != e; ++InstOpNo) {
+ for (unsigned InstOpNo = NumResults, e = NumFixedOperands; InstOpNo != e;
+ ++InstOpNo) {
// Determine what to emit for this operand.
Record *OperandNode = II.Operands[InstOpNo].Rec;
if (CGP.operandHasDefault(OperandNode) &&
- (InstOpNo < NonOverridableOperands || ChildNo >= N->getNumChildren())) {
+ (InstOpNo < NonOverridableOperands || ChildNo >= N.getNumChildren())) {
// This is a predicate or optional def operand which the pattern has not
// overridden, or which we aren't letting it override; emit the 'default
// ops' operands.
- const DAGDefaultOperand &DefaultOp
- = CGP.getDefaultOperand(OperandNode);
+ const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode);
for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i)
- EmitResultOperand(DefaultOp.DefaultOps[i].get(), InstOps);
+ EmitResultOperand(*DefaultOp.DefaultOps[i], InstOps);
continue;
}
@@ -857,15 +851,15 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
unsigned FinalNumOps = InstOps.size() + NumSubOps;
while (InstOps.size() < FinalNumOps) {
- const TreePatternNode *Child = N->getChild(ChildNo);
+ const TreePatternNode &Child = N.getChild(ChildNo);
unsigned BeforeAddingNumOps = InstOps.size();
EmitResultOperand(Child, InstOps);
assert(InstOps.size() > BeforeAddingNumOps && "Didn't add any operands");
// If the operand is an instruction and it produced multiple results, just
// take the first one.
- if (!Child->isLeaf() && Child->getOperator()->isSubClassOf("Instruction"))
- InstOps.resize(BeforeAddingNumOps+1);
+ if (!Child.isLeaf() && Child.getOperator()->isSubClassOf("Instruction"))
+ InstOps.resize(BeforeAddingNumOps + 1);
++ChildNo;
}
@@ -877,8 +871,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// above. Emit the remaining instructions implicitly added by the use for
// variable_ops.
if (II.Operands.isVariadic) {
- for (unsigned I = ChildNo, E = N->getNumChildren(); I < E; ++I)
- EmitResultOperand(N->getChild(I), InstOps);
+ for (unsigned I = ChildNo, E = N.getNumChildren(); I < E; ++I)
+ EmitResultOperand(N.getChild(I), InstOps);
}
// If this node has input glue or explicitly specified input physregs, we
@@ -889,9 +883,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// occur in patterns like (mul:i8 AL:i8, GR8:i8:$src).
for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) {
const CodeGenRegister *Reg =
- CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first);
- AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second,
- Reg));
+ CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first);
+ AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, Reg));
}
// Even if the node has no other glue inputs, the resultant node must be
@@ -903,8 +896,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// Determine the result types.
SmallVector<MVT::SimpleValueType, 4> ResultVTs;
- for (unsigned i = 0, e = N->getNumTypes(); i != e; ++i)
- ResultVTs.push_back(N->getSimpleType(i));
+ for (unsigned i = 0, e = N.getNumTypes(); i != e; ++i)
+ ResultVTs.push_back(N.getSimpleType(i));
// If this is the root instruction of a pattern that has physical registers in
// its result pattern, add output VTs for them. For example, X86 has:
@@ -919,7 +912,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
HandledReg = II.ImplicitDefs[0];
for (Record *Reg : Pattern.getDstRegs()) {
- if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue;
+ if (!Reg->isSubClassOf("Register") || Reg == HandledReg)
+ continue;
ResultVTs.push_back(getRegisterValueType(Reg, CGT));
}
}
@@ -928,9 +922,8 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// a node that is variadic, mark the generated node as variadic so that it
// gets the excess operands from the input DAG.
int NumFixedArityOperands = -1;
- if (isRoot &&
- Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP))
- NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren();
+ if (isRoot && Pattern.getSrcPattern().NodeHasProperty(SDNPVariadic, CGP))
+ NumFixedArityOperands = Pattern.getSrcPattern().getNumChildren();
// If this is the root node and multiple matched nodes in the input pattern
// have MemRefs in them, have the interpreter collect them and plop them onto
@@ -940,22 +933,22 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// FIXME3: This is actively incorrect for result patterns with multiple
// memory-referencing instructions.
bool PatternHasMemOperands =
- Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP);
+ Pattern.getSrcPattern().TreeHasProperty(SDNPMemOperand, CGP);
bool NodeHasMemRefs = false;
if (PatternHasMemOperands) {
unsigned NumNodesThatLoadOrStore =
- numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP);
- bool NodeIsUniqueLoadOrStore = mayInstNodeLoadOrStore(N, CGP) &&
- NumNodesThatLoadOrStore == 1;
+ numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP);
+ bool NodeIsUniqueLoadOrStore =
+ mayInstNodeLoadOrStore(N, CGP) && NumNodesThatLoadOrStore == 1;
NodeHasMemRefs =
- NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) ||
- NumNodesThatLoadOrStore != 1));
+ NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) ||
+ NumNodesThatLoadOrStore != 1));
}
// Determine whether we need to attach a chain to this node.
bool NodeHasChain = false;
- if (Pattern.getSrcPattern()->TreeHasProperty(SDNPHasChain, CGP)) {
+ if (Pattern.getSrcPattern().TreeHasProperty(SDNPHasChain, CGP)) {
// For some instructions, we were able to infer from the pattern whether
// they should have a chain. Otherwise, attach the chain to the root.
//
@@ -982,15 +975,15 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// The non-chain and non-glue results of the newly emitted node get recorded.
for (unsigned i = 0, e = ResultVTs.size(); i != e; ++i) {
- if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue) break;
+ if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue)
+ break;
OutputOps.push_back(NextRecordedOperandNo++);
}
}
-void MatcherGen::
-EmitResultSDNodeXFormAsOperand(const TreePatternNode *N,
- SmallVectorImpl<unsigned> &ResultOps) {
- assert(N->getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?");
+void MatcherGen::EmitResultSDNodeXFormAsOperand(
+ const TreePatternNode &N, SmallVectorImpl<unsigned> &ResultOps) {
+ assert(N.getOperator()->isSubClassOf("SDNodeXForm") && "Not SDNodeXForm?");
// Emit the operand.
SmallVector<unsigned, 8> InputOps;
@@ -998,31 +991,31 @@ EmitResultSDNodeXFormAsOperand(const TreePatternNode *N,
// FIXME2: Could easily generalize this to support multiple inputs and outputs
// to the SDNodeXForm. For now we just support one input and one output like
// the old instruction selector.
- assert(N->getNumChildren() == 1);
- EmitResultOperand(N->getChild(0), InputOps);
+ assert(N.getNumChildren() == 1);
+ EmitResultOperand(N.getChild(0), InputOps);
// The input currently must have produced exactly one result.
assert(InputOps.size() == 1 && "Unexpected input to SDNodeXForm");
- AddMatcher(new EmitNodeXFormMatcher(InputOps[0], N->getOperator()));
+ AddMatcher(new EmitNodeXFormMatcher(InputOps[0], N.getOperator()));
ResultOps.push_back(NextRecordedOperandNo++);
}
-void MatcherGen::EmitResultOperand(const TreePatternNode *N,
+void MatcherGen::EmitResultOperand(const TreePatternNode &N,
SmallVectorImpl<unsigned> &ResultOps) {
// This is something selected from the pattern we matched.
- if (!N->getName().empty())
+ if (!N.getName().empty())
return EmitResultOfNamedOperand(N, ResultOps);
- if (N->isLeaf())
+ if (N.isLeaf())
return EmitResultLeafAsOperand(N, ResultOps);
- Record *OpRec = N->getOperator();
+ Record *OpRec = N.getOperator();
if (OpRec->isSubClassOf("Instruction"))
return EmitResultInstructionAsOperand(N, ResultOps);
if (OpRec->isSubClassOf("SDNodeXForm"))
return EmitResultSDNodeXFormAsOperand(N, ResultOps);
- errs() << "Unknown result node to emit code for: " << *N << '\n';
+ errs() << "Unknown result node to emit code for: " << N << '\n';
PrintFatalError("Unknown node in result pattern!");
}
@@ -1043,24 +1036,25 @@ void MatcherGen::EmitResultCode() {
// just lop them off. This doesn't need to worry about glue or chains, just
// explicit results.
//
- unsigned NumSrcResults = Pattern.getSrcPattern()->getNumTypes();
+ unsigned NumSrcResults = Pattern.getSrcPattern().getNumTypes();
// If the pattern also has (implicit) results, count them as well.
if (!Pattern.getDstRegs().empty()) {
// If the root came from an implicit def in the instruction handling stuff,
// don't re-add it.
Record *HandledReg = nullptr;
- const TreePatternNode *DstPat = Pattern.getDstPattern();
- if (!DstPat->isLeaf() &&DstPat->getOperator()->isSubClassOf("Instruction")){
+ const TreePatternNode &DstPat = Pattern.getDstPattern();
+ if (!DstPat.isLeaf() && DstPat.getOperator()->isSubClassOf("Instruction")) {
const CodeGenTarget &CGT = CGP.getTargetInfo();
- CodeGenInstruction &II = CGT.getInstruction(DstPat->getOperator());
+ CodeGenInstruction &II = CGT.getInstruction(DstPat.getOperator());
if (II.HasOneImplicitDefWithKnownVT(CGT) != MVT::Other)
HandledReg = II.ImplicitDefs[0];
}
for (Record *Reg : Pattern.getDstRegs()) {
- if (!Reg->isSubClassOf("Register") || Reg == HandledReg) continue;
+ if (!Reg->isSubClassOf("Register") || Reg == HandledReg)
+ continue;
++NumSrcResults;
}
}
@@ -1068,16 +1062,15 @@ void MatcherGen::EmitResultCode() {
SmallVector<unsigned, 8> Results(Ops);
// Apply result permutation.
- for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern()->getNumResults();
+ for (unsigned ResNo = 0; ResNo < Pattern.getDstPattern().getNumResults();
++ResNo) {
- Results[ResNo] = Ops[Pattern.getDstPattern()->getResultIndex(ResNo)];
+ Results[ResNo] = Ops[Pattern.getDstPattern().getResultIndex(ResNo)];
}
Results.resize(NumSrcResults);
AddMatcher(new CompleteMatchMatcher(Results, Pattern));
}
-
/// ConvertPatternToMatcher - Create the matcher for the specified pattern with
/// the specified variant. If the variant number is invalid, this returns null.
Matcher *llvm::ConvertPatternToMatcher(const PatternToMatch &Pattern,
diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp
index c4c25dc1a5fd..f786d41c88d7 100644
--- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp
@@ -99,7 +99,7 @@ static void ContractNodes(std::unique_ptr<Matcher> &MatcherPtr,
const PatternToMatch &Pattern = CM->getPattern();
if (!EN->hasChain() &&
- Pattern.getSrcPattern()->NodeHasProperty(SDNPHasChain, CGP))
+ Pattern.getSrcPattern().NodeHasProperty(SDNPHasChain, CGP))
ResultsMatch = false;
// If the matched node has glue and the output root doesn't, we can't
@@ -109,7 +109,7 @@ static void ContractNodes(std::unique_ptr<Matcher> &MatcherPtr,
// because the code in the pattern generator doesn't handle it right. We
// do it anyway for thoroughness.
if (!EN->hasOutGlue() &&
- Pattern.getSrcPattern()->NodeHasProperty(SDNPOutGlue, CGP))
+ Pattern.getSrcPattern().NodeHasProperty(SDNPOutGlue, CGP))
ResultsMatch = false;
#if 0
@@ -311,10 +311,9 @@ static void FactorNodes(std::unique_ptr<Matcher> &InputMatcherPtr) {
// Don't print if it's obvious nothing extract could be merged anyway.
std::next(J) != E) {
LLVM_DEBUG(errs() << "Couldn't merge this:\n"; Optn->print(errs(), 4);
- errs() << "into this:\n";
- (*J)->print(errs(), 4);
+ errs() << "into this:\n"; (*J)->print(errs(), 4);
(*std::next(J))->printOne(errs());
- if (std::next(J, 2) != E) (*std::next(J, 2))->printOne(errs());
+ if (std::next(J, 2) != E)(*std::next(J, 2))->printOne(errs());
errs() << "\n");
}
diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp
index 54ad81cbebe8..0d22ad251fba 100644
--- a/llvm/utils/TableGen/DFAEmitter.cpp
+++ b/llvm/utils/TableGen/DFAEmitter.cpp
@@ -147,8 +147,8 @@ void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
OS << "// The initial state is 1, not zero.\n";
- OS << "const std::array<" << Name << "Transition, "
- << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
+ OS << "const std::array<" << Name << "Transition, " << DfaTransitions.size()
+ << "> " << Name << "Transitions = {{\n";
for (auto &KV : DfaTransitions) {
dfa_state_type From = KV.first.first;
dfa_state_type To = KV.second.first;
@@ -284,7 +284,7 @@ void Automaton::emit(raw_ostream &OS) {
}
LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size()
<< " states with " << NumTransitions << " transitions.\n");
- (void) NumTransitions;
+ (void)NumTransitions;
const auto &ActionTypes = Transitions.back().getTypes();
OS << "// The type of an action in the " << Name << " automaton.\n";
@@ -346,9 +346,7 @@ bool Transition::canTransitionFrom(uint64_t State) {
return false;
}
-uint64_t Transition::transitionFrom(uint64_t State) {
- return State | NewState;
-}
+uint64_t Transition::transitionFrom(uint64_t State) { return State | NewState; }
void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
index 64c7884616a5..26ea1846ffae 100644
--- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -72,8 +72,7 @@ public:
DFAPacketizerEmitter(RecordKeeper &R);
// Construct a map of function unit names to bits.
- int collectAllFuncUnits(
- ArrayRef<const CodeGenProcModel *> ProcModels);
+ int collectAllFuncUnits(ArrayRef<const CodeGenProcModel *> ProcModels);
// Construct a map from a combo function unit bit to the bits of all included
// functional units.
@@ -129,7 +128,8 @@ int DFAPacketizerEmitter::collectAllFuncUnits(
return totalFUs;
}
-int DFAPacketizerEmitter::collectAllComboFuncs(ArrayRef<Record *> ComboFuncList) {
+int DFAPacketizerEmitter::collectAllComboFuncs(
+ ArrayRef<Record *> ComboFuncList) {
LLVM_DEBUG(dbgs() << "-------------------------------------------------------"
"----------------------\n");
LLVM_DEBUG(dbgs() << "collectAllComboFuncs");
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index 475a57a0cadf..3378a904ac40 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -30,7 +30,7 @@ struct DXILShaderModel {
int Minor = 0;
};
-struct DXILParam {
+struct DXILParameter {
int Pos; // position in parameter list
ParameterKind Kind;
StringRef Name; // short, unique name
@@ -38,28 +38,27 @@ struct DXILParam {
bool IsConst; // whether this argument requires a constant value in the IR
StringRef EnumName; // the name of the enum type if applicable
int MaxValue; // the maximum value for this parameter if applicable
- DXILParam(const Record *R);
+ DXILParameter(const Record *R);
};
-struct DXILOperationData {
- StringRef Name; // short, unique name
-
- StringRef DXILOp; // name of DXIL operation
- int DXILOpID; // ID of DXIL operation
- StringRef DXILClass; // name of the opcode class
- StringRef Category; // classification for this instruction
- StringRef Doc; // the documentation description of this instruction
-
- SmallVector<DXILParam> Params; // the operands that this instruction takes
- StringRef OverloadTypes; // overload types if applicable
- StringRef FnAttr; // attribute shorthands: rn=does not access
- // memory,ro=only reads from memory
- StringRef Intrinsic; // The llvm intrinsic map to DXILOp. Default is "" which
- // means no map exist
- bool IsDeriv = false; // whether this is some kind of derivative
+struct DXILOperationDesc {
+ StringRef OpName; // name of DXIL operation
+ int OpCode; // ID of DXIL operation
+ StringRef OpClass; // name of the opcode class
+ StringRef Category; // classification for this instruction
+ StringRef Doc; // the documentation description of this instruction
+
+ SmallVector<DXILParameter> Params; // the operands that this instruction takes
+ SmallVector<ParameterKind> OverloadTypes; // overload types if applicable
+ StringRef Attr; // operation attribute; reference to string representation
+ // of llvm::Attribute::AttrKind
+ StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which
+ // means no map exists
+ bool IsDeriv = false; // whether this is some kind of derivative
bool IsGradient = false; // whether this requires a gradient calculation
bool IsFeedback = false; // whether this is a sampler feedback op
- bool IsWave = false; // whether this requires in-wave, cross-lane functionality
+ bool IsWave =
+ false; // whether this requires in-wave, cross-lane functionality
bool RequiresUniformInputs = false; // whether this operation requires that
// all of its inputs are uniform across
// the wave
@@ -71,38 +70,32 @@ struct DXILOperationData {
int OverloadParamIndex; // parameter index which control the overload.
// When < 0, should be only 1 overload type.
SmallVector<StringRef, 4> counters; // counters for this inst.
- DXILOperationData(const Record *R) {
- Name = R->getValueAsString("Name");
- DXILOp = R->getValueAsString("OpName");
- DXILOpID = R->getValueAsInt("OpCode");
- DXILClass = R->getValueAsDef("OpClass")->getValueAsString("Name");
- Category = R->getValueAsDef("OpCategory")->getValueAsString("Name");
-
- if (R->getValue("llvm_intrinsic")) {
- auto *IntrinsicDef = R->getValueAsDef("llvm_intrinsic");
- auto DefName = IntrinsicDef->getName();
- assert(DefName.starts_with("int_") && "invalid intrinsic name");
- // Remove the int_ from intrinsic name.
- Intrinsic = DefName.substr(4);
- }
-
- Doc = R->getValueAsString("Doc");
-
- ListInit *ParamList = R->getValueAsListInit("Params");
- OverloadParamIndex = -1;
- for (unsigned I = 0; I < ParamList->size(); ++I) {
- Record *Param = ParamList->getElementAsRecord(I);
- Params.emplace_back(DXILParam(Param));
- auto &CurParam = Params.back();
- if (CurParam.Kind >= ParameterKind::OVERLOAD)
- OverloadParamIndex = I;
- }
- OverloadTypes = R->getValueAsString("OverloadTypes");
- FnAttr = R->getValueAsString("Attributes");
- }
+ DXILOperationDesc(const Record *);
};
} // end anonymous namespace
+// Convert DXIL type name string to dxil::ParameterKind
+//
+// @param typeNameStr Type name string
+// @return ParameterKind as defined in llvm/Support/DXILABI.h
+static ParameterKind getDXILTypeNameToKind(StringRef typeNameStr) {
+ return StringSwitch<ParameterKind>(typeNameStr)
+ .Case("voidTy", ParameterKind::VOID)
+ .Case("f16Ty", ParameterKind::HALF)
+ .Case("f32Ty", ParameterKind::FLOAT)
+ .Case("f64Ty", ParameterKind::DOUBLE)
+ .Case("i1Ty", ParameterKind::I1)
+ .Case("i8Ty", ParameterKind::I8)
+ .Case("i16Ty", ParameterKind::I16)
+ .Case("i32Ty", ParameterKind::I32)
+ .Case("i64Ty", ParameterKind::I64)
+ .Case("overloadTy", ParameterKind::OVERLOAD)
+ .Case("handleTy", ParameterKind::DXIL_HANDLE)
+ .Case("cbufferRetTy", ParameterKind::CBUFFER_RET)
+ .Case("resourceRetTy", ParameterKind::RESOURCE_RET)
+ .Default(ParameterKind::INVALID);
+}
+
static ParameterKind parameterTypeNameToKind(StringRef Name) {
return StringSwitch<ParameterKind>(Name)
.Case("void", ParameterKind::VOID)
@@ -121,10 +114,44 @@ static ParameterKind parameterTypeNameToKind(StringRef Name) {
.Default(ParameterKind::INVALID);
}
-DXILParam::DXILParam(const Record *R) {
+DXILOperationDesc::DXILOperationDesc(const Record *R) {
+ OpName = R->getValueAsString("OpName");
+ OpCode = R->getValueAsInt("OpCode");
+ OpClass = R->getValueAsDef("OpClass")->getValueAsString("Name");
+ Category = R->getValueAsDef("OpCategory")->getValueAsString("Name");
+
+ if (R->getValue("llvm_intrinsic")) {
+ auto *IntrinsicDef = R->getValueAsDef("llvm_intrinsic");
+ auto DefName = IntrinsicDef->getName();
+ assert(DefName.starts_with("int_") && "invalid intrinsic name");
+ // Remove the int_ from intrinsic name.
+ Intrinsic = DefName.substr(4);
+ }
+
+ Doc = R->getValueAsString("Doc");
+
+ ListInit *ParamList = R->getValueAsListInit("Params");
+ OverloadParamIndex = -1;
+ for (unsigned I = 0; I < ParamList->size(); ++I) {
+ Record *Param = ParamList->getElementAsRecord(I);
+ Params.emplace_back(DXILParameter(Param));
+ auto &CurParam = Params.back();
+ if (CurParam.Kind >= ParameterKind::OVERLOAD)
+ OverloadParamIndex = I;
+ }
+ ListInit *OverloadTypeList = R->getValueAsListInit("OverloadTypes");
+
+ for (unsigned I = 0; I < OverloadTypeList->size(); ++I) {
+ Record *R = OverloadTypeList->getElementAsRecord(I);
+ OverloadTypes.emplace_back(getDXILTypeNameToKind(R->getNameInitAsString()));
+ }
+ Attr = StringRef(R->getValue("Attribute")->getNameInitAsString());
+}
+
+DXILParameter::DXILParameter(const Record *R) {
Name = R->getValueAsString("Name");
Pos = R->getValueAsInt("Pos");
- Kind = parameterTypeNameToKind(R->getValueAsString("LLVMType"));
+ Kind = parameterTypeNameToKind(R->getValueAsString("Type"));
if (R->getValue("Doc"))
Doc = R->getValueAsString("Doc");
IsConst = R->getValueAsBit("IsConstant");
@@ -166,10 +193,9 @@ static std::string parameterKindToString(ParameterKind Kind) {
llvm_unreachable("Unknown llvm::dxil::ParameterKind enum");
}
-static void emitDXILOpEnum(DXILOperationData &DXILOp, raw_ostream &OS) {
+static void emitDXILOpEnum(DXILOperationDesc &Op, raw_ostream &OS) {
// Name = ID, // Doc
- OS << DXILOp.Name << " = " << DXILOp.DXILOpID << ", // " << DXILOp.Doc
- << "\n";
+ OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n";
}
static std::string buildCategoryStr(StringSet<> &Cetegorys) {
@@ -182,14 +208,14 @@ static std::string buildCategoryStr(StringSet<> &Cetegorys) {
}
// Emit enum declaration for DXIL.
-static void emitDXILEnums(std::vector<DXILOperationData> &DXILOps,
+static void emitDXILEnums(std::vector<DXILOperationDesc> &Ops,
raw_ostream &OS) {
// Sort by Category + OpName.
- llvm::sort(DXILOps, [](DXILOperationData &A, DXILOperationData &B) {
+ llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
// Group by Category first.
if (A.Category == B.Category)
// Inside same Category, order by OpName.
- return A.DXILOp < B.DXILOp;
+ return A.OpName < B.OpName;
else
return A.Category < B.Category;
});
@@ -199,18 +225,18 @@ static void emitDXILEnums(std::vector<DXILOperationData> &DXILOps,
StringMap<StringSet<>> ClassMap;
StringRef PrevCategory = "";
- for (auto &DXILOp : DXILOps) {
- StringRef Category = DXILOp.Category;
+ for (auto &Op : Ops) {
+ StringRef Category = Op.Category;
if (Category != PrevCategory) {
OS << "\n// " << Category << "\n";
PrevCategory = Category;
}
- emitDXILOpEnum(DXILOp, OS);
- auto It = ClassMap.find(DXILOp.DXILClass);
+ emitDXILOpEnum(Op, OS);
+ auto It = ClassMap.find(Op.OpClass);
if (It != ClassMap.end()) {
- It->second.insert(DXILOp.Category);
+ It->second.insert(Op.Category);
} else {
- ClassMap[DXILOp.DXILClass].insert(DXILOp.Category);
+ ClassMap[Op.OpClass].insert(Op.Category);
}
}
@@ -253,55 +279,68 @@ static void emitDXILEnums(std::vector<DXILOperationData> &DXILOps,
}
// Emit map from llvm intrinsic to DXIL operation.
-static void emitDXILIntrinsicMap(std::vector<DXILOperationData> &DXILOps,
+static void emitDXILIntrinsicMap(std::vector<DXILOperationDesc> &Ops,
raw_ostream &OS) {
OS << "\n";
// FIXME: use array instead of SmallDenseMap.
OS << "static const SmallDenseMap<Intrinsic::ID, dxil::OpCode> LowerMap = "
"{\n";
- for (auto &DXILOp : DXILOps) {
- if (DXILOp.Intrinsic.empty())
+ for (auto &Op : Ops) {
+ if (Op.Intrinsic.empty())
continue;
// {Intrinsic::sin, dxil::OpCode::Sin},
- OS << " { Intrinsic::" << DXILOp.Intrinsic
- << ", dxil::OpCode::" << DXILOp.DXILOp << "},\n";
+ OS << " { Intrinsic::" << Op.Intrinsic << ", dxil::OpCode::" << Op.OpName
+ << "},\n";
}
OS << "};\n";
OS << "\n";
}
-static std::string emitDXILOperationFnAttr(StringRef FnAttr) {
- return StringSwitch<std::string>(FnAttr)
- .Case("rn", "Attribute::ReadNone")
- .Case("ro", "Attribute::ReadOnly")
+// Convert operation attribute string to Attribute enum
+//
+// @param Attr string reference
+// @return std::string Attribute enum string
+static std::string emitDXILOperationAttr(StringRef Attr) {
+ return StringSwitch<std::string>(Attr)
+ .Case("ReadNone", "Attribute::ReadNone")
+ .Case("ReadOnly", "Attribute::ReadOnly")
.Default("Attribute::None");
}
-static std::string getOverloadKind(StringRef Overload) {
- return StringSwitch<std::string>(Overload)
- .Case("half", "OverloadKind::HALF")
- .Case("float", "OverloadKind::FLOAT")
- .Case("double", "OverloadKind::DOUBLE")
- .Case("i1", "OverloadKind::I1")
- .Case("i16", "OverloadKind::I16")
- .Case("i32", "OverloadKind::I32")
- .Case("i64", "OverloadKind::I64")
- .Case("udt", "OverloadKind::UserDefineType")
- .Case("obj", "OverloadKind::ObjectType")
- .Default("OverloadKind::VOID");
+static std::string overloadKindStr(ParameterKind Overload) {
+ switch (Overload) {
+ case ParameterKind::HALF:
+ return "OverloadKind::HALF";
+ case ParameterKind::FLOAT:
+ return "OverloadKind::FLOAT";
+ case ParameterKind::DOUBLE:
+ return "OverloadKind::DOUBLE";
+ case ParameterKind::I1:
+ return "OverloadKind::I1";
+ case ParameterKind::I8:
+ return "OverloadKind::I8";
+ case ParameterKind::I16:
+ return "OverloadKind::I16";
+ case ParameterKind::I32:
+ return "OverloadKind::I32";
+ case ParameterKind::I64:
+ return "OverloadKind::I64";
+ case ParameterKind::VOID:
+ return "OverloadKind::VOID";
+ default:
+ return "OverloadKind::UNKNOWN";
+ }
}
-static std::string getDXILOperationOverload(StringRef Overloads) {
- SmallVector<StringRef> OverloadStrs;
- Overloads.split(OverloadStrs, ';', /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+static std::string
+getDXILOperationOverloads(SmallVector<ParameterKind> Overloads) {
// Format is: OverloadKind::FLOAT | OverloadKind::HALF
- assert(!OverloadStrs.empty() && "Invalid overloads");
- auto It = OverloadStrs.begin();
+ auto It = Overloads.begin();
std::string Result;
raw_string_ostream OS(Result);
- OS << getOverloadKind(*It);
- for (++It; It != OverloadStrs.end(); ++It) {
- OS << " | " << getOverloadKind(*It);
+ OS << overloadKindStr(*It);
+ for (++It; It != Overloads.end(); ++It) {
+ OS << " | " << overloadKindStr(*It);
}
return OS.str();
}
@@ -315,20 +354,20 @@ static std::string lowerFirstLetter(StringRef Name) {
return LowerName;
}
-static std::string getDXILOpClassName(StringRef DXILOpClass) {
+static std::string getDXILOpClassName(StringRef OpClass) {
// Lower first letter expect for special case.
- return StringSwitch<std::string>(DXILOpClass)
+ return StringSwitch<std::string>(OpClass)
.Case("CBufferLoad", "cbufferLoad")
.Case("CBufferLoadLegacy", "cbufferLoadLegacy")
.Case("GSInstanceID", "gsInstanceID")
- .Default(lowerFirstLetter(DXILOpClass));
+ .Default(lowerFirstLetter(OpClass));
}
-static void emitDXILOperationTable(std::vector<DXILOperationData> &DXILOps,
+static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
raw_ostream &OS) {
- // Sort by DXILOpID.
- llvm::sort(DXILOps, [](DXILOperationData &A, DXILOperationData &B) {
- return A.DXILOpID < B.DXILOpID;
+ // Sort by OpCode.
+ llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
+ return A.OpCode < B.OpCode;
});
// Collect Names.
@@ -338,18 +377,18 @@ static void emitDXILOperationTable(std::vector<DXILOperationData> &DXILOps,
StringMap<SmallVector<ParameterKind>> ParameterMap;
StringSet<> ClassSet;
- for (auto &DXILOp : DXILOps) {
- OpStrings.add(DXILOp.DXILOp.str());
+ for (auto &Op : Ops) {
+ OpStrings.add(Op.OpName.str());
- if (ClassSet.contains(DXILOp.DXILClass))
+ if (ClassSet.contains(Op.OpClass))
continue;
- ClassSet.insert(DXILOp.DXILClass);
- OpClassStrings.add(getDXILOpClassName(DXILOp.DXILClass));
+ ClassSet.insert(Op.OpClass);
+ OpClassStrings.add(getDXILOpClassName(Op.OpClass));
SmallVector<ParameterKind> ParamKindVec;
- for (auto &Param : DXILOp.Params) {
+ for (auto &Param : Op.Params) {
ParamKindVec.emplace_back(Param.Kind);
}
- ParameterMap[DXILOp.DXILClass] = ParamKindVec;
+ ParameterMap[Op.OpClass] = ParamKindVec;
Parameters.add(ParamKindVec);
}
@@ -363,26 +402,25 @@ static void emitDXILOperationTable(std::vector<DXILOperationData> &DXILOps,
// OpCodeClassNameIndex,
// OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone, 0,
// 3, ParameterTableOffset},
- OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode DXILOp) "
+ OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode Op) "
"{\n";
OS << " static const OpCodeProperty OpCodeProps[] = {\n";
- for (auto &DXILOp : DXILOps) {
- OS << " { dxil::OpCode::" << DXILOp.DXILOp << ", "
- << OpStrings.get(DXILOp.DXILOp.str())
- << ", OpCodeClass::" << DXILOp.DXILClass << ", "
- << OpClassStrings.get(getDXILOpClassName(DXILOp.DXILClass)) << ", "
- << getDXILOperationOverload(DXILOp.OverloadTypes) << ", "
- << emitDXILOperationFnAttr(DXILOp.FnAttr) << ", "
- << DXILOp.OverloadParamIndex << ", " << DXILOp.Params.size() << ", "
- << Parameters.get(ParameterMap[DXILOp.DXILClass]) << " },\n";
+ for (auto &Op : Ops) {
+ OS << " { dxil::OpCode::" << Op.OpName << ", "
+ << OpStrings.get(Op.OpName.str()) << ", OpCodeClass::" << Op.OpClass
+ << ", " << OpClassStrings.get(getDXILOpClassName(Op.OpClass)) << ", "
+ << getDXILOperationOverloads(Op.OverloadTypes) << ", "
+ << emitDXILOperationAttr(Op.Attr) << ", " << Op.OverloadParamIndex
+ << ", " << Op.Params.size() << ", "
+ << Parameters.get(ParameterMap[Op.OpClass]) << " },\n";
}
OS << " };\n";
OS << " // FIXME: change search to indexing with\n";
- OS << " // DXILOp once all DXIL op is added.\n";
+ OS << " // Op once all DXIL operations are added.\n";
OS << " OpCodeProperty TmpProp;\n";
- OS << " TmpProp.OpCode = DXILOp;\n";
+ OS << " TmpProp.OpCode = Op;\n";
OS << " const OpCodeProperty *Prop =\n";
OS << " llvm::lower_bound(OpCodeProps, TmpProp,\n";
OS << " [](const OpCodeProperty &A, const "
@@ -394,12 +432,12 @@ static void emitDXILOperationTable(std::vector<DXILOperationData> &DXILOps,
OS << "}\n\n";
// Emit the string tables.
- OS << "static const char *getOpCodeName(dxil::OpCode DXILOp) {\n\n";
+ OS << "static const char *getOpCodeName(dxil::OpCode Op) {\n\n";
OpStrings.emitStringLiteralDef(OS,
" static const char DXILOpCodeNameTable[]");
- OS << " auto *Prop = getOpCodeProperty(DXILOp);\n";
+ OS << " auto *Prop = getOpCodeProperty(Op);\n";
OS << " unsigned Index = Prop->OpCodeNameOffset;\n";
OS << " return DXILOpCodeNameTable + Index;\n";
OS << "}\n\n";
@@ -431,14 +469,14 @@ static void emitDXILOperationTable(std::vector<DXILOperationData> &DXILOps,
}
static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) {
- std::vector<Record *> Ops = Records.getAllDerivedDefinitions("DxilOperation");
+ std::vector<Record *> Ops = Records.getAllDerivedDefinitions("DXILOperation");
OS << "// Generated code, do not edit.\n";
OS << "\n";
- std::vector<DXILOperationData> DXILOps;
+ std::vector<DXILOperationDesc> DXILOps;
DXILOps.reserve(Ops.size());
for (auto *Record : Ops) {
- DXILOps.emplace_back(DXILOperationData(Record));
+ DXILOps.emplace_back(DXILOperationDesc(Record));
}
OS << "#ifdef DXIL_OP_ENUM\n";
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 591ee5c72887..2f28ccbe6d6c 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -53,7 +53,8 @@ using namespace llvm;
namespace {
STATISTIC(NumEncodings, "Number of encodings considered");
-STATISTIC(NumEncodingsLackingDisasm, "Number of encodings without disassembler info");
+STATISTIC(NumEncodingsLackingDisasm,
+ "Number of encodings without disassembler info");
STATISTIC(NumInstructions, "Number of instructions considered");
STATISTIC(NumEncodingsSupported, "Number of encodings supported");
STATISTIC(NumEncodingsOmitted, "Number of encodings omitted");
@@ -61,7 +62,7 @@ STATISTIC(NumEncodingsOmitted, "Number of encodings omitted");
struct EncodingField {
unsigned Base, Width, Offset;
EncodingField(unsigned B, unsigned W, unsigned O)
- : Base(B), Width(W), Offset(O) { }
+ : Base(B), Width(W), Offset(O) {}
};
struct OperandInfo {
@@ -82,7 +83,7 @@ struct OperandInfo {
typedef std::vector<EncodingField>::const_iterator const_iterator;
const_iterator begin() const { return Fields.begin(); }
- const_iterator end() const { return Fields.end(); }
+ const_iterator end() const { return Fields.end(); }
};
typedef std::vector<uint8_t> DecoderTable;
@@ -141,8 +142,7 @@ public:
void emitPredicateFunction(formatted_raw_ostream &OS,
PredicateSet &Predicates,
unsigned Indentation) const;
- void emitDecoderFunction(formatted_raw_ostream &OS,
- DecoderSet &Decoders,
+ void emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders,
unsigned Indentation) const;
// run - Output the code emitter
@@ -173,9 +173,7 @@ static bool ValueSet(bit_value_t V) {
return (V == BIT_TRUE || V == BIT_FALSE);
}
-static bool ValueNotSet(bit_value_t V) {
- return (V == BIT_UNSET);
-}
+static bool ValueNotSet(bit_value_t V) { return (V == BIT_UNSET); }
static int Value(bit_value_t V) {
return ValueNotSet(V) ? -1 : (V == BIT_FALSE ? 0 : 1);
@@ -280,14 +278,14 @@ class FilterChooser;
/// version and return the Opcode since the two have the same Asm format string.
class Filter {
protected:
- const FilterChooser *Owner;// points to the FilterChooser who owns this filter
+ const FilterChooser
+ *Owner; // points to the FilterChooser who owns this filter
unsigned StartBit; // the starting bit position
- unsigned NumBits; // number of bits to filter
- bool Mixed; // a mixed region contains both set and unset bits
+ unsigned NumBits; // number of bits to filter
+ bool Mixed; // a mixed region contains both set and unset bits
// Map of well-known segment value to the set of uid's with that value.
- std::map<uint64_t, std::vector<EncodingIDAndOpcode>>
- FilteredInstructions;
+ std::map<uint64_t, std::vector<EncodingIDAndOpcode>> FilteredInstructions;
// Set of uid's with non-constant segment values.
std::vector<EncodingIDAndOpcode> VariableInstructions;
@@ -471,7 +469,7 @@ protected:
/// dumpFilterArray - dumpFilterArray prints out debugging info for the given
/// filter array as a series of chars.
void dumpFilterArray(raw_ostream &o,
- const std::vector<bit_value_t> & filter) const;
+ const std::vector<bit_value_t> &filter) const;
/// dumpStack - dumpStack traverses the filter chooser chain and calls
/// dumpFilterArray on each filter chooser up to the top level one.
@@ -504,11 +502,9 @@ protected:
bool doesOpcodeNeedPredicate(unsigned Opc) const;
unsigned getPredicateIndex(DecoderTableInfo &TableInfo, StringRef P) const;
- void emitPredicateTableEntry(DecoderTableInfo &TableInfo,
- unsigned Opc) const;
+ void emitPredicateTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const;
- void emitSoftFailTableEntry(DecoderTableInfo &TableInfo,
- unsigned Opc) const;
+ void emitSoftFailTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const;
// Emits table entries to decode the singleton.
void emitSingletonTableEntry(DecoderTableInfo &TableInfo,
@@ -560,16 +556,15 @@ public:
///////////////////////////
Filter::Filter(Filter &&f)
- : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed),
- FilteredInstructions(std::move(f.FilteredInstructions)),
- VariableInstructions(std::move(f.VariableInstructions)),
- FilterChooserMap(std::move(f.FilterChooserMap)), NumFiltered(f.NumFiltered),
- LastOpcFiltered(f.LastOpcFiltered) {
-}
+ : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed),
+ FilteredInstructions(std::move(f.FilteredInstructions)),
+ VariableInstructions(std::move(f.VariableInstructions)),
+ FilterChooserMap(std::move(f.FilterChooserMap)),
+ NumFiltered(f.NumFiltered), LastOpcFiltered(f.LastOpcFiltered) {}
Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits,
bool mixed)
- : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) {
+ : Owner(&owner), StartBit(startBit), NumBits(numBits), Mixed(mixed) {
assert(StartBit + NumBits - 1 < Owner->BitWidth);
NumFiltered = 0;
@@ -598,8 +593,8 @@ Filter::Filter(FilterChooser &owner, unsigned startBit, unsigned numBits,
}
}
- assert((FilteredInstructions.size() + VariableInstructions.size() > 0)
- && "Filter returns no instruction categories");
+ assert((FilteredInstructions.size() + VariableInstructions.size() > 0) &&
+ "Filter returns no instruction categories");
}
// Divides the decoding task into sub tasks and delegates them to the
@@ -619,9 +614,11 @@ void Filter::recurse() {
// Delegates to an inferior filter chooser for further processing on this
// group of instructions whose segment values are variable.
- FilterChooserMap.insert(std::make_pair(NO_FIXED_SEGMENTS_SENTINEL,
+ FilterChooserMap.insert(std::make_pair(
+ NO_FIXED_SEGMENTS_SENTINEL,
std::make_unique<FilterChooser>(Owner->AllInstructions,
- VariableInstructions, Owner->Operands, BitValueArray, *Owner)));
+ VariableInstructions, Owner->Operands,
+ BitValueArray, *Owner)));
}
// No need to recurse for a singleton filtered instruction.
@@ -646,8 +643,8 @@ void Filter::recurse() {
// category of instructions.
FilterChooserMap.insert(std::make_pair(
Inst.first, std::make_unique<FilterChooser>(
- Owner->AllInstructions, Inst.second,
- Owner->Operands, BitValueArray, *Owner)));
+ Owner->AllInstructions, Inst.second, Owner->Operands,
+ BitValueArray, *Owner)));
}
}
@@ -655,8 +652,7 @@ static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups,
uint32_t DestIdx) {
// Any NumToSkip fixups in the current scope can resolve to the
// current location.
- for (FixupList::const_reverse_iterator I = Fixups.rbegin(),
- E = Fixups.rend();
+ for (FixupList::const_reverse_iterator I = Fixups.rbegin(), E = Fixups.rend();
I != E; ++I) {
// Calculate the distance from the byte following the fixup entry byte
// to the destination. The Target is calculated from after the 16-bit
@@ -705,7 +701,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
// Resolve any NumToSkip fixups in the current scope.
resolveTableFixups(Table, CurScope, Table.size());
CurScope.clear();
- PrevFilter = 0; // Don't re-process the filter's fallthrough.
+ PrevFilter = 0; // Don't re-process the filter's fallthrough.
} else {
Table.push_back(MCD::OPC_FilterValue);
// Encode and emit the value to filter against.
@@ -731,7 +727,8 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const {
// two as to account for the width of the NumToSkip field itself.
if (PrevFilter) {
uint32_t NumToSkip = Table.size() - PrevFilter - 3;
- assert(NumToSkip < (1u << 24) && "disassembler decoding table too large!");
+ assert(NumToSkip < (1u << 24) &&
+ "disassembler decoding table too large!");
Table[PrevFilter] = (uint8_t)NumToSkip;
Table[PrevFilter + 1] = (uint8_t)(NumToSkip >> 8);
Table[PrevFilter + 2] = (uint8_t)(NumToSkip >> 16);
@@ -771,7 +768,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
unsigned Indentation, unsigned BitWidth,
StringRef Namespace) const {
OS.indent(Indentation) << "static const uint8_t DecoderTable" << Namespace
- << BitWidth << "[] = {\n";
+ << BitWidth << "[] = {\n";
Indentation += 2;
@@ -807,7 +804,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
DecoderTable::const_iterator I = Table.begin();
DecoderTable::const_iterator E = Table.end();
while (I != E) {
- assert (I < E && "incomplete decode table entry!");
+ assert(I < E && "incomplete decode table entry!");
uint64_t Pos = I - Table.begin();
OS << "/* " << Pos << " */";
@@ -884,8 +881,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
Table.data() + Table.size(), &ErrMsg);
assert(ErrMsg == nullptr && "ULEB128 value too large!");
- OS.indent(Indentation) << "MCD::OPC_" << (IsTry ? "Try" : "")
- << "Decode, ";
+ OS.indent(Indentation)
+ << "MCD::OPC_" << (IsTry ? "Try" : "") << "Decode, ";
I += emitULEB128(I, OS);
// Decoder index.
@@ -967,15 +964,16 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
// The predicate function is just a big switch statement based on the
// input predicate index.
OS.indent(Indentation) << "static bool checkDecoderPredicate(unsigned Idx, "
- << "const FeatureBitset &Bits) {\n";
+ << "const FeatureBitset &Bits) {\n";
Indentation += 2;
if (!Predicates.empty()) {
OS.indent(Indentation) << "switch (Idx) {\n";
- OS.indent(Indentation) << "default: llvm_unreachable(\"Invalid index!\");\n";
+ OS.indent(Indentation)
+ << "default: llvm_unreachable(\"Invalid index!\");\n";
unsigned Index = 0;
for (const auto &Predicate : Predicates) {
OS.indent(Indentation) << "case " << Index++ << ":\n";
- OS.indent(Indentation+2) << "return (" << Predicate << ");\n";
+ OS.indent(Indentation + 2) << "return (" << Predicate << ");\n";
}
OS.indent(Indentation) << "}\n";
} else {
@@ -993,7 +991,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
// input decoder index.
OS.indent(Indentation) << "template <typename InsnType>\n";
OS.indent(Indentation) << "static DecodeStatus decodeToMCInst(DecodeStatus S,"
- << " unsigned Idx, InsnType insn, MCInst &MI,\n";
+ << " unsigned Idx, InsnType insn, MCInst &MI,\n";
OS.indent(Indentation)
<< " uint64_t "
<< "Address, const MCDisassembler *Decoder, bool &DecodeComplete) {\n";
@@ -1012,7 +1010,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
for (const auto &Decoder : Decoders) {
OS.indent(Indentation) << "case " << Index++ << ":\n";
OS << Decoder;
- OS.indent(Indentation+2) << "return S;\n";
+ OS.indent(Indentation + 2) << "return S;\n";
}
OS.indent(Indentation) << "}\n";
Indentation -= 2;
@@ -1041,8 +1039,8 @@ bool FilterChooser::fieldFromInsn(uint64_t &Field, insn_t &Insn,
/// dumpFilterArray - dumpFilterArray prints out debugging info for the given
/// filter array as a series of chars.
-void FilterChooser::dumpFilterArray(raw_ostream &o,
- const std::vector<bit_value_t> &filter) const {
+void FilterChooser::dumpFilterArray(
+ raw_ostream &o, const std::vector<bit_value_t> &filter) const {
for (unsigned bitIndex = BitWidth; bitIndex > 0; bitIndex--) {
switch (filter[bitIndex - 1]) {
case BIT_UNFILTERED:
@@ -1096,7 +1094,8 @@ unsigned FilterChooser::getIslands(std::vector<unsigned> &StartBits,
int64_t Val = Value(Insn[i]);
bool Filtered = PositionFiltered(i);
switch (State) {
- default: llvm_unreachable("Unreachable code!");
+ default:
+ llvm_unreachable("Unreachable code!");
case 0:
case 1:
if (Filtered || Val == -1)
@@ -1197,8 +1196,7 @@ void FilterChooser::emitDecoder(raw_ostream &OS, unsigned Indentation,
}
}
-unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders,
- unsigned Opc,
+unsigned FilterChooser::getDecoderIndex(DecoderSet &Decoders, unsigned Opc,
bool &HasCompleteDecoder) const {
// Build up the predicate string.
SmallString<256> Decoder;
@@ -1343,7 +1341,8 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo,
const RecordVal *RV = AllInstructions[Opc].EncodingDef->getValue("SoftFail");
BitsInit *SFBits = RV ? dyn_cast<BitsInit>(RV->getValue()) : nullptr;
- if (!SFBits) return;
+ if (!SFBits)
+ return;
BitsInit *InstBits =
AllInstructions[Opc].EncodingDef->getValueAsBitsInit("Inst");
@@ -1353,7 +1352,8 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo,
bit_value_t B = bitFromBits(*SFBits, i);
bit_value_t IB = bitFromBits(*InstBits, i);
- if (B != BIT_TRUE) continue;
+ if (B != BIT_TRUE)
+ continue;
switch (IB) {
case BIT_FALSE:
@@ -1458,12 +1458,12 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo,
// decoder method indicates that additional processing should be done to see
// if there is any other instruction that also matches the bitpattern and
// can decode it.
- TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode :
- MCD::OPC_TryDecode);
+ TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode
+ : MCD::OPC_TryDecode);
NumEncodingsSupported++;
uint8_t Buffer[16], *p;
encodeULEB128(Opc.Opcode, Buffer);
- for (p = Buffer; *p >= 128 ; ++p)
+ for (p = Buffer; *p >= 128; ++p)
TableInfo.Table.push_back(*p);
TableInfo.Table.push_back(*p);
@@ -1825,8 +1825,8 @@ static std::string findOperandDecoderMethod(Record *Record) {
std::string Decoder;
RecordVal *DecoderString = Record->getValue("DecoderMethod");
- StringInit *String = DecoderString ?
- dyn_cast<StringInit>(DecoderString->getValue()) : nullptr;
+ StringInit *String =
+ DecoderString ? dyn_cast<StringInit>(DecoderString->getValue()) : nullptr;
if (String) {
Decoder = std::string(String->getValue());
if (!Decoder.empty())
@@ -1840,7 +1840,7 @@ static std::string findOperandDecoderMethod(Record *Record) {
Decoder = "Decode" + Record->getName().str() + "RegisterClass";
} else if (Record->isSubClassOf("PointerLikeRegClass")) {
Decoder = "DecodePointerLikeRegClass" +
- utostr(Record->getValueAsInt("RegClassKind"));
+ utostr(Record->getValueAsInt("RegClassKind"));
}
return Decoder;
@@ -1986,7 +1986,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
// of trying to auto-generate the decoder.
StringRef InstDecoder = EncodingDef.getValueAsString("DecoderMethod");
if (InstDecoder != "") {
- bool HasCompleteInstDecoder = EncodingDef.getValueAsBit("hasCompleteDecoder");
+ bool HasCompleteInstDecoder =
+ EncodingDef.getValueAsBit("hasCompleteDecoder");
InsnOperands.push_back(
OperandInfo(std::string(InstDecoder), HasCompleteInstDecoder));
Operands[Opc] = InsnOperands;
@@ -2000,9 +2001,9 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
// Gather the outputs/inputs of the instruction, so we can find their
// positions in the encoding. This assumes for now that they appear in the
// MCInst in the order that they're listed.
- std::vector<std::pair<Init*, StringRef>> InOutOperands;
- DagInit *Out = Def.getValueAsDag("OutOperandList");
- DagInit *In = Def.getValueAsDag("InOperandList");
+ std::vector<std::pair<Init *, StringRef>> InOutOperands;
+ DagInit *Out = Def.getValueAsDag("OutOperandList");
+ DagInit *In = Def.getValueAsDag("InOperandList");
for (unsigned i = 0; i < Out->getNumArgs(); ++i)
InOutOperands.push_back(
std::make_pair(Out->getArg(i), Out->getArgNameStr(i)));
@@ -2042,7 +2043,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
Init *OpInit = Op.first;
StringRef OpName = Op.second;
- // We're ready to find the instruction encoding locations for this operand.
+ // We're ready to find the instruction encoding locations for this
+ // operand.
// First, find the operand type ("OpInit"), and sub-op names
// ("SubArgDag") if present.
@@ -2056,7 +2058,8 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
? OpTypeRec->getValueAsDag("MIOperandInfo")
: nullptr;
- // Lookup the decoder method and construct a new OperandInfo to hold our result.
+ // Lookup the decoder method and construct a new OperandInfo to hold our
+ // result.
OperandInfo OpInfo = getOpInfo(OpTypeRec);
// If we have named sub-operands...
@@ -2490,7 +2493,8 @@ void DecoderEmitter::run(raw_ostream &o) {
NumberedEncodings.emplace_back(NumberedInstruction->TheDef,
NumberedInstruction, HwModeName);
}
- for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding"))
+ for (const auto &NumberedAlias :
+ RK.getAllDerivedDefinitions("AdditionalEncoding"))
NumberedEncodings.emplace_back(
NumberedAlias,
&Target.getInstruction(NumberedAlias->getValueAsDef("AliasOf")));
@@ -2551,8 +2555,8 @@ void DecoderEmitter::run(raw_ostream &o) {
DecoderTableInfo TableInfo;
for (const auto &Opc : OpcMap) {
// Emit the decoder for this namespace+width combination.
- ArrayRef<EncodingAndInst> NumberedEncodingsRef(
- NumberedEncodings.data(), NumberedEncodings.size());
+ ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(),
+ NumberedEncodings.size());
FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands,
IsVarLenInst ? MaxInstLen : 8 * Opc.first.second, this);
diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp
index 92f3721507e5..ae6a8ef22bc8 100644
--- a/llvm/utils/TableGen/DisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp
@@ -102,8 +102,8 @@ static void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
if (Target.getName() == "X86") {
DisassemblerTables Tables;
- ArrayRef<const CodeGenInstruction*> numberedInstructions =
- Target.getInstructionsByEnumValue();
+ ArrayRef<const CodeGenInstruction *> numberedInstructions =
+ Target.getInstructionsByEnumValue();
for (unsigned i = 0, e = numberedInstructions.size(); i != e; ++i)
RecognizableInstr::processInstr(Tables, *numberedInstructions[i], i);
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index b773a6b91ee2..00a165030d36 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -1,4 +1,4 @@
-///===- FastISelEmitter.cpp - Generate an instruction selector -------------===//
+///===- FastISelEmitter.cpp - Generate an instruction selector ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,7 +30,6 @@
#include <utility>
using namespace llvm;
-
/// InstructionMemo - This class holds additional information about an
/// instruction needed to emit code for it.
///
@@ -61,15 +60,15 @@ namespace {
class ImmPredicateSet {
DenseMap<TreePattern *, unsigned> ImmIDs;
std::vector<TreePredicateFn> PredsByName;
-public:
+public:
unsigned getIDFor(TreePredicateFn Pred) {
unsigned &Entry = ImmIDs[Pred.getOrigPatFragRecord()];
if (Entry == 0) {
PredsByName.push_back(Pred);
Entry = PredsByName.size();
}
- return Entry-1;
+ return Entry - 1;
}
const TreePredicateFn &getPredicate(unsigned i) {
@@ -80,7 +79,6 @@ public:
typedef std::vector<TreePredicateFn>::const_iterator iterator;
iterator begin() const { return PredsByName.begin(); }
iterator end() const { return PredsByName.end(); }
-
};
} // End anonymous namespace
@@ -92,26 +90,39 @@ struct OperandsSignature {
class OpKind {
enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 };
char Repr;
- public:
+ public:
OpKind() : Repr(OK_Invalid) {}
bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }
- static OpKind getReg() { OpKind K; K.Repr = OK_Reg; return K; }
- static OpKind getFP() { OpKind K; K.Repr = OK_FP; return K; }
+ static OpKind getReg() {
+ OpKind K;
+ K.Repr = OK_Reg;
+ return K;
+ }
+ static OpKind getFP() {
+ OpKind K;
+ K.Repr = OK_FP;
+ return K;
+ }
static OpKind getImm(unsigned V) {
- assert((unsigned)OK_Imm+V < 128 &&
+ assert((unsigned)OK_Imm + V < 128 &&
"Too many integer predicates for the 'Repr' char");
- OpKind K; K.Repr = OK_Imm+V; return K;
+ OpKind K;
+ K.Repr = OK_Imm + V;
+ return K;
}
bool isReg() const { return Repr == OK_Reg; }
- bool isFP() const { return Repr == OK_FP; }
+ bool isFP() const { return Repr == OK_FP; }
bool isImm() const { return Repr >= OK_Imm; }
- unsigned getImmCode() const { assert(isImm()); return Repr-OK_Imm; }
+ unsigned getImmCode() const {
+ assert(isImm());
+ return Repr - OK_Imm;
+ }
void printManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates,
bool StripImmCodes) const {
@@ -123,12 +134,11 @@ struct OperandsSignature {
OS << 'i';
if (!StripImmCodes)
if (unsigned Code = getImmCode())
- OS << "_" << ImmPredicates.getPredicate(Code-1).getFnName();
+ OS << "_" << ImmPredicates.getPredicate(Code - 1).getFnName();
}
}
};
-
SmallVector<OpKind, 3> Operands;
bool operator<(const OperandsSignature &O) const {
@@ -162,15 +172,17 @@ struct OperandsSignature {
void emitImmediatePredicate(raw_ostream &OS, ImmPredicateSet &ImmPredicates) {
bool EmittedAnything = false;
for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- if (!Operands[i].isImm()) continue;
+ if (!Operands[i].isImm())
+ continue;
unsigned Code = Operands[i].getImmCode();
- if (Code == 0) continue;
+ if (Code == 0)
+ continue;
if (EmittedAnything)
OS << " &&\n ";
- TreePredicateFn PredFn = ImmPredicates.getPredicate(Code-1);
+ TreePredicateFn PredFn = ImmPredicates.getPredicate(Code - 1);
// Emit the type check.
TreePattern *TP = PredFn.getOrigPatFragRecord();
@@ -179,7 +191,7 @@ struct OperandsSignature {
"Cannot use variable value types with fast isel");
OS << "VT == " << getEnumName(VVT.getSimple().SimpleTy) << " && ";
- OS << PredFn.getFnName() << "(imm" << i <<')';
+ OS << PredFn.getFnName() << "(imm" << i << ')';
EmittedAnything = true;
}
}
@@ -188,37 +200,36 @@ struct OperandsSignature {
/// of the Operands array accordingly. Return true if all the operands
/// are supported, false otherwise.
///
- bool initialize(TreePatternNode *InstPatNode, const CodeGenTarget &Target,
- MVT::SimpleValueType VT,
- ImmPredicateSet &ImmediatePredicates,
+ bool initialize(TreePatternNode &InstPatNode, const CodeGenTarget &Target,
+ MVT::SimpleValueType VT, ImmPredicateSet &ImmediatePredicates,
const CodeGenRegisterClass *OrigDstRC) {
- if (InstPatNode->isLeaf())
+ if (InstPatNode.isLeaf())
return false;
- if (InstPatNode->getOperator()->getName() == "imm") {
+ if (InstPatNode.getOperator()->getName() == "imm") {
Operands.push_back(OpKind::getImm(0));
return true;
}
- if (InstPatNode->getOperator()->getName() == "fpimm") {
+ if (InstPatNode.getOperator()->getName() == "fpimm") {
Operands.push_back(OpKind::getFP());
return true;
}
const CodeGenRegisterClass *DstRC = nullptr;
- for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) {
- TreePatternNode *Op = InstPatNode->getChild(i);
+ for (unsigned i = 0, e = InstPatNode.getNumChildren(); i != e; ++i) {
+ TreePatternNode &Op = InstPatNode.getChild(i);
// Handle imm operands specially.
- if (!Op->isLeaf() && Op->getOperator()->getName() == "imm") {
+ if (!Op.isLeaf() && Op.getOperator()->getName() == "imm") {
unsigned PredNo = 0;
- if (!Op->getPredicateCalls().empty()) {
- TreePredicateFn PredFn = Op->getPredicateCalls()[0].Fn;
+ if (!Op.getPredicateCalls().empty()) {
+ TreePredicateFn PredFn = Op.getPredicateCalls()[0].Fn;
// If there is more than one predicate weighing in on this operand
// then we don't handle it. This doesn't typically happen for
// immediates anyway.
- if (Op->getPredicateCalls().size() > 1 ||
+ if (Op.getPredicateCalls().size() > 1 ||
!PredFn.isImmediatePattern() || PredFn.usesOperands())
return false;
// Ignore any instruction with 'FastIselShouldIgnore', these are
@@ -229,21 +240,20 @@ struct OperandsSignature {
if (Rec->getValueAsBit("FastIselShouldIgnore"))
return false;
- PredNo = ImmediatePredicates.getIDFor(PredFn)+1;
+ PredNo = ImmediatePredicates.getIDFor(PredFn) + 1;
}
Operands.push_back(OpKind::getImm(PredNo));
continue;
}
-
// For now, filter out any operand with a predicate.
// For now, filter out any operand with multiple values.
- if (!Op->getPredicateCalls().empty() || Op->getNumTypes() != 1)
+ if (!Op.getPredicateCalls().empty() || Op.getNumTypes() != 1)
return false;
- if (!Op->isLeaf()) {
- if (Op->getOperator()->getName() == "fpimm") {
+ if (!Op.isLeaf()) {
+ if (Op.getOperator()->getName() == "fpimm") {
Operands.push_back(OpKind::getFP());
continue;
}
@@ -251,15 +261,15 @@ struct OperandsSignature {
return false;
}
- assert(Op->hasConcreteType(0) && "Type infererence not done?");
+ assert(Op.hasConcreteType(0) && "Type infererence not done?");
// For now, all the operands must have the same type (if they aren't
// immediates). Note that this causes us to reject variable sized shifts
// on X86.
- if (Op->getSimpleType(0) != VT)
+ if (Op.getSimpleType(0) != VT)
return false;
- DefInit *OpDI = dyn_cast<DefInit>(Op->getLeafValue());
+ DefInit *OpDI = dyn_cast<DefInit>(Op.getLeafValue());
if (!OpDI)
return false;
Record *OpLeafRec = OpDI->getDef();
@@ -347,7 +357,6 @@ struct OperandsSignature {
}
}
-
void PrintManglingSuffix(raw_ostream &OS, const std::vector<std::string> &PR,
ImmPredicateSet &ImmPredicates,
bool StripImmCodes = false) const {
@@ -380,7 +389,7 @@ class FastISelMap {
typedef std::map<MVT::SimpleValueType, RetPredMap> TypeRetPredMap;
typedef std::map<std::string, TypeRetPredMap> OpcodeTypeRetPredMap;
typedef std::map<OperandsSignature, OpcodeTypeRetPredMap>
- OperandsOpcodeTypeRetPredMap;
+ OperandsOpcodeTypeRetPredMap;
OperandsOpcodeTypeRetPredMap SimplePatterns;
@@ -389,22 +398,22 @@ class FastISelMap {
MVT::SimpleValueType, std::string>>
SimplePatternsCheck;
- std::map<OperandsSignature, std::vector<OperandsSignature> >
- SignaturesWithConstantForms;
+ std::map<OperandsSignature, std::vector<OperandsSignature>>
+ SignaturesWithConstantForms;
StringRef InstNS;
ImmPredicateSet ImmediatePredicates;
+
public:
explicit FastISelMap(StringRef InstNS);
void collectPatterns(CodeGenDAGPatterns &CGP);
void printImmediatePredicates(raw_ostream &OS);
void printFunctionDefinitions(raw_ostream &OS);
+
private:
- void emitInstructionCode(raw_ostream &OS,
- const OperandsSignature &Operands,
- const PredMap &PM,
- const std::string &RetVTName);
+ void emitInstructionCode(raw_ostream &OS, const OperandsSignature &Operands,
+ const PredMap &PM, const std::string &RetVTName);
};
} // End anonymous namespace
@@ -421,19 +430,19 @@ static std::string getLegalCName(std::string OpName) {
FastISelMap::FastISelMap(StringRef instns) : InstNS(instns) {}
-static std::string PhyRegForNode(TreePatternNode *Op,
+static std::string PhyRegForNode(TreePatternNode &Op,
const CodeGenTarget &Target) {
std::string PhysReg;
- if (!Op->isLeaf())
+ if (!Op.isLeaf())
return PhysReg;
- Record *OpLeafRec = cast<DefInit>(Op->getLeafValue())->getDef();
+ Record *OpLeafRec = cast<DefInit>(Op.getLeafValue())->getDef();
if (!OpLeafRec->isSubClassOf("Register"))
return PhysReg;
PhysReg += cast<StringInit>(OpLeafRec->getValue("Namespace")->getValue())
- ->getValue();
+ ->getValue();
PhysReg += "::";
PhysReg += Target.getRegBank().getReg(OpLeafRec)->getName();
return PhysReg;
@@ -443,15 +452,16 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
const CodeGenTarget &Target = CGP.getTargetInfo();
// Scan through all the patterns and record the simple ones.
- for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(),
- E = CGP.ptm_end(); I != E; ++I) {
+ for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(), E = CGP.ptm_end();
+ I != E; ++I) {
const PatternToMatch &Pattern = *I;
// For now, just look at Instructions, so that we don't have to worry
// about emitting multiple instructions for a pattern.
- TreePatternNode *Dst = Pattern.getDstPattern();
- if (Dst->isLeaf()) continue;
- Record *Op = Dst->getOperator();
+ TreePatternNode &Dst = Pattern.getDstPattern();
+ if (Dst.isLeaf())
+ continue;
+ Record *Op = Dst.getOperator();
if (!Op->isSubClassOf("Instruction"))
continue;
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -467,11 +477,11 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
// For now, ignore multi-instruction patterns.
bool MultiInsts = false;
- for (unsigned i = 0, e = Dst->getNumChildren(); i != e; ++i) {
- TreePatternNode *ChildOp = Dst->getChild(i);
- if (ChildOp->isLeaf())
+ for (unsigned i = 0, e = Dst.getNumChildren(); i != e; ++i) {
+ TreePatternNode &ChildOp = Dst.getChild(i);
+ if (ChildOp.isLeaf())
continue;
- if (ChildOp->getOperator()->isSubClassOf("Instruction")) {
+ if (ChildOp.getOperator()->isSubClassOf("Instruction")) {
MultiInsts = true;
break;
}
@@ -495,35 +505,38 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
} else {
// If this isn't a leaf, then continue since the register classes are
// a bit too complicated for now.
- if (!Dst->getChild(1)->isLeaf()) continue;
+ if (!Dst.getChild(1).isLeaf())
+ continue;
- DefInit *SR = dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue());
+ DefInit *SR = dyn_cast<DefInit>(Dst.getChild(1).getLeafValue());
if (SR)
SubRegNo = getQualifiedName(SR->getDef());
else
- SubRegNo = Dst->getChild(1)->getLeafValue()->getAsString();
+ SubRegNo = Dst.getChild(1).getLeafValue()->getAsString();
}
// Inspect the pattern.
- TreePatternNode *InstPatNode = Pattern.getSrcPattern();
- if (!InstPatNode) continue;
- if (InstPatNode->isLeaf()) continue;
+ TreePatternNode &InstPatNode = Pattern.getSrcPattern();
+ if (InstPatNode.isLeaf())
+ continue;
// Ignore multiple result nodes for now.
- if (InstPatNode->getNumTypes() > 1) continue;
+ if (InstPatNode.getNumTypes() > 1)
+ continue;
- Record *InstPatOp = InstPatNode->getOperator();
+ Record *InstPatOp = InstPatNode.getOperator();
std::string OpcodeName = getOpcodeName(InstPatOp, CGP);
MVT::SimpleValueType RetVT = MVT::isVoid;
- if (InstPatNode->getNumTypes()) RetVT = InstPatNode->getSimpleType(0);
+ if (InstPatNode.getNumTypes())
+ RetVT = InstPatNode.getSimpleType(0);
MVT::SimpleValueType VT = RetVT;
- if (InstPatNode->getNumChildren()) {
- assert(InstPatNode->getChild(0)->getNumTypes() == 1);
- VT = InstPatNode->getChild(0)->getSimpleType(0);
+ if (InstPatNode.getNumChildren()) {
+ assert(InstPatNode.getChild(0).getNumTypes() == 1);
+ VT = InstPatNode.getChild(0).getSimpleType(0);
}
// For now, filter out any instructions with predicates.
- if (!InstPatNode->getPredicateCalls().empty())
+ if (!InstPatNode.getPredicateCalls().empty())
continue;
// Check all the operands.
@@ -533,20 +546,20 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
continue;
std::vector<std::string> PhysRegInputs;
- if (InstPatNode->getOperator()->getName() == "imm" ||
- InstPatNode->getOperator()->getName() == "fpimm")
+ if (InstPatNode.getOperator()->getName() == "imm" ||
+ InstPatNode.getOperator()->getName() == "fpimm")
PhysRegInputs.push_back("");
else {
// Compute the PhysRegs used by the given pattern, and check that
// the mapping from the src to dst patterns is simple.
bool FoundNonSimplePattern = false;
unsigned DstIndex = 0;
- for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) {
- std::string PhysReg = PhyRegForNode(InstPatNode->getChild(i), Target);
+ for (unsigned i = 0, e = InstPatNode.getNumChildren(); i != e; ++i) {
+ std::string PhysReg = PhyRegForNode(InstPatNode.getChild(i), Target);
if (PhysReg.empty()) {
- if (DstIndex >= Dst->getNumChildren() ||
- Dst->getChild(DstIndex)->getName() !=
- InstPatNode->getChild(i)->getName()) {
+ if (DstIndex >= Dst.getNumChildren() ||
+ Dst.getChild(DstIndex).getName() !=
+ InstPatNode.getChild(i).getName()) {
FoundNonSimplePattern = true;
break;
}
@@ -556,7 +569,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
PhysRegInputs.push_back(PhysReg);
}
- if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst->getNumChildren())
+ if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst.getNumChildren())
FoundNonSimplePattern = true;
if (FoundNonSimplePattern)
@@ -568,21 +581,16 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
raw_string_ostream SuffixOS(ManglingSuffix);
Operands.PrintManglingSuffix(SuffixOS, ImmediatePredicates, true);
if (!StringSwitch<bool>(ManglingSuffix)
- .Cases("", "r", "rr", "ri", "i", "f", true)
- .Default(false))
+ .Cases("", "r", "rr", "ri", "i", "f", true)
+ .Default(false))
continue;
// Get the predicate that guards this pattern.
std::string PredicateCheck = Pattern.getPredicateCheck();
// Ok, we found a pattern that we can handle. Remember it.
- InstructionMemo Memo(
- Pattern.getDstPattern()->getOperator()->getName(),
- DstRC,
- SubRegNo,
- PhysRegInputs,
- PredicateCheck
- );
+ InstructionMemo Memo(Pattern.getDstPattern().getOperator()->getName(),
+ DstRC, SubRegNo, PhysRegInputs, PredicateCheck);
int complexity = Pattern.getPatternComplexity(CGP);
@@ -590,7 +598,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
std::make_tuple(Operands, OpcodeName, VT, RetVT, PredicateCheck));
if (!inserted_simple_pattern.second) {
PrintFatalError(Pattern.getSrcRecord()->getLoc(),
- "Duplicate predicate in FastISel table!");
+ "Duplicate predicate in FastISel table!");
}
// Note: Instructions with the same complexity will appear in the order
@@ -602,8 +610,8 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
// them down to a signature that doesn't have predicates so that we can
// associate them with the stripped predicate version.
if (Operands.hasAnyImmediateCodes()) {
- SignaturesWithConstantForms[Operands.getWithoutImmCodes()]
- .push_back(Operands);
+ SignaturesWithConstantForms[Operands.getWithoutImmCodes()].push_back(
+ Operands);
}
}
}
@@ -645,7 +653,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS,
if (OneHadNoPredicate) {
PrintFatalError("Multiple instructions match and one with no "
"predicate came before one with a predicate! "
- "name:" + Memo.Name + " predicate: " + PredicateCheck);
+ "name:" +
+ Memo.Name + " predicate: " + PredicateCheck);
}
OS << " if (" + PredicateCheck + ") {\n";
OS << " ";
@@ -669,8 +678,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS,
Operands.PrintArguments(OS, Memo.PhysRegs);
OS << ");\n";
} else {
- OS << "extractsubreg(" << RetVTName
- << ", Op0, " << Memo.SubRegNo << ");\n";
+ OS << "extractsubreg(" << RetVTName << ", Op0, " << Memo.SubRegNo
+ << ");\n";
}
if (!PredicateCheck.empty()) {
@@ -685,7 +694,6 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS,
OS << "\n";
}
-
void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
// Now emit code for all the patterns that we collected.
for (const auto &SimplePattern : SimplePatterns) {
@@ -762,8 +770,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
}
// Emit one function for the opcode that demultiplexes based on the type.
- OS << "unsigned fastEmit_"
- << getLegalCName(Opcode) << "_";
+ OS << "unsigned fastEmit_" << getLegalCName(Opcode) << "_";
Operands.PrintManglingSuffix(OS, ImmediatePredicates);
OS << "(MVT VT, MVT RetVT";
if (!Operands.empty())
@@ -809,8 +816,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
// constrained forms of the immediate (e.g., 32-bit sext immediate in a
// 64-bit operand), check them first.
- std::map<OperandsSignature, std::vector<OperandsSignature> >::iterator MI
- = SignaturesWithConstantForms.find(Operands);
+ std::map<OperandsSignature, std::vector<OperandsSignature>>::iterator MI =
+ SignaturesWithConstantForms.find(Operands);
if (MI != SignaturesWithConstantForms.end()) {
// Unique any duplicates out of the list.
llvm::sort(MI->second);
@@ -840,8 +847,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
for (const auto &I : OTM) {
const std::string &Opcode = I.first;
- OS << " case " << Opcode << ": return fastEmit_"
- << getLegalCName(Opcode) << "_";
+ OS << " case " << Opcode << ": return fastEmit_" << getLegalCName(Opcode)
+ << "_";
Operands.PrintManglingSuffix(OS, ImmediatePredicates);
OS << "(VT, RetVT";
if (!Operands.empty())
@@ -862,7 +869,8 @@ static void EmitFastISel(RecordKeeper &RK, raw_ostream &OS) {
CodeGenDAGPatterns CGP(RK);
const CodeGenTarget &Target = CGP.getTargetInfo();
emitSourceFileHeader("\"Fast\" Instruction Selector for the " +
- Target.getName().str() + " target", OS);
+ Target.getName().str() + " target",
+ OS);
// Determine the target's namespace name.
StringRef InstNS = Target.getInstNamespace();
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 22e7785275bd..13f2384efc9c 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -90,10 +90,10 @@ static cl::opt<bool> OptimizeMatchTable(
namespace {
-static std::string explainPredicates(const TreePatternNode *N) {
+static std::string explainPredicates(const TreePatternNode &N) {
std::string Explanation;
StringRef Separator = "";
- for (const TreePredicateCall &Call : N->getPredicateCalls()) {
+ for (const TreePredicateCall &Call : N.getPredicateCalls()) {
const TreePredicateFn &P = Call.Fn;
Explanation +=
(Separator + P.getOrigPatFragRecord()->getRecord()->getName()).str();
@@ -194,12 +194,12 @@ static Error failedImport(const Twine &Reason) {
return make_error<StringError>(Reason, inconvertibleErrorCode());
}
-static Error isTrivialOperatorNode(const TreePatternNode *N) {
+static Error isTrivialOperatorNode(const TreePatternNode &N) {
std::string Explanation;
std::string Separator;
bool HasUnsupportedPredicate = false;
- for (const TreePredicateCall &Call : N->getPredicateCalls()) {
+ for (const TreePredicateCall &Call : N.getPredicateCalls()) {
const TreePredicateFn &Predicate = Call.Fn;
if (Predicate.isAlwaysTrue())
@@ -288,8 +288,8 @@ static std::string getMangledRootDefName(StringRef DefOperandName) {
//===- GlobalISelEmitter class --------------------------------------------===//
-static Expected<LLTCodeGen> getInstResultType(const TreePatternNode *Dst) {
- ArrayRef<TypeSetByHwMode> ChildTypes = Dst->getExtTypes();
+static Expected<LLTCodeGen> getInstResultType(const TreePatternNode &Dst) {
+ ArrayRef<TypeSetByHwMode> ChildTypes = Dst.getExtTypes();
if (ChildTypes.size() != 1)
return failedImport("Dst pattern child has multiple results");
@@ -372,40 +372,40 @@ private:
Record *findNodeEquiv(Record *N) const;
const CodeGenInstruction *getEquivNode(Record &Equiv,
- const TreePatternNode *N) const;
+ const TreePatternNode &N) const;
Error importRulePredicates(RuleMatcher &M, ArrayRef<Record *> Predicates);
Expected<InstructionMatcher &>
createAndImportSelDAGMatcher(RuleMatcher &Rule,
InstructionMatcher &InsnMatcher,
- const TreePatternNode *Src, unsigned &TempOpIdx);
+ const TreePatternNode &Src, unsigned &TempOpIdx);
Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R,
unsigned &TempOpIdx) const;
Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
- const TreePatternNode *SrcChild,
+ const TreePatternNode &SrcChild,
bool OperandIsAPointer, bool OperandIsImmArg,
unsigned OpIdx, unsigned &TempOpIdx);
Expected<BuildMIAction &> createAndImportInstructionRenderer(
RuleMatcher &M, InstructionMatcher &InsnMatcher,
- const TreePatternNode *Src, const TreePatternNode *Dst);
+ const TreePatternNode &Src, const TreePatternNode &Dst);
Expected<action_iterator> createAndImportSubInstructionRenderer(
- action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst,
- const TreePatternNode *Src, unsigned TempReg);
+ action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst,
+ const TreePatternNode &Src, unsigned TempReg);
Expected<action_iterator>
createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M,
- const TreePatternNode *Dst);
+ const TreePatternNode &Dst);
Expected<action_iterator> importExplicitDefRenderers(
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
- const TreePatternNode *Src, const TreePatternNode *Dst);
+ const TreePatternNode &Src, const TreePatternNode &Dst);
Expected<action_iterator> importExplicitUseRenderers(
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
- const llvm::TreePatternNode *Dst, const TreePatternNode *Src);
+ const llvm::TreePatternNode &Dst, const TreePatternNode &Src);
Expected<action_iterator> importExplicitUseRenderer(
action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder,
- const TreePatternNode *DstChild, const TreePatternNode *Src);
+ const TreePatternNode &DstChild, const TreePatternNode &Src);
Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M,
BuildMIAction &DstMIBuilder,
const DAGDefaultOperand &DefaultOp) const;
@@ -430,25 +430,25 @@ private:
/// If no register class is found, return std::nullopt.
std::optional<const CodeGenRegisterClass *>
inferSuperRegisterClassForNode(const TypeSetByHwMode &Ty,
- const TreePatternNode *SuperRegNode,
- const TreePatternNode *SubRegIdxNode);
+ const TreePatternNode &SuperRegNode,
+ const TreePatternNode &SubRegIdxNode);
std::optional<CodeGenSubRegIndex *>
- inferSubRegIndexForNode(const TreePatternNode *SubRegIdxNode);
+ inferSubRegIndexForNode(const TreePatternNode &SubRegIdxNode);
/// Infer a CodeGenRegisterClass which suppoorts \p Ty and \p SubRegIdxNode.
/// Return std::nullopt if no such class exists.
std::optional<const CodeGenRegisterClass *>
inferSuperRegisterClass(const TypeSetByHwMode &Ty,
- const TreePatternNode *SubRegIdxNode);
+ const TreePatternNode &SubRegIdxNode);
/// Return the CodeGenRegisterClass associated with \p Leaf if it has one.
std::optional<const CodeGenRegisterClass *>
- getRegClassFromLeaf(const TreePatternNode *Leaf);
+ getRegClassFromLeaf(const TreePatternNode &Leaf);
/// Return a CodeGenRegisterClass for \p N if one can be found. Return
/// std::nullopt otherwise.
std::optional<const CodeGenRegisterClass *>
- inferRegClassFromPattern(const TreePatternNode *N);
+ inferRegClassFromPattern(const TreePatternNode &N);
/// Return the size of the MemoryVT in this predicate, if possible.
std::optional<unsigned>
@@ -498,19 +498,19 @@ Record *GlobalISelEmitter::findNodeEquiv(Record *N) const {
}
const CodeGenInstruction *
-GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const {
- if (N->getNumChildren() >= 1) {
+GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode &N) const {
+ if (N.getNumChildren() >= 1) {
// setcc operation maps to two different G_* instructions based on the type.
if (!Equiv.isValueUnset("IfFloatingPoint") &&
- MVT(N->getChild(0)->getSimpleType(0)).isFloatingPoint())
+ MVT(N.getChild(0).getSimpleType(0)).isFloatingPoint())
return &Target.getInstruction(Equiv.getValueAsDef("IfFloatingPoint"));
}
if (!Equiv.isValueUnset("IfConvergent") &&
- N->getIntrinsicInfo(CGP)->isConvergent)
+ N.getIntrinsicInfo(CGP)->isConvergent)
return &Target.getInstruction(Equiv.getValueAsDef("IfConvergent"));
- for (const TreePredicateCall &Call : N->getPredicateCalls()) {
+ for (const TreePredicateCall &Call : N.getPredicateCalls()) {
const TreePredicateFn &Predicate = Call.Fn;
if (!Equiv.isValueUnset("IfSignExtend") &&
(Predicate.isLoad() || Predicate.isAtomic()) &&
@@ -707,15 +707,15 @@ Expected<InstructionMatcher &> GlobalISelEmitter::addBuiltinPredicates(
Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
- const TreePatternNode *Src, unsigned &TempOpIdx) {
- const auto SavedFlags = Rule.setGISelFlags(Src->getGISelFlagsRecord());
+ const TreePatternNode &Src, unsigned &TempOpIdx) {
+ const auto SavedFlags = Rule.setGISelFlags(Src.getGISelFlagsRecord());
Record *SrcGIEquivOrNull = nullptr;
const CodeGenInstruction *SrcGIOrNull = nullptr;
// Start with the defined operands (i.e., the results of the root operator).
- if (Src->isLeaf()) {
- Init *SrcInit = Src->getLeafValue();
+ if (Src.isLeaf()) {
+ Init *SrcInit = Src.getLeafValue();
if (isa<IntInit>(SrcInit)) {
InsnMatcher.addPredicate<InstructionOpcodeMatcher>(
&Target.getInstruction(RK.getDef("G_CONSTANT")));
@@ -723,10 +723,10 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
return failedImport(
"Unable to deduce gMIR opcode to handle Src (which is a leaf)");
} else {
- SrcGIEquivOrNull = findNodeEquiv(Src->getOperator());
+ SrcGIEquivOrNull = findNodeEquiv(Src.getOperator());
if (!SrcGIEquivOrNull)
return failedImport("Pattern operator lacks an equivalent Instruction" +
- explainOperator(Src->getOperator()));
+ explainOperator(Src.getOperator()));
SrcGIOrNull = getEquivNode(*SrcGIEquivOrNull, Src);
// The operators look good: match the opcode
@@ -734,7 +734,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
}
unsigned OpIdx = 0;
- for (const TypeSetByHwMode &VTy : Src->getExtTypes()) {
+ for (const TypeSetByHwMode &VTy : Src.getExtTypes()) {
// Results don't have a name unless they are the root node. The caller will
// set the name if appropriate.
const bool OperandIsAPointer =
@@ -745,7 +745,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
" for result of Src pattern operator");
}
- for (const TreePredicateCall &Call : Src->getPredicateCalls()) {
+ for (const TreePredicateCall &Call : Src.getPredicateCalls()) {
const TreePredicateFn &Predicate = Call.Fn;
bool HasAddedBuiltinMatcher = true;
if (Predicate.isAlwaysTrue())
@@ -800,11 +800,11 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
"Unordered", AtomicOrderingMMOPredicateMatcher::AO_OrStronger);
}
- if (Src->isLeaf()) {
- Init *SrcInit = Src->getLeafValue();
+ if (Src.isLeaf()) {
+ Init *SrcInit = Src.getLeafValue();
if (IntInit *SrcIntInit = dyn_cast<IntInit>(SrcInit)) {
OperandMatcher &OM =
- InsnMatcher.addOperand(OpIdx++, Src->getName(), TempOpIdx);
+ InsnMatcher.addOperand(OpIdx++, Src.getName(), TempOpIdx);
OM.addPredicate<LiteralIntOperandMatcher>(SrcIntInit->getValue());
} else
return failedImport(
@@ -825,19 +825,19 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
// predicate operand needs to be swapped from the last operand to the first
// source.
- unsigned NumChildren = Src->getNumChildren();
+ unsigned NumChildren = Src.getNumChildren();
bool IsFCmp = SrcGIOrNull->TheDef->getName() == "G_FCMP";
if (IsFCmp || SrcGIOrNull->TheDef->getName() == "G_ICMP") {
- const TreePatternNode *SrcChild = Src->getChild(NumChildren - 1);
- if (SrcChild->isLeaf()) {
- DefInit *DI = dyn_cast<DefInit>(SrcChild->getLeafValue());
+ const TreePatternNode &SrcChild = Src.getChild(NumChildren - 1);
+ if (SrcChild.isLeaf()) {
+ DefInit *DI = dyn_cast<DefInit>(SrcChild.getLeafValue());
Record *CCDef = DI ? DI->getDef() : nullptr;
if (!CCDef || !CCDef->isSubClassOf("CondCode"))
return failedImport("Unable to handle CondCode");
OperandMatcher &OM =
- InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx);
+ InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx);
StringRef PredType = IsFCmp ? CCDef->getValueAsString("FCmpPredicate")
: CCDef->getValueAsString("ICmpPredicate");
@@ -856,12 +856,12 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_CONVERGENT" ||
SrcGIOrNull->TheDef->getName() ==
"G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS";
- const CodeGenIntrinsic *II = Src->getIntrinsicInfo(CGP);
+ const CodeGenIntrinsic *II = Src.getIntrinsicInfo(CGP);
if (IsIntrinsic && !II)
return failedImport("Expected IntInit containing intrinsic ID)");
for (unsigned i = 0; i != NumChildren; ++i) {
- const TreePatternNode *SrcChild = Src->getChild(i);
+ const TreePatternNode &SrcChild = Src.getChild(i);
// We need to determine the meaning of a literal integer based on the
// context. If this is a field required to be an immediate (such as an
@@ -884,7 +884,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
// following the defs is an intrinsic ID.
if (i == 0) {
OperandMatcher &OM =
- InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx);
+ InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx);
OM.addPredicate<IntrinsicIDOperandMatcher>(II);
continue;
}
@@ -921,11 +921,11 @@ Error GlobalISelEmitter::importComplexPatternOperandMatcher(
// Get the name to use for a pattern operand. For an anonymous physical register
// input, this should use the register name.
-static StringRef getSrcChildName(const TreePatternNode *SrcChild,
+static StringRef getSrcChildName(const TreePatternNode &SrcChild,
Record *&PhysReg) {
- StringRef SrcChildName = SrcChild->getName();
- if (SrcChildName.empty() && SrcChild->isLeaf()) {
- if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild->getLeafValue())) {
+ StringRef SrcChildName = SrcChild.getName();
+ if (SrcChildName.empty() && SrcChild.isLeaf()) {
+ if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild.getLeafValue())) {
auto *ChildRec = ChildDefInit->getDef();
if (ChildRec->isSubClassOf("Register")) {
SrcChildName = ChildRec->getName();
@@ -939,19 +939,19 @@ static StringRef getSrcChildName(const TreePatternNode *SrcChild,
Error GlobalISelEmitter::importChildMatcher(
RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
- const TreePatternNode *SrcChild, bool OperandIsAPointer,
+ const TreePatternNode &SrcChild, bool OperandIsAPointer,
bool OperandIsImmArg, unsigned OpIdx, unsigned &TempOpIdx) {
Record *PhysReg = nullptr;
std::string SrcChildName = std::string(getSrcChildName(SrcChild, PhysReg));
- if (!SrcChild->isLeaf() &&
- SrcChild->getOperator()->isSubClassOf("ComplexPattern")) {
+ if (!SrcChild.isLeaf() &&
+ SrcChild.getOperator()->isSubClassOf("ComplexPattern")) {
// The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is
// "MY_PAT:op1:op2" and the ones with same "name" represent same operand.
- std::string PatternName = std::string(SrcChild->getOperator()->getName());
- for (unsigned i = 0; i < SrcChild->getNumChildren(); ++i) {
+ std::string PatternName = std::string(SrcChild.getOperator()->getName());
+ for (unsigned i = 0; i < SrcChild.getNumChildren(); ++i) {
PatternName += ":";
- PatternName += SrcChild->getChild(i)->getName();
+ PatternName += SrcChild.getChild(i).getName();
}
SrcChildName = PatternName;
}
@@ -962,23 +962,23 @@ Error GlobalISelEmitter::importChildMatcher(
if (OM.isSameAsAnotherOperand())
return Error::success();
- ArrayRef<TypeSetByHwMode> ChildTypes = SrcChild->getExtTypes();
+ ArrayRef<TypeSetByHwMode> ChildTypes = SrcChild.getExtTypes();
if (ChildTypes.size() != 1)
return failedImport("Src pattern child has multiple results");
// Check MBB's before the type check since they are not a known type.
- if (!SrcChild->isLeaf()) {
- if (SrcChild->getOperator()->isSubClassOf("SDNode")) {
- auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild->getOperator());
+ if (!SrcChild.isLeaf()) {
+ if (SrcChild.getOperator()->isSubClassOf("SDNode")) {
+ auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild.getOperator());
if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
OM.addPredicate<MBBOperandMatcher>();
return Error::success();
}
- if (SrcChild->getOperator()->getName() == "timm") {
+ if (SrcChild.getOperator()->getName() == "timm") {
OM.addPredicate<ImmOperandMatcher>();
// Add predicates, if any
- for (const TreePredicateCall &Call : SrcChild->getPredicateCalls()) {
+ for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) {
const TreePredicateFn &Predicate = Call.Fn;
// Only handle immediate patterns for now
@@ -998,12 +998,12 @@ Error GlobalISelEmitter::importChildMatcher(
if (auto Error =
OM.addTypeCheckPredicate(ChildTypes.front(), OperandIsAPointer))
return failedImport(toString(std::move(Error)) + " for Src operand (" +
- to_string(*SrcChild) + ")");
+ to_string(SrcChild) + ")");
}
// Try look up SrcChild for a (named) predicate operand if there is any.
if (WaitingForNamedOperands) {
- auto &ScopedNames = SrcChild->getNamesAsPredicateArg();
+ auto &ScopedNames = SrcChild.getNamesAsPredicateArg();
if (!ScopedNames.empty()) {
auto PA = ScopedNames.begin();
std::string Name = getScopedName(PA->getScope(), PA->getIdentifier());
@@ -1013,22 +1013,22 @@ Error GlobalISelEmitter::importChildMatcher(
}
// Check for nested instructions.
- if (!SrcChild->isLeaf()) {
- if (SrcChild->getOperator()->isSubClassOf("ComplexPattern")) {
+ if (!SrcChild.isLeaf()) {
+ if (SrcChild.getOperator()->isSubClassOf("ComplexPattern")) {
// When a ComplexPattern is used as an operator, it should do the same
// thing as when used as a leaf. However, the children of the operator
// name the sub-operands that make up the complex operand and we must
// prepare to reference them in the renderer too.
unsigned RendererID = TempOpIdx;
if (auto Error = importComplexPatternOperandMatcher(
- OM, SrcChild->getOperator(), TempOpIdx))
+ OM, SrcChild.getOperator(), TempOpIdx))
return Error;
- for (unsigned i = 0, e = SrcChild->getNumChildren(); i != e; ++i) {
- auto *SubOperand = SrcChild->getChild(i);
- if (!SubOperand->getName().empty()) {
+ for (unsigned i = 0, e = SrcChild.getNumChildren(); i != e; ++i) {
+ auto &SubOperand = SrcChild.getChild(i);
+ if (!SubOperand.getName().empty()) {
if (auto Error = Rule.defineComplexSubOperand(
- SubOperand->getName(), SrcChild->getOperator(), RendererID, i,
+ SubOperand.getName(), SrcChild.getOperator(), RendererID, i,
SrcChildName))
return Error;
}
@@ -1038,7 +1038,7 @@ Error GlobalISelEmitter::importChildMatcher(
}
auto MaybeInsnOperand = OM.addPredicate<InstructionOperandMatcher>(
- InsnMatcher.getRuleMatcher(), SrcChild->getName());
+ InsnMatcher.getRuleMatcher(), SrcChild.getName());
if (!MaybeInsnOperand) {
// This isn't strictly true. If the user were to provide exactly the same
// matchers as the original operand then we could allow it. However, it's
@@ -1057,11 +1057,11 @@ Error GlobalISelEmitter::importChildMatcher(
return Error::success();
}
- if (SrcChild->hasAnyPredicate())
+ if (SrcChild.hasAnyPredicate())
return failedImport("Src pattern child has unsupported predicate");
// Check for constant immediates.
- if (auto *ChildInt = dyn_cast<IntInit>(SrcChild->getLeafValue())) {
+ if (auto *ChildInt = dyn_cast<IntInit>(SrcChild.getLeafValue())) {
if (OperandIsImmArg) {
// Checks for argument directly in operand list
OM.addPredicate<LiteralIntOperandMatcher>(ChildInt->getValue());
@@ -1073,7 +1073,7 @@ Error GlobalISelEmitter::importChildMatcher(
}
// Check for def's like register classes or ComplexPattern's.
- if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild->getLeafValue())) {
+ if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild.getLeafValue())) {
auto *ChildRec = ChildDefInit->getDef();
// Check for register classes.
@@ -1121,7 +1121,7 @@ Error GlobalISelEmitter::importChildMatcher(
const bool ImmAllOnesV = ChildRec->getName() == "immAllOnesV";
if (ImmAllOnesV || ChildRec->getName() == "immAllZerosV") {
auto MaybeInsnOperand = OM.addPredicate<InstructionOperandMatcher>(
- InsnMatcher.getRuleMatcher(), SrcChild->getName(), false);
+ InsnMatcher.getRuleMatcher(), SrcChild.getName(), false);
InstructionOperandMatcher &InsnOperand = **MaybeInsnOperand;
ValueTypeByHwMode VTy = ChildTypes.front().getValueTypeByHwMode();
@@ -1161,45 +1161,44 @@ Error GlobalISelEmitter::importChildMatcher(
Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder,
- const TreePatternNode *DstChild, const TreePatternNode *Src) {
+ const TreePatternNode &DstChild, const TreePatternNode &Src) {
- const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName());
+ const auto &SubOperand = Rule.getComplexSubOperand(DstChild.getName());
if (SubOperand) {
DstMIBuilder.addRenderer<RenderComplexPatternOperand>(
- *std::get<0>(*SubOperand), DstChild->getName(),
- std::get<1>(*SubOperand), std::get<2>(*SubOperand));
+ *std::get<0>(*SubOperand), DstChild.getName(), std::get<1>(*SubOperand),
+ std::get<2>(*SubOperand));
return InsertPt;
}
- if (!DstChild->isLeaf()) {
- if (DstChild->getOperator()->isSubClassOf("SDNodeXForm")) {
- auto Child = DstChild->getChild(0);
- auto I = SDNodeXFormEquivs.find(DstChild->getOperator());
+ if (!DstChild.isLeaf()) {
+ if (DstChild.getOperator()->isSubClassOf("SDNodeXForm")) {
+ auto &Child = DstChild.getChild(0);
+ auto I = SDNodeXFormEquivs.find(DstChild.getOperator());
if (I != SDNodeXFormEquivs.end()) {
- Record *XFormOpc = DstChild->getOperator()->getValueAsDef("Opcode");
+ Record *XFormOpc = DstChild.getOperator()->getValueAsDef("Opcode");
if (XFormOpc->getName() == "timm") {
// If this is a TargetConstant, there won't be a corresponding
// instruction to transform. Instead, this will refer directly to an
// operand in an instruction's operand list.
DstMIBuilder.addRenderer<CustomOperandRenderer>(*I->second,
- Child->getName());
+ Child.getName());
} else {
- DstMIBuilder.addRenderer<CustomRenderer>(*I->second,
- Child->getName());
+ DstMIBuilder.addRenderer<CustomRenderer>(*I->second, Child.getName());
}
return InsertPt;
}
- return failedImport("SDNodeXForm " + Child->getName() +
+ return failedImport("SDNodeXForm " + Child.getName() +
" has no custom renderer");
}
// We accept 'bb' here. It's an operator because BasicBlockSDNode isn't
// inline, but in MI it's just another operand.
- if (DstChild->getOperator()->isSubClassOf("SDNode")) {
- auto &ChildSDNI = CGP.getSDNodeInfo(DstChild->getOperator());
+ if (DstChild.getOperator()->isSubClassOf("SDNode")) {
+ auto &ChildSDNI = CGP.getSDNodeInfo(DstChild.getOperator());
if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
- DstMIBuilder.addRenderer<CopyRenderer>(DstChild->getName());
+ DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
return InsertPt;
}
}
@@ -1208,19 +1207,19 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
// rendered as operands.
// FIXME: The target should be able to choose sign-extended when appropriate
// (e.g. on Mips).
- if (DstChild->getOperator()->getName() == "timm") {
- DstMIBuilder.addRenderer<CopyRenderer>(DstChild->getName());
+ if (DstChild.getOperator()->getName() == "timm") {
+ DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
return InsertPt;
- } else if (DstChild->getOperator()->getName() == "imm") {
- DstMIBuilder.addRenderer<CopyConstantAsImmRenderer>(DstChild->getName());
+ } else if (DstChild.getOperator()->getName() == "imm") {
+ DstMIBuilder.addRenderer<CopyConstantAsImmRenderer>(DstChild.getName());
return InsertPt;
- } else if (DstChild->getOperator()->getName() == "fpimm") {
+ } else if (DstChild.getOperator()->getName() == "fpimm") {
DstMIBuilder.addRenderer<CopyFConstantAsFPImmRenderer>(
- DstChild->getName());
+ DstChild.getName());
return InsertPt;
}
- if (DstChild->getOperator()->isSubClassOf("Instruction")) {
+ if (DstChild.getOperator()->isSubClassOf("Instruction")) {
auto OpTy = getInstResultType(DstChild);
if (!OpTy)
return OpTy.takeError();
@@ -1238,22 +1237,22 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
}
return failedImport("Dst pattern child isn't a leaf node or an MBB" +
- llvm::to_string(*DstChild));
+ llvm::to_string(DstChild));
}
// It could be a specific immediate in which case we should just check for
// that immediate.
if (const IntInit *ChildIntInit =
- dyn_cast<IntInit>(DstChild->getLeafValue())) {
+ dyn_cast<IntInit>(DstChild.getLeafValue())) {
DstMIBuilder.addRenderer<ImmRenderer>(ChildIntInit->getValue());
return InsertPt;
}
// Otherwise, we're looking for a bog-standard RegisterClass operand.
- if (auto *ChildDefInit = dyn_cast<DefInit>(DstChild->getLeafValue())) {
+ if (auto *ChildDefInit = dyn_cast<DefInit>(DstChild.getLeafValue())) {
auto *ChildRec = ChildDefInit->getDef();
- ArrayRef<TypeSetByHwMode> ChildTypes = DstChild->getExtTypes();
+ ArrayRef<TypeSetByHwMode> ChildTypes = DstChild.getExtTypes();
if (ChildTypes.size() != 1)
return failedImport("Dst pattern child has multiple results");
@@ -1274,11 +1273,11 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
if (ChildRec->isSubClassOf("RegisterOperand") &&
!ChildRec->isValueUnset("GIZeroRegister")) {
DstMIBuilder.addRenderer<CopyOrAddZeroRegRenderer>(
- DstChild->getName(), ChildRec->getValueAsDef("GIZeroRegister"));
+ DstChild.getName(), ChildRec->getValueAsDef("GIZeroRegister"));
return InsertPt;
}
- DstMIBuilder.addRenderer<CopyRenderer>(DstChild->getName());
+ DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
return InsertPt;
}
@@ -1294,9 +1293,9 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
return failedImport(
"SelectionDAG ComplexPattern not mapped to GlobalISel");
- const OperandMatcher &OM = Rule.getOperandMatcher(DstChild->getName());
+ const OperandMatcher &OM = Rule.getOperandMatcher(DstChild.getName());
DstMIBuilder.addRenderer<RenderComplexPatternOperand>(
- *ComplexPattern->second, DstChild->getName(),
+ *ComplexPattern->second, DstChild.getName(),
OM.getAllocatedTemporariesBaseID());
return InsertPt;
}
@@ -1307,10 +1306,10 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
// Handle the case where the MVT/register class is omitted in the dest pattern
// but MVT exists in the source pattern.
- if (isa<UnsetInit>(DstChild->getLeafValue())) {
- for (unsigned NumOp = 0; NumOp < Src->getNumChildren(); NumOp++)
- if (Src->getChild(NumOp)->getName() == DstChild->getName()) {
- DstMIBuilder.addRenderer<CopyRenderer>(Src->getChild(NumOp)->getName());
+ if (isa<UnsetInit>(DstChild.getLeafValue())) {
+ for (unsigned NumOp = 0; NumOp < Src.getNumChildren(); NumOp++)
+ if (Src.getChild(NumOp).getName() == DstChild.getName()) {
+ DstMIBuilder.addRenderer<CopyRenderer>(Src.getChild(NumOp).getName());
return InsertPt;
}
}
@@ -1318,8 +1317,8 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
}
Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
- RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode *Src,
- const TreePatternNode *Dst) {
+ RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode &Src,
+ const TreePatternNode &Dst) {
auto InsertPtOrError = createInstructionRenderer(M.actions_end(), M, Dst);
if (auto Error = InsertPtOrError.takeError())
return std::move(Error);
@@ -1353,8 +1352,8 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
Expected<action_iterator>
GlobalISelEmitter::createAndImportSubInstructionRenderer(
- const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst,
- const TreePatternNode *Src, unsigned TempRegID) {
+ const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst,
+ const TreePatternNode &Src, unsigned TempRegID) {
auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst);
// TODO: Assert there's exactly one result.
@@ -1376,15 +1375,15 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
// We need to make sure that when we import an INSERT_SUBREG as a
// subinstruction that it ends up being constrained to the correct super
// register and subregister classes.
- auto OpName = Target.getInstruction(Dst->getOperator()).TheDef->getName();
+ auto OpName = Target.getInstruction(Dst.getOperator()).TheDef->getName();
if (OpName == "INSERT_SUBREG") {
- auto SubClass = inferRegClassFromPattern(Dst->getChild(1));
+ auto SubClass = inferRegClassFromPattern(Dst.getChild(1));
if (!SubClass)
return failedImport(
"Cannot infer register class from INSERT_SUBREG operand #1");
std::optional<const CodeGenRegisterClass *> SuperClass =
- inferSuperRegisterClassForNode(Dst->getExtType(0), Dst->getChild(0),
- Dst->getChild(2));
+ inferSuperRegisterClassForNode(Dst.getExtType(0), Dst.getChild(0),
+ Dst.getChild(2));
if (!SuperClass)
return failedImport(
"Cannot infer register class for INSERT_SUBREG operand #0");
@@ -1404,12 +1403,12 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
// instructions, the result register class is controlled by the
// subregisters of the operand. As a result, we must constrain the result
// class rather than check that it's already the right one.
- auto SuperClass = inferRegClassFromPattern(Dst->getChild(0));
+ auto SuperClass = inferRegClassFromPattern(Dst.getChild(0));
if (!SuperClass)
return failedImport(
"Cannot infer register class from EXTRACT_SUBREG operand #0");
- auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1));
+ auto SubIdx = inferSubRegIndexForNode(Dst.getChild(1));
if (!SubIdx)
return failedImport("EXTRACT_SUBREG child #1 is not a subreg index");
@@ -1429,12 +1428,12 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
// Similar to INSERT_SUBREG, we also have to handle SUBREG_TO_REG as a
// subinstruction.
if (OpName == "SUBREG_TO_REG") {
- auto SubClass = inferRegClassFromPattern(Dst->getChild(1));
+ auto SubClass = inferRegClassFromPattern(Dst.getChild(1));
if (!SubClass)
return failedImport(
"Cannot infer register class from SUBREG_TO_REG child #1");
auto SuperClass =
- inferSuperRegisterClass(Dst->getExtType(0), Dst->getChild(2));
+ inferSuperRegisterClass(Dst.getExtType(0), Dst.getChild(2));
if (!SuperClass)
return failedImport(
"Cannot infer register class for SUBREG_TO_REG operand #0");
@@ -1446,13 +1445,13 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
}
if (OpName == "REG_SEQUENCE") {
- auto SuperClass = inferRegClassFromPattern(Dst->getChild(0));
+ auto SuperClass = inferRegClassFromPattern(Dst.getChild(0));
M.insertAction<ConstrainOperandToRegClassAction>(
InsertPt, DstMIBuilder.getInsnID(), 0, **SuperClass);
- unsigned Num = Dst->getNumChildren();
+ unsigned Num = Dst.getNumChildren();
for (unsigned I = 1; I != Num; I += 2) {
- const TreePatternNode *SubRegChild = Dst->getChild(I + 1);
+ const TreePatternNode &SubRegChild = Dst.getChild(I + 1);
auto SubIdx = inferSubRegIndexForNode(SubRegChild);
if (!SubIdx)
@@ -1474,8 +1473,8 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
}
Expected<action_iterator> GlobalISelEmitter::createInstructionRenderer(
- action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst) {
- Record *DstOp = Dst->getOperator();
+ action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) {
+ Record *DstOp = Dst.getOperator();
if (!DstOp->isSubClassOf("Instruction")) {
if (DstOp->isSubClassOf("ValueType"))
return failedImport(
@@ -1496,9 +1495,9 @@ Expected<action_iterator> GlobalISelEmitter::createInstructionRenderer(
Expected<action_iterator> GlobalISelEmitter::importExplicitDefRenderers(
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
- const TreePatternNode *Src, const TreePatternNode *Dst) {
+ const TreePatternNode &Src, const TreePatternNode &Dst) {
const CodeGenInstruction *DstI = DstMIBuilder.getCGI();
- const unsigned SrcNumDefs = Src->getExtTypes().size();
+ const unsigned SrcNumDefs = Src.getExtTypes().size();
const unsigned DstNumDefs = DstI->Operands.NumDefs;
if (DstNumDefs == 0)
return InsertPt;
@@ -1513,11 +1512,11 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitDefRenderers(
// Some instructions have multiple defs, but are missing a type entry
// (e.g. s_cc_out operands).
- if (Dst->getExtTypes().size() < DstNumDefs)
+ if (Dst.getExtTypes().size() < DstNumDefs)
return failedImport("unhandled discarded def");
for (unsigned I = SrcNumDefs; I < DstNumDefs; ++I) {
- const TypeSetByHwMode &ExtTy = Dst->getExtType(I);
+ const TypeSetByHwMode &ExtTy = Dst.getExtType(I);
if (!ExtTy.isMachineValueType())
return failedImport("unsupported typeset");
@@ -1536,24 +1535,24 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitDefRenderers(
Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
- const llvm::TreePatternNode *Dst, const llvm::TreePatternNode *Src) {
+ const llvm::TreePatternNode &Dst, const llvm::TreePatternNode &Src) {
const CodeGenInstruction *DstI = DstMIBuilder.getCGI();
- CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator());
+ CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst.getOperator());
StringRef Name = OrigDstI->TheDef->getName();
- unsigned ExpectedDstINumUses = Dst->getNumChildren();
+ unsigned ExpectedDstINumUses = Dst.getNumChildren();
// EXTRACT_SUBREG needs to use a subregister COPY.
if (Name == "EXTRACT_SUBREG") {
- if (!Dst->getChild(1)->isLeaf())
+ if (!Dst.getChild(1).isLeaf())
return failedImport("EXTRACT_SUBREG child #1 is not a leaf");
- DefInit *SubRegInit = dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue());
+ DefInit *SubRegInit = dyn_cast<DefInit>(Dst.getChild(1).getLeafValue());
if (!SubRegInit)
return failedImport("EXTRACT_SUBREG child #1 is not a subreg index");
CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
- const TreePatternNode *ValChild = Dst->getChild(0);
- if (!ValChild->isLeaf()) {
+ const TreePatternNode &ValChild = Dst.getChild(0);
+ if (!ValChild.isLeaf()) {
// We really have to handle the source instruction, and then insert a
// copy from the subregister.
auto ExtractSrcTy = getInstResultType(ValChild);
@@ -1574,7 +1573,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
}
// If this is a source operand, this is just a subregister copy.
- Record *RCDef = getInitValueAsRegClass(ValChild->getLeafValue());
+ Record *RCDef = getInitValueAsRegClass(ValChild.getLeafValue());
if (!RCDef)
return failedImport("EXTRACT_SUBREG child #0 could not "
"be coerced to a register class");
@@ -1589,7 +1588,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
return failedImport("EXTRACT_SUBREG requires an additional COPY");
}
- StringRef RegOperandName = Dst->getChild(0)->getName();
+ StringRef RegOperandName = Dst.getChild(0).getName();
if (const auto &SubOperand = M.getComplexSubOperand(RegOperandName)) {
DstMIBuilder.addRenderer<RenderComplexPatternOperand>(
*std::get<0>(*SubOperand), RegOperandName, std::get<1>(*SubOperand),
@@ -1602,10 +1601,10 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
}
if (Name == "REG_SEQUENCE") {
- if (!Dst->getChild(0)->isLeaf())
+ if (!Dst.getChild(0).isLeaf())
return failedImport("REG_SEQUENCE child #0 is not a leaf");
- Record *RCDef = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue());
+ Record *RCDef = getInitValueAsRegClass(Dst.getChild(0).getLeafValue());
if (!RCDef)
return failedImport("REG_SEQUENCE child #0 could not "
"be coerced to a register class");
@@ -1614,11 +1613,10 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
return failedImport("Malformed REG_SEQUENCE");
for (unsigned I = 1; I != ExpectedDstINumUses; I += 2) {
- const TreePatternNode *ValChild = Dst->getChild(I);
- const TreePatternNode *SubRegChild = Dst->getChild(I + 1);
+ const TreePatternNode &ValChild = Dst.getChild(I);
+ const TreePatternNode &SubRegChild = Dst.getChild(I + 1);
- if (DefInit *SubRegInit =
- dyn_cast<DefInit>(SubRegChild->getLeafValue())) {
+ if (DefInit *SubRegInit = dyn_cast<DefInit>(SubRegChild.getLeafValue())) {
CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
auto InsertPtOrError =
@@ -1676,7 +1674,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
// If the operand has default values, introduce them now.
if (CGP.operandHasDefault(OperandNode) &&
- (InstOpNo < NonOverridableOperands || Child >= Dst->getNumChildren())) {
+ (InstOpNo < NonOverridableOperands || Child >= Dst.getNumChildren())) {
// This is a predicate or optional def operand which the pattern has not
// overridden, or which we aren't letting it override; emit the 'default
// ops' operands.
@@ -1691,7 +1689,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
}
auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder,
- Dst->getChild(Child), Src);
+ Dst.getChild(Child), Src);
if (auto Error = InsertPtOrError.takeError())
return std::move(Error);
InsertPt = InsertPtOrError.get();
@@ -1712,14 +1710,14 @@ Error GlobalISelEmitter::importDefaultOperandRenderers(
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
const DAGDefaultOperand &DefaultOp) const {
for (const auto &Op : DefaultOp.DefaultOps) {
- const auto *N = Op.get();
- if (!N->isLeaf())
+ const auto &N = *Op;
+ if (!N.isLeaf())
return failedImport("Could not add default op");
- const auto *DefaultOp = N->getLeafValue();
+ const auto *DefaultOp = N.getLeafValue();
if (const DefInit *DefaultDefOp = dyn_cast<DefInit>(DefaultOp)) {
- std::optional<LLTCodeGen> OpTyOrNone = MVTToLLT(N->getSimpleType(0));
+ std::optional<LLTCodeGen> OpTyOrNone = MVTToLLT(N.getSimpleType(0));
auto Def = DefaultDefOp->getDef();
if (Def->getName() == "undef_tied_input") {
unsigned TempRegID = M.allocateTempRegID();
@@ -1758,10 +1756,9 @@ Error GlobalISelEmitter::importImplicitDefRenderers(
}
std::optional<const CodeGenRegisterClass *>
-GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode *Leaf) {
- assert(Leaf && "Expected node?");
- assert(Leaf->isLeaf() && "Expected leaf?");
- Record *RCRec = getInitValueAsRegClass(Leaf->getLeafValue());
+GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode &Leaf) {
+ assert(Leaf.isLeaf() && "Expected leaf?");
+ Record *RCRec = getInitValueAsRegClass(Leaf.getLeafValue());
if (!RCRec)
return std::nullopt;
CodeGenRegisterClass *RC = CGRegs.getRegClass(RCRec);
@@ -1771,20 +1768,17 @@ GlobalISelEmitter::getRegClassFromLeaf(const TreePatternNode *Leaf) {
}
std::optional<const CodeGenRegisterClass *>
-GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) {
- if (!N)
- return std::nullopt;
-
- if (N->isLeaf())
+GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) {
+ if (N.isLeaf())
return getRegClassFromLeaf(N);
// We don't have a leaf node, so we have to try and infer something. Check
// that we have an instruction that we an infer something from.
// Only handle things that produce a single type.
- if (N->getNumTypes() != 1)
+ if (N.getNumTypes() != 1)
return std::nullopt;
- Record *OpRec = N->getOperator();
+ Record *OpRec = N.getOperator();
// We only want instructions.
if (!OpRec->isSubClassOf("Instruction"))
@@ -1803,21 +1797,21 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) {
if (IsRegSequence || InstName == "COPY_TO_REGCLASS") {
// If we have a COPY_TO_REGCLASS, then we need to handle it specially. It
// has the desired register class as the first child.
- const TreePatternNode *RCChild = N->getChild(IsRegSequence ? 0 : 1);
- if (!RCChild->isLeaf())
+ const TreePatternNode &RCChild = N.getChild(IsRegSequence ? 0 : 1);
+ if (!RCChild.isLeaf())
return std::nullopt;
return getRegClassFromLeaf(RCChild);
}
if (InstName == "INSERT_SUBREG") {
- const TreePatternNode *Child0 = N->getChild(0);
- assert(Child0->getNumTypes() == 1 && "Unexpected number of types!");
- const TypeSetByHwMode &VTy = Child0->getExtType(0);
- return inferSuperRegisterClassForNode(VTy, Child0, N->getChild(2));
+ const TreePatternNode &Child0 = N.getChild(0);
+ assert(Child0.getNumTypes() == 1 && "Unexpected number of types!");
+ const TypeSetByHwMode &VTy = Child0.getExtType(0);
+ return inferSuperRegisterClassForNode(VTy, Child0, N.getChild(2));
}
if (InstName == "EXTRACT_SUBREG") {
- assert(N->getNumTypes() == 1 && "Unexpected number of types!");
- const TypeSetByHwMode &VTy = N->getExtType(0);
- return inferSuperRegisterClass(VTy, N->getChild(1));
+ assert(N.getNumTypes() == 1 && "Unexpected number of types!");
+ const TypeSetByHwMode &VTy = N.getExtType(0);
+ return inferSuperRegisterClass(VTy, N.getChild(1));
}
// Handle destination record types that we can safely infer a register class
@@ -1840,14 +1834,13 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode *N) {
std::optional<const CodeGenRegisterClass *>
GlobalISelEmitter::inferSuperRegisterClass(
- const TypeSetByHwMode &Ty, const TreePatternNode *SubRegIdxNode) {
- assert(SubRegIdxNode && "Expected subregister index node!");
+ const TypeSetByHwMode &Ty, const TreePatternNode &SubRegIdxNode) {
// We need a ValueTypeByHwMode for getSuperRegForSubReg.
if (!Ty.isValueTypeByHwMode(false))
return std::nullopt;
- if (!SubRegIdxNode->isLeaf())
+ if (!SubRegIdxNode.isLeaf())
return std::nullopt;
- DefInit *SubRegInit = dyn_cast<DefInit>(SubRegIdxNode->getLeafValue());
+ DefInit *SubRegInit = dyn_cast<DefInit>(SubRegIdxNode.getLeafValue());
if (!SubRegInit)
return std::nullopt;
CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
@@ -1864,9 +1857,8 @@ GlobalISelEmitter::inferSuperRegisterClass(
std::optional<const CodeGenRegisterClass *>
GlobalISelEmitter::inferSuperRegisterClassForNode(
- const TypeSetByHwMode &Ty, const TreePatternNode *SuperRegNode,
- const TreePatternNode *SubRegIdxNode) {
- assert(SuperRegNode && "Expected super register node!");
+ const TypeSetByHwMode &Ty, const TreePatternNode &SuperRegNode,
+ const TreePatternNode &SubRegIdxNode) {
// Check if we already have a defined register class for the super register
// node. If we do, then we should preserve that rather than inferring anything
// from the subregister index node. We can assume that whoever wrote the
@@ -1879,11 +1871,11 @@ GlobalISelEmitter::inferSuperRegisterClassForNode(
}
std::optional<CodeGenSubRegIndex *> GlobalISelEmitter::inferSubRegIndexForNode(
- const TreePatternNode *SubRegIdxNode) {
- if (!SubRegIdxNode->isLeaf())
+ const TreePatternNode &SubRegIdxNode) {
+ if (!SubRegIdxNode.isLeaf())
return std::nullopt;
- DefInit *SubRegInit = dyn_cast<DefInit>(SubRegIdxNode->getLeafValue());
+ DefInit *SubRegInit = dyn_cast<DefInit>(SubRegIdxNode.getLeafValue());
if (!SubRegInit)
return std::nullopt;
return CGRegs.getSubRegIdx(SubRegInit->getDef());
@@ -1894,9 +1886,9 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
int Score = P.getPatternComplexity(CGP);
RuleMatcher M(P.getSrcRecord()->getLoc());
RuleMatcherScores[M.getRuleID()] = Score;
- M.addAction<DebugCommentAction>(llvm::to_string(*P.getSrcPattern()) +
+ M.addAction<DebugCommentAction>(llvm::to_string(P.getSrcPattern()) +
" => " +
- llvm::to_string(*P.getDstPattern()));
+ llvm::to_string(P.getDstPattern()));
SmallVector<Record *, 4> Predicates;
P.getPredicateRecords(Predicates);
@@ -1907,8 +1899,8 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
M.addHwModeIdx(declareHwModeCheck(P.getHwModeFeatures()));
// Next, analyze the pattern operators.
- TreePatternNode *Src = P.getSrcPattern();
- TreePatternNode *Dst = P.getDstPattern();
+ TreePatternNode &Src = P.getSrcPattern();
+ TreePatternNode &Dst = P.getDstPattern();
// If the root of either pattern isn't a simple operator, ignore it.
if (auto Err = isTrivialOperatorNode(Dst))
@@ -1939,7 +1931,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
// the capture accross rules. The downside is that it would
// introduce a dependency between predicates (captures must happen
// before their first use.)
- InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher(Src->getName());
+ InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher(Src.getName());
unsigned TempOpIdx = 0;
const auto SavedFlags = M.setGISelFlags(P.getSrcRecord());
@@ -1950,8 +1942,8 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
return std::move(Error);
InstructionMatcher &InsnMatcher = InsnMatcherOrError.get();
- if (Dst->isLeaf()) {
- Record *RCDef = getInitValueAsRegClass(Dst->getLeafValue());
+ if (Dst.isLeaf()) {
+ Record *RCDef = getInitValueAsRegClass(Dst.getLeafValue());
if (RCDef) {
const CodeGenRegisterClass &RC = Target.getRegisterClass(RCDef);
@@ -1969,7 +1961,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
auto &DstMIBuilder =
M.addAction<BuildMIAction>(M.allocateOutputInsnID(), &DstI);
DstMIBuilder.addRenderer<CopyRenderer>(DstIOperand.Name);
- DstMIBuilder.addRenderer<CopyRenderer>(Dst->getName());
+ DstMIBuilder.addRenderer<CopyRenderer>(Dst.getName());
M.addAction<ConstrainOperandToRegClassAction>(0, 0, RC);
// Erase the root.
@@ -1986,7 +1978,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
}
// Start with the defined operands (i.e., the results of the root operator).
- Record *DstOp = Dst->getOperator();
+ Record *DstOp = Dst.getOperator();
if (!DstOp->isSubClassOf("Instruction"))
return failedImport("Pattern operator isn't an instruction");
@@ -1994,7 +1986,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
StringRef DstIName = DstI.TheDef->getName();
unsigned DstNumDefs = DstI.Operands.NumDefs,
- SrcNumDefs = Src->getExtTypes().size();
+ SrcNumDefs = Src.getExtTypes().size();
if (DstNumDefs < SrcNumDefs) {
if (DstNumDefs != 0)
return failedImport("Src pattern result has more defs than dst MI (" +
@@ -2017,23 +2009,23 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
unsigned OpIdx = 0;
unsigned N = std::min(DstNumDefs, SrcNumDefs);
for (unsigned I = 0; I < N; ++I) {
- const TypeSetByHwMode &VTy = Src->getExtType(I);
+ const TypeSetByHwMode &VTy = Src.getExtType(I);
const auto &DstIOperand = DstI.Operands[OpIdx];
PointerUnion<Record *, const CodeGenRegisterClass *> MatchedRC =
DstIOperand.Rec;
if (DstIName == "COPY_TO_REGCLASS") {
- MatchedRC = getInitValueAsRegClass(Dst->getChild(1)->getLeafValue());
+ MatchedRC = getInitValueAsRegClass(Dst.getChild(1).getLeafValue());
if (MatchedRC.isNull())
return failedImport(
"COPY_TO_REGCLASS operand #1 isn't a register class");
} else if (DstIName == "REG_SEQUENCE") {
- MatchedRC = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue());
+ MatchedRC = getInitValueAsRegClass(Dst.getChild(0).getLeafValue());
if (MatchedRC.isNull())
return failedImport("REG_SEQUENCE operand #0 isn't a register class");
} else if (DstIName == "EXTRACT_SUBREG") {
- auto InferredClass = inferRegClassFromPattern(Dst->getChild(0));
+ auto InferredClass = inferRegClassFromPattern(Dst.getChild(0));
if (!InferredClass)
return failedImport(
"Could not infer class for EXTRACT_SUBREG operand #0");
@@ -2042,8 +2034,8 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
// register.
MatchedRC = (*InferredClass)->getDef();
} else if (DstIName == "INSERT_SUBREG") {
- auto MaybeSuperClass = inferSuperRegisterClassForNode(
- VTy, Dst->getChild(0), Dst->getChild(2));
+ auto MaybeSuperClass =
+ inferSuperRegisterClassForNode(VTy, Dst.getChild(0), Dst.getChild(2));
if (!MaybeSuperClass)
return failedImport(
"Cannot infer register class for INSERT_SUBREG operand #0");
@@ -2052,7 +2044,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
// set DstIOpRec using this.
MatchedRC = *MaybeSuperClass;
} else if (DstIName == "SUBREG_TO_REG") {
- auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst->getChild(2));
+ auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst.getChild(2));
if (!MaybeRegClass)
return failedImport(
"Cannot infer register class for SUBREG_TO_REG operand #0");
@@ -2060,8 +2052,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
} else if (MatchedRC.get<Record *>()->isSubClassOf("RegisterOperand"))
MatchedRC = MatchedRC.get<Record *>()->getValueAsDef("RegClass");
else if (!MatchedRC.get<Record *>()->isSubClassOf("RegisterClass"))
- return failedImport("Dst MI def isn't a register class" +
- to_string(*Dst));
+ return failedImport("Dst MI def isn't a register class" + to_string(Dst));
OperandMatcher &OM = InsnMatcher.getOperand(OpIdx);
// The operand names declared in the DstI instruction are unrelated to
@@ -2095,8 +2086,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
if (DstIName == "COPY_TO_REGCLASS") {
// COPY_TO_REGCLASS does not provide operand constraints itself but the
// result is constrained to the class given by the second child.
- Record *DstIOpRec =
- getInitValueAsRegClass(Dst->getChild(1)->getLeafValue());
+ Record *DstIOpRec = getInitValueAsRegClass(Dst.getChild(1).getLeafValue());
if (DstIOpRec == nullptr)
return failedImport("COPY_TO_REGCLASS operand #1 isn't a register class");
@@ -2104,12 +2094,12 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
M.addAction<ConstrainOperandToRegClassAction>(
0, 0, Target.getRegisterClass(DstIOpRec));
} else if (DstIName == "EXTRACT_SUBREG") {
- auto SuperClass = inferRegClassFromPattern(Dst->getChild(0));
+ auto SuperClass = inferRegClassFromPattern(Dst.getChild(0));
if (!SuperClass)
return failedImport(
"Cannot infer register class from EXTRACT_SUBREG operand #0");
- auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1));
+ auto SubIdx = inferSubRegIndexForNode(Dst.getChild(1));
if (!SubIdx)
return failedImport("EXTRACT_SUBREG child #1 is not a subreg index");
@@ -2119,7 +2109,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
//
// FIXME: This may introduce an extra copy if the chosen class doesn't
// actually contain the subregisters.
- assert(Src->getExtTypes().size() == 1 &&
+ assert(Src.getExtTypes().size() == 1 &&
"Expected Src of EXTRACT_SUBREG to have one result type");
const auto SrcRCDstRCPair =
@@ -2134,16 +2124,16 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
*SrcRCDstRCPair->second);
M.addAction<ConstrainOperandToRegClassAction>(0, 1, *SrcRCDstRCPair->first);
} else if (DstIName == "INSERT_SUBREG") {
- assert(Src->getExtTypes().size() == 1 &&
+ assert(Src.getExtTypes().size() == 1 &&
"Expected Src of INSERT_SUBREG to have one result type");
// We need to constrain the destination, a super regsister source, and a
// subregister source.
- auto SubClass = inferRegClassFromPattern(Dst->getChild(1));
+ auto SubClass = inferRegClassFromPattern(Dst.getChild(1));
if (!SubClass)
return failedImport(
"Cannot infer register class from INSERT_SUBREG operand #1");
auto SuperClass = inferSuperRegisterClassForNode(
- Src->getExtType(0), Dst->getChild(0), Dst->getChild(2));
+ Src.getExtType(0), Dst.getChild(0), Dst.getChild(2));
if (!SuperClass)
return failedImport(
"Cannot infer register class for INSERT_SUBREG operand #0");
@@ -2152,32 +2142,32 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
M.addAction<ConstrainOperandToRegClassAction>(0, 2, **SubClass);
} else if (DstIName == "SUBREG_TO_REG") {
// We need to constrain the destination and subregister source.
- assert(Src->getExtTypes().size() == 1 &&
+ assert(Src.getExtTypes().size() == 1 &&
"Expected Src of SUBREG_TO_REG to have one result type");
// Attempt to infer the subregister source from the first child. If it has
// an explicitly given register class, we'll use that. Otherwise, we will
// fail.
- auto SubClass = inferRegClassFromPattern(Dst->getChild(1));
+ auto SubClass = inferRegClassFromPattern(Dst.getChild(1));
if (!SubClass)
return failedImport(
"Cannot infer register class from SUBREG_TO_REG child #1");
// We don't have a child to look at that might have a super register node.
auto SuperClass =
- inferSuperRegisterClass(Src->getExtType(0), Dst->getChild(2));
+ inferSuperRegisterClass(Src.getExtType(0), Dst.getChild(2));
if (!SuperClass)
return failedImport(
"Cannot infer register class for SUBREG_TO_REG operand #0");
M.addAction<ConstrainOperandToRegClassAction>(0, 0, **SuperClass);
M.addAction<ConstrainOperandToRegClassAction>(0, 2, **SubClass);
} else if (DstIName == "REG_SEQUENCE") {
- auto SuperClass = inferRegClassFromPattern(Dst->getChild(0));
+ auto SuperClass = inferRegClassFromPattern(Dst.getChild(0));
M.addAction<ConstrainOperandToRegClassAction>(0, 0, **SuperClass);
- unsigned Num = Dst->getNumChildren();
+ unsigned Num = Dst.getNumChildren();
for (unsigned I = 1; I != Num; I += 2) {
- TreePatternNode *SubRegChild = Dst->getChild(I + 1);
+ TreePatternNode &SubRegChild = Dst.getChild(I + 1);
auto SubIdx = inferSubRegIndexForNode(SubRegChild);
if (!SubIdx)
diff --git a/llvm/utils/TableGen/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/GlobalISelMatchTable.cpp
index f7166ead9adc..d1bdc30849a7 100644
--- a/llvm/utils/TableGen/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/GlobalISelMatchTable.cpp
@@ -545,8 +545,8 @@ void GroupMatcher::optimize() {
if (T != E)
F = ++T;
}
- optimizeRules<GroupMatcher>(Matchers, MatcherStorage).swap(Matchers);
- optimizeRules<SwitchMatcher>(Matchers, MatcherStorage).swap(Matchers);
+ Matchers = optimizeRules<GroupMatcher>(Matchers, MatcherStorage);
+ Matchers = optimizeRules<SwitchMatcher>(Matchers, MatcherStorage);
}
//===- SwitchMatcher ------------------------------------------------------===//
diff --git a/llvm/utils/TableGen/InfoByHwMode.cpp b/llvm/utils/TableGen/InfoByHwMode.cpp
index 7e4ab5346621..6d9a35a68e13 100644
--- a/llvm/utils/TableGen/InfoByHwMode.cpp
+++ b/llvm/utils/TableGen/InfoByHwMode.cpp
@@ -11,8 +11,8 @@
// data).
//===----------------------------------------------------------------------===//
-#include "CodeGenTarget.h"
#include "InfoByHwMode.h"
+#include "CodeGenTarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Debug.h"
@@ -44,7 +44,7 @@ ValueTypeByHwMode::ValueTypeByHwMode(Record *R, MVT T) : ValueTypeByHwMode(T) {
PtrAddrSpace = R->getValueAsInt("AddrSpace");
}
-bool ValueTypeByHwMode::operator== (const ValueTypeByHwMode &T) const {
+bool ValueTypeByHwMode::operator==(const ValueTypeByHwMode &T) const {
assert(isValid() && T.isValid() && "Invalid type in assignment");
bool Simple = isSimple();
if (Simple != T.isSimple())
@@ -55,7 +55,7 @@ bool ValueTypeByHwMode::operator== (const ValueTypeByHwMode &T) const {
return Map == T.Map;
}
-bool ValueTypeByHwMode::operator< (const ValueTypeByHwMode &T) const {
+bool ValueTypeByHwMode::operator<(const ValueTypeByHwMode &T) const {
assert(isValid() && T.isValid() && "Invalid type in comparison");
// Default order for maps.
return Map < T.Map;
@@ -86,7 +86,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
return;
}
- std::vector<const PairType*> Pairs;
+ std::vector<const PairType *> Pairs;
for (const auto &P : Map)
Pairs.push_back(&P);
llvm::sort(Pairs, deref<std::less<PairType>>());
@@ -100,9 +100,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
}
LLVM_DUMP_METHOD
-void ValueTypeByHwMode::dump() const {
- dbgs() << *this << '\n';
-}
+void ValueTypeByHwMode::dump() const { dbgs() << *this << '\n'; }
ValueTypeByHwMode llvm::getValueTypeByHwMode(Record *Rec,
const CodeGenHwModes &CGH) {
@@ -123,24 +121,22 @@ RegSizeInfo::RegSizeInfo(Record *R, const CodeGenHwModes &CGH) {
SpillAlignment = R->getValueAsInt("SpillAlignment");
}
-bool RegSizeInfo::operator< (const RegSizeInfo &I) const {
+bool RegSizeInfo::operator<(const RegSizeInfo &I) const {
return std::tie(RegSize, SpillSize, SpillAlignment) <
std::tie(I.RegSize, I.SpillSize, I.SpillAlignment);
}
bool RegSizeInfo::isSubClassOf(const RegSizeInfo &I) const {
- return RegSize <= I.RegSize &&
- SpillAlignment && I.SpillAlignment % SpillAlignment == 0 &&
- SpillSize <= I.SpillSize;
+ return RegSize <= I.RegSize && SpillAlignment &&
+ I.SpillAlignment % SpillAlignment == 0 && SpillSize <= I.SpillSize;
}
void RegSizeInfo::writeToStream(raw_ostream &OS) const {
- OS << "[R=" << RegSize << ",S=" << SpillSize
- << ",A=" << SpillAlignment << ']';
+ OS << "[R=" << RegSize << ",S=" << SpillSize << ",A=" << SpillAlignment
+ << ']';
}
-RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R,
- const CodeGenHwModes &CGH) {
+RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R, const CodeGenHwModes &CGH) {
const HwModeSelect &MS = CGH.getHwModeSelect(R);
for (const HwModeSelect::PairType &P : MS.Items) {
auto I = Map.insert({P.first, RegSizeInfo(P.second, CGH)});
@@ -149,12 +145,12 @@ RegSizeInfoByHwMode::RegSizeInfoByHwMode(Record *R,
}
}
-bool RegSizeInfoByHwMode::operator< (const RegSizeInfoByHwMode &I) const {
+bool RegSizeInfoByHwMode::operator<(const RegSizeInfoByHwMode &I) const {
unsigned M0 = Map.begin()->first;
return get(M0) < I.get(M0);
}
-bool RegSizeInfoByHwMode::operator== (const RegSizeInfoByHwMode &I) const {
+bool RegSizeInfoByHwMode::operator==(const RegSizeInfoByHwMode &I) const {
unsigned M0 = Map.begin()->first;
return get(M0) == I.get(M0);
}
@@ -164,8 +160,8 @@ bool RegSizeInfoByHwMode::isSubClassOf(const RegSizeInfoByHwMode &I) const {
return get(M0).isSubClassOf(I.get(M0));
}
-bool RegSizeInfoByHwMode::hasStricterSpillThan(const RegSizeInfoByHwMode &I)
- const {
+bool RegSizeInfoByHwMode::hasStricterSpillThan(
+ const RegSizeInfoByHwMode &I) const {
unsigned M0 = Map.begin()->first;
const RegSizeInfo &A0 = get(M0);
const RegSizeInfo &B0 = I.get(M0);
@@ -175,7 +171,7 @@ bool RegSizeInfoByHwMode::hasStricterSpillThan(const RegSizeInfoByHwMode &I)
void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
typedef typename decltype(Map)::value_type PairType;
- std::vector<const PairType*> Pairs;
+ std::vector<const PairType *> Pairs;
for (const auto &P : Map)
Pairs.push_back(&P);
llvm::sort(Pairs, deref<std::less<PairType>>());
@@ -187,7 +183,8 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const {
OS << '}';
}
-EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH) {
+EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R,
+ const CodeGenHwModes &CGH) {
const HwModeSelect &MS = CGH.getHwModeSelect(R);
for (const HwModeSelect::PairType &P : MS.Items) {
assert(P.second && P.second->isSubClassOf("InstructionEncoding") &&
@@ -199,18 +196,18 @@ EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH)
}
namespace llvm {
- raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) {
- T.writeToStream(OS);
- return OS;
- }
+raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) {
+ T.writeToStream(OS);
+ return OS;
+}
- raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T) {
- T.writeToStream(OS);
- return OS;
- }
+raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T) {
+ T.writeToStream(OS);
+ return OS;
+}
- raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T) {
- T.writeToStream(OS);
- return OS;
- }
+raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T) {
+ T.writeToStream(OS);
+ return OS;
}
+} // namespace llvm
diff --git a/llvm/utils/TableGen/InfoByHwMode.h b/llvm/utils/TableGen/InfoByHwMode.h
index 4692ab26db95..5f5329584148 100644
--- a/llvm/utils/TableGen/InfoByHwMode.h
+++ b/llvm/utils/TableGen/InfoByHwMode.h
@@ -40,8 +40,7 @@ enum : unsigned {
};
template <typename InfoT>
-void union_modes(const InfoByHwMode<InfoT> &A,
- const InfoByHwMode<InfoT> &B,
+void union_modes(const InfoByHwMode<InfoT> &A, const InfoByHwMode<InfoT> &B,
SmallVectorImpl<unsigned> &Modes) {
auto AI = A.begin();
auto BI = B.begin();
@@ -85,9 +84,8 @@ void union_modes(const InfoByHwMode<InfoT> &A,
Modes.push_back(DefaultMode);
}
-template <typename InfoT>
-struct InfoByHwMode {
- typedef std::map<unsigned,InfoT> MapType;
+template <typename InfoT> struct InfoByHwMode {
+ typedef std::map<unsigned, InfoT> MapType;
typedef typename MapType::value_type PairType;
typedef typename MapType::iterator iterator;
typedef typename MapType::const_iterator const_iterator;
@@ -98,11 +96,11 @@ struct InfoByHwMode {
LLVM_ATTRIBUTE_ALWAYS_INLINE
iterator begin() { return Map.begin(); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
- iterator end() { return Map.end(); }
+ iterator end() { return Map.end(); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
const_iterator begin() const { return Map.begin(); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
- const_iterator end() const { return Map.end(); }
+ const_iterator end() const { return Map.end(); }
LLVM_ATTRIBUTE_ALWAYS_INLINE
bool empty() const { return Map.empty(); }
@@ -156,15 +154,13 @@ protected:
struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH);
ValueTypeByHwMode(Record *R, MVT T);
- ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode,T}); }
+ ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode, T}); }
ValueTypeByHwMode() = default;
- bool operator== (const ValueTypeByHwMode &T) const;
- bool operator< (const ValueTypeByHwMode &T) const;
+ bool operator==(const ValueTypeByHwMode &T) const;
+ bool operator<(const ValueTypeByHwMode &T) const;
- bool isValid() const {
- return !Map.empty();
- }
+ bool isValid() const { return !Map.empty(); }
MVT getType(unsigned Mode) const { return get(Mode); }
MVT &getOrCreateTypeForMode(unsigned Mode, MVT Type);
@@ -178,8 +174,7 @@ struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
}
};
-ValueTypeByHwMode getValueTypeByHwMode(Record *Rec,
- const CodeGenHwModes &CGH);
+ValueTypeByHwMode getValueTypeByHwMode(Record *Rec, const CodeGenHwModes &CGH);
struct RegSizeInfo {
unsigned RegSize;
@@ -188,14 +183,12 @@ struct RegSizeInfo {
RegSizeInfo(Record *R, const CodeGenHwModes &CGH);
RegSizeInfo() = default;
- bool operator< (const RegSizeInfo &I) const;
- bool operator== (const RegSizeInfo &I) const {
+ bool operator<(const RegSizeInfo &I) const;
+ bool operator==(const RegSizeInfo &I) const {
return std::tie(RegSize, SpillSize, SpillAlignment) ==
std::tie(I.RegSize, I.SpillSize, I.SpillAlignment);
}
- bool operator!= (const RegSizeInfo &I) const {
- return !(*this == I);
- }
+ bool operator!=(const RegSizeInfo &I) const { return !(*this == I); }
bool isSubClassOf(const RegSizeInfo &I) const;
void writeToStream(raw_ostream &OS) const;
@@ -204,9 +197,9 @@ struct RegSizeInfo {
struct RegSizeInfoByHwMode : public InfoByHwMode<RegSizeInfo> {
RegSizeInfoByHwMode(Record *R, const CodeGenHwModes &CGH);
RegSizeInfoByHwMode() = default;
- bool operator< (const RegSizeInfoByHwMode &VI) const;
- bool operator== (const RegSizeInfoByHwMode &VI) const;
- bool operator!= (const RegSizeInfoByHwMode &VI) const {
+ bool operator<(const RegSizeInfoByHwMode &VI) const;
+ bool operator==(const RegSizeInfoByHwMode &VI) const;
+ bool operator!=(const RegSizeInfoByHwMode &VI) const {
return !(*this == VI);
}
@@ -224,7 +217,7 @@ raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T);
raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T);
raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T);
-struct EncodingInfoByHwMode : public InfoByHwMode<Record*> {
+struct EncodingInfoByHwMode : public InfoByHwMode<Record *> {
EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH);
EncodingInfoByHwMode() = default;
};
diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp
index 616e7b589288..efabf6bb7ba6 100644
--- a/llvm/utils/TableGen/InstrDocsEmitter.cpp
+++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -44,11 +44,18 @@ static std::string escapeForRST(StringRef Str) {
for (char C : Str) {
switch (C) {
// We want special characters to be shown as their C escape codes.
- case '\n': Result += "\\n"; break;
- case '\t': Result += "\\t"; break;
+ case '\n':
+ Result += "\\n";
+ break;
+ case '\t':
+ Result += "\\t";
+ break;
// Underscore at the end of a line has a special meaning in rst.
- case '_': Result += "\\_"; break;
- default: Result += C;
+ case '_':
+ Result += "\\_";
+ break;
+ default:
+ Result += C;
}
}
return Result;
@@ -96,7 +103,10 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
std::vector<const char *> FlagStrings;
#define xstr(s) str(s)
#define str(s) #s
-#define FLAG(f) if (II->f) { FlagStrings.push_back(str(f)); }
+#define FLAG(f) \
+ if (II->f) { \
+ FlagStrings.push_back(str(f)); \
+ }
FLAG(isReturn)
FLAG(isEHScopeReturn)
FLAG(isBranch)
@@ -111,9 +121,9 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
FLAG(isTrap)
FLAG(canFoldAsLoad)
FLAG(mayLoad)
- //FLAG(mayLoad_Unset) // Deliberately omitted.
+ // FLAG(mayLoad_Unset) // Deliberately omitted.
FLAG(mayStore)
- //FLAG(mayStore_Unset) // Deliberately omitted.
+ // FLAG(mayStore_Unset) // Deliberately omitted.
FLAG(isPredicable)
FLAG(isConvertibleToThreeAddress)
FLAG(isCommutable)
@@ -125,7 +135,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
FLAG(hasCtrlDep)
FLAG(isNotDuplicable)
FLAG(hasSideEffects)
- //FLAG(hasSideEffects_Unset) // Deliberately omitted.
+ // FLAG(hasSideEffects_Unset) // Deliberately omitted.
FLAG(isAsCheapAsAMove)
FLAG(hasExtraSrcRegAllocReq)
FLAG(hasExtraDefRegAllocReq)
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index dbc5c22f2387..2d08447429d9 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -53,8 +53,8 @@ class InstrInfoEmitter {
const CodeGenSchedModels &SchedModels;
public:
- InstrInfoEmitter(RecordKeeper &R):
- Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
+ InstrInfoEmitter(RecordKeeper &R)
+ : Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {}
// run - Output the instruction set description.
void run(raw_ostream &OS);
@@ -69,8 +69,8 @@ private:
/// The keys of this map are maps which have OpName enum values as their keys
/// and instruction operand indices as their values. The values of this map
/// are lists of instruction names.
- typedef std::map<std::map<unsigned, unsigned>,
- std::vector<std::string>> OpNameMapTy;
+ typedef std::map<std::map<unsigned, unsigned>, std::vector<std::string>>
+ OpNameMapTy;
typedef std::map<std::string, unsigned>::iterator StrUintMapIter;
/// Generate member functions in the target-specific GenInstrInfo class.
@@ -94,13 +94,14 @@ private:
void emitOperandTypeMappings(
raw_ostream &OS, const CodeGenTarget &Target,
ArrayRef<const CodeGenInstruction *> NumberedInstructions);
- void initOperandMapData(
- ArrayRef<const CodeGenInstruction *> NumberedInstructions,
- StringRef Namespace,
- std::map<std::string, unsigned> &Operands,
- OpNameMapTy &OperandMap);
- void emitOperandNameMappings(raw_ostream &OS, const CodeGenTarget &Target,
- ArrayRef<const CodeGenInstruction*> NumberedInstructions);
+ void
+ initOperandMapData(ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+ StringRef Namespace,
+ std::map<std::string, unsigned> &Operands,
+ OpNameMapTy &OperandMap);
+ void emitOperandNameMappings(
+ raw_ostream &OS, const CodeGenTarget &Target,
+ ArrayRef<const CodeGenInstruction *> NumberedInstructions);
void emitLogicalOperandSizeMappings(
raw_ostream &OS, StringRef Namespace,
@@ -193,8 +194,7 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) {
// Fill in constraint info.
Res += ", ";
- const CGIOperandList::ConstraintInfo &Constraint =
- Op.Constraints[j];
+ const CGIOperandList::ConstraintInfo &Constraint = Op.Constraints[j];
if (Constraint.isNone())
Res += "0";
else if (Constraint.isEarlyClobber())
@@ -246,10 +246,9 @@ void InstrInfoEmitter::EmitOperandInfo(raw_ostream &OS,
/// each instructions. This is used to generate the OperandMap table as
/// well as the getNamedOperandIdx() function.
void InstrInfoEmitter::initOperandMapData(
- ArrayRef<const CodeGenInstruction *> NumberedInstructions,
- StringRef Namespace,
- std::map<std::string, unsigned> &Operands,
- OpNameMapTy &OperandMap) {
+ ArrayRef<const CodeGenInstruction *> NumberedInstructions,
+ StringRef Namespace, std::map<std::string, unsigned> &Operands,
+ OpNameMapTy &OperandMap) {
unsigned NumOperands = 0;
for (const CodeGenInstruction *Inst : NumberedInstructions) {
if (!Inst->TheDef->getValueAsBit("UseNamedOperandTable"))
@@ -259,13 +258,13 @@ void InstrInfoEmitter::initOperandMapData(
StrUintMapIter I = Operands.find(Info.Name);
if (I == Operands.end()) {
- I = Operands.insert(Operands.begin(),
- std::pair<std::string, unsigned>(Info.Name, NumOperands++));
+ I = Operands.insert(Operands.begin(), std::pair<std::string, unsigned>(
+ Info.Name, NumOperands++));
}
OpList[I->second] = Info.MIOperandNo;
}
- OperandMap[OpList].push_back(Namespace.str() + "::" +
- Inst->TheDef->getName().str());
+ OperandMap[OpList].push_back(Namespace.str() +
+ "::" + Inst->TheDef->getName().str());
}
}
@@ -280,9 +279,9 @@ void InstrInfoEmitter::initOperandMapData(
/// - A function called getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
/// for looking up the operand index for an instruction, given a value from
/// OpName enum
-void InstrInfoEmitter::emitOperandNameMappings(raw_ostream &OS,
- const CodeGenTarget &Target,
- ArrayRef<const CodeGenInstruction*> NumberedInstructions) {
+void InstrInfoEmitter::emitOperandNameMappings(
+ raw_ostream &OS, const CodeGenTarget &Target,
+ ArrayRef<const CodeGenInstruction *> NumberedInstructions) {
StringRef Namespace = Target.getInstNamespace();
std::string OpNameNS = "OpName";
// Map of operand names to their enumeration value. This will be used to
@@ -380,7 +379,8 @@ void InstrInfoEmitter::emitOperandTypeMappings(
}
}
- OS << " OPERAND_TYPE_LIST_END" << "\n};\n";
+ OS << " OPERAND_TYPE_LIST_END"
+ << "\n};\n";
OS << "} // end namespace OpTypes\n";
OS << "} // end namespace " << Namespace << "\n";
OS << "} // end namespace llvm\n";
@@ -685,7 +685,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
for (const Record *Rec : TIIPredicates) {
OS << "bool " << Rec->getValueAsString("FunctionName")
- << "(const MCInst &MI);\n";
+ << "(const MCInst &MI);\n";
}
OS << "void verifyInstructionPredicates(unsigned Opcode, const FeatureBitset "
@@ -939,7 +939,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
// Collect all of the instruction's implicit uses and defs.
Records.startTimer("Collect uses/defs");
- std::map<std::vector<Record*>, unsigned> EmittedLists;
+ std::map<std::vector<Record *>, unsigned> EmittedLists;
std::vector<std::vector<Record *>> ImplicitLists;
unsigned ImplicitListSize = 0;
for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) {
@@ -1017,7 +1017,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
InstrNames.emitStringLiteralDef(OS, Twine("extern const char ") + TargetName +
"InstrNameData[]");
- OS << "extern const unsigned " << TargetName <<"InstrNameIndices[] = {";
+ OS << "extern const unsigned " << TargetName << "InstrNameIndices[] = {";
Num = 0;
for (const CodeGenInstruction *Inst : NumberedInstructions) {
// Newline every eight entries.
@@ -1104,7 +1104,6 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
"unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n"
<< " ~" << ClassName << "() override = default;\n";
-
OS << "\n};\n} // end namespace llvm\n";
OS << "#endif // GET_INSTRINFO_HEADER\n\n";
@@ -1180,8 +1179,8 @@ void InstrInfoEmitter::emitRecord(
int MinOperands = 0;
if (!Inst.Operands.empty())
// Each logical operand can be multiple MI operands.
- MinOperands = Inst.Operands.back().MIOperandNo +
- Inst.Operands.back().MINumOperands;
+ MinOperands =
+ Inst.Operands.back().MIOperandNo + Inst.Operands.back().MINumOperands;
OS << " { ";
OS << Num << ",\t" << MinOperands << ",\t" << Inst.Operands.NumDefs << ",\t"
@@ -1202,49 +1201,88 @@ void InstrInfoEmitter::emitRecord(
OS << OperandInfoMap.find(OperandInfo)->second << ",\t0";
// Emit all of the target independent flags...
- if (Inst.isPreISelOpcode) OS << "|(1ULL<<MCID::PreISelOpcode)";
- if (Inst.isPseudo) OS << "|(1ULL<<MCID::Pseudo)";
- if (Inst.isMeta) OS << "|(1ULL<<MCID::Meta)";
- if (Inst.isReturn) OS << "|(1ULL<<MCID::Return)";
- if (Inst.isEHScopeReturn) OS << "|(1ULL<<MCID::EHScopeReturn)";
- if (Inst.isBranch) OS << "|(1ULL<<MCID::Branch)";
- if (Inst.isIndirectBranch) OS << "|(1ULL<<MCID::IndirectBranch)";
- if (Inst.isCompare) OS << "|(1ULL<<MCID::Compare)";
- if (Inst.isMoveImm) OS << "|(1ULL<<MCID::MoveImm)";
- if (Inst.isMoveReg) OS << "|(1ULL<<MCID::MoveReg)";
- if (Inst.isBitcast) OS << "|(1ULL<<MCID::Bitcast)";
- if (Inst.isAdd) OS << "|(1ULL<<MCID::Add)";
- if (Inst.isTrap) OS << "|(1ULL<<MCID::Trap)";
- if (Inst.isSelect) OS << "|(1ULL<<MCID::Select)";
- if (Inst.isBarrier) OS << "|(1ULL<<MCID::Barrier)";
- if (Inst.hasDelaySlot) OS << "|(1ULL<<MCID::DelaySlot)";
- if (Inst.isCall) OS << "|(1ULL<<MCID::Call)";
- if (Inst.canFoldAsLoad) OS << "|(1ULL<<MCID::FoldableAsLoad)";
- if (Inst.mayLoad) OS << "|(1ULL<<MCID::MayLoad)";
- if (Inst.mayStore) OS << "|(1ULL<<MCID::MayStore)";
- if (Inst.mayRaiseFPException) OS << "|(1ULL<<MCID::MayRaiseFPException)";
- if (Inst.isPredicable) OS << "|(1ULL<<MCID::Predicable)";
- if (Inst.isConvertibleToThreeAddress) OS << "|(1ULL<<MCID::ConvertibleTo3Addr)";
- if (Inst.isCommutable) OS << "|(1ULL<<MCID::Commutable)";
- if (Inst.isTerminator) OS << "|(1ULL<<MCID::Terminator)";
- if (Inst.isReMaterializable) OS << "|(1ULL<<MCID::Rematerializable)";
- if (Inst.isNotDuplicable) OS << "|(1ULL<<MCID::NotDuplicable)";
- if (Inst.Operands.hasOptionalDef) OS << "|(1ULL<<MCID::HasOptionalDef)";
- if (Inst.usesCustomInserter) OS << "|(1ULL<<MCID::UsesCustomInserter)";
- if (Inst.hasPostISelHook) OS << "|(1ULL<<MCID::HasPostISelHook)";
- if (Inst.Operands.isVariadic)OS << "|(1ULL<<MCID::Variadic)";
- if (Inst.hasSideEffects) OS << "|(1ULL<<MCID::UnmodeledSideEffects)";
- if (Inst.isAsCheapAsAMove) OS << "|(1ULL<<MCID::CheapAsAMove)";
+ if (Inst.isPreISelOpcode)
+ OS << "|(1ULL<<MCID::PreISelOpcode)";
+ if (Inst.isPseudo)
+ OS << "|(1ULL<<MCID::Pseudo)";
+ if (Inst.isMeta)
+ OS << "|(1ULL<<MCID::Meta)";
+ if (Inst.isReturn)
+ OS << "|(1ULL<<MCID::Return)";
+ if (Inst.isEHScopeReturn)
+ OS << "|(1ULL<<MCID::EHScopeReturn)";
+ if (Inst.isBranch)
+ OS << "|(1ULL<<MCID::Branch)";
+ if (Inst.isIndirectBranch)
+ OS << "|(1ULL<<MCID::IndirectBranch)";
+ if (Inst.isCompare)
+ OS << "|(1ULL<<MCID::Compare)";
+ if (Inst.isMoveImm)
+ OS << "|(1ULL<<MCID::MoveImm)";
+ if (Inst.isMoveReg)
+ OS << "|(1ULL<<MCID::MoveReg)";
+ if (Inst.isBitcast)
+ OS << "|(1ULL<<MCID::Bitcast)";
+ if (Inst.isAdd)
+ OS << "|(1ULL<<MCID::Add)";
+ if (Inst.isTrap)
+ OS << "|(1ULL<<MCID::Trap)";
+ if (Inst.isSelect)
+ OS << "|(1ULL<<MCID::Select)";
+ if (Inst.isBarrier)
+ OS << "|(1ULL<<MCID::Barrier)";
+ if (Inst.hasDelaySlot)
+ OS << "|(1ULL<<MCID::DelaySlot)";
+ if (Inst.isCall)
+ OS << "|(1ULL<<MCID::Call)";
+ if (Inst.canFoldAsLoad)
+ OS << "|(1ULL<<MCID::FoldableAsLoad)";
+ if (Inst.mayLoad)
+ OS << "|(1ULL<<MCID::MayLoad)";
+ if (Inst.mayStore)
+ OS << "|(1ULL<<MCID::MayStore)";
+ if (Inst.mayRaiseFPException)
+ OS << "|(1ULL<<MCID::MayRaiseFPException)";
+ if (Inst.isPredicable)
+ OS << "|(1ULL<<MCID::Predicable)";
+ if (Inst.isConvertibleToThreeAddress)
+ OS << "|(1ULL<<MCID::ConvertibleTo3Addr)";
+ if (Inst.isCommutable)
+ OS << "|(1ULL<<MCID::Commutable)";
+ if (Inst.isTerminator)
+ OS << "|(1ULL<<MCID::Terminator)";
+ if (Inst.isReMaterializable)
+ OS << "|(1ULL<<MCID::Rematerializable)";
+ if (Inst.isNotDuplicable)
+ OS << "|(1ULL<<MCID::NotDuplicable)";
+ if (Inst.Operands.hasOptionalDef)
+ OS << "|(1ULL<<MCID::HasOptionalDef)";
+ if (Inst.usesCustomInserter)
+ OS << "|(1ULL<<MCID::UsesCustomInserter)";
+ if (Inst.hasPostISelHook)
+ OS << "|(1ULL<<MCID::HasPostISelHook)";
+ if (Inst.Operands.isVariadic)
+ OS << "|(1ULL<<MCID::Variadic)";
+ if (Inst.hasSideEffects)
+ OS << "|(1ULL<<MCID::UnmodeledSideEffects)";
+ if (Inst.isAsCheapAsAMove)
+ OS << "|(1ULL<<MCID::CheapAsAMove)";
if (!Target.getAllowRegisterRenaming() || Inst.hasExtraSrcRegAllocReq)
OS << "|(1ULL<<MCID::ExtraSrcRegAllocReq)";
if (!Target.getAllowRegisterRenaming() || Inst.hasExtraDefRegAllocReq)
OS << "|(1ULL<<MCID::ExtraDefRegAllocReq)";
- if (Inst.isRegSequence) OS << "|(1ULL<<MCID::RegSequence)";
- if (Inst.isExtractSubreg) OS << "|(1ULL<<MCID::ExtractSubreg)";
- if (Inst.isInsertSubreg) OS << "|(1ULL<<MCID::InsertSubreg)";
- if (Inst.isConvergent) OS << "|(1ULL<<MCID::Convergent)";
- if (Inst.variadicOpsAreDefs) OS << "|(1ULL<<MCID::VariadicOpsAreDefs)";
- if (Inst.isAuthenticated) OS << "|(1ULL<<MCID::Authenticated)";
+ if (Inst.isRegSequence)
+ OS << "|(1ULL<<MCID::RegSequence)";
+ if (Inst.isExtractSubreg)
+ OS << "|(1ULL<<MCID::ExtractSubreg)";
+ if (Inst.isInsertSubreg)
+ OS << "|(1ULL<<MCID::InsertSubreg)";
+ if (Inst.isConvergent)
+ OS << "|(1ULL<<MCID::Convergent)";
+ if (Inst.variadicOpsAreDefs)
+ OS << "|(1ULL<<MCID::VariadicOpsAreDefs)";
+ if (Inst.isAuthenticated)
+ OS << "|(1ULL<<MCID::Authenticated)";
// Emit all of the target-specific flags...
BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags");
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 28604c5600bf..f7ae5ed8a6b3 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -60,8 +60,8 @@ public:
raw_ostream &OS);
void EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
void EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
- void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints, bool IsClang,
- raw_ostream &OS);
+ void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints,
+ bool IsClang, raw_ostream &OS);
};
} // End anonymous namespace
@@ -204,7 +204,7 @@ void IntrinsicEmitter::EmitIITInfo(raw_ostream &OS) {
}
void IntrinsicEmitter::EmitTargetInfo(const CodeGenIntrinsicTable &Ints,
- raw_ostream &OS) {
+ raw_ostream &OS) {
OS << "// Target mapping\n";
OS << "#ifdef GET_INTRINSIC_TARGET_DATA\n";
OS << "struct IntrinsicTargetInfo {\n"
@@ -238,10 +238,10 @@ void IntrinsicEmitter::EmitIntrinsicToOverloadTable(
OS << " 0";
for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
// Add one to the index so we emit a null bit for the invalid #0 intrinsic.
- if ((i+1)%8 == 0)
+ if ((i + 1) % 8 == 0)
OS << ",\n 0";
if (Ints[i].isOverloaded)
- OS << " | (1<<" << (i+1)%8 << ')';
+ OS << " | (1<<" << (i + 1) % 8 << ')';
}
OS << "\n};\n\n";
// OTable contains a true bit at the position if the intrinsic is overloaded.
@@ -271,7 +271,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
// capture it in this vector, otherwise store a ~0U.
std::vector<unsigned> FixedEncodings;
- SequenceToOffsetTable<std::vector<unsigned char> > LongEncodingTable;
+ SequenceToOffsetTable<std::vector<unsigned char>> LongEncodingTable;
std::vector<unsigned char> TypeSig;
@@ -292,7 +292,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
Failed = true;
break;
}
- Result = (Result << 4) | TypeSig[e-i-1];
+ Result = (Result << 4) | TypeSig[e - i - 1];
}
// If this could be encoded into a 31-bit word, return it.
@@ -330,7 +330,6 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
TypeSig.clear();
ComputeFixedEncoding(Ints[i], TypeSig);
-
// Otherwise, emit the offset into the long encoding table. We emit it this
// way so that it is easier to read the offset in the .def file.
OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", ";
@@ -344,7 +343,7 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
LongEncodingTable.emit(OS, printIITEntry);
OS << " 255\n};\n\n";
- OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL
+ OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL
}
namespace {
@@ -393,7 +392,8 @@ std::optional<bool> compareFnAttributes(const CodeGenIntrinsic *L,
// Try to order by readonly/readnone attribute.
uint32_t LK = L->ME.toIntValue();
uint32_t RK = R->ME.toIntValue();
- if (LK != RK) return (LK > RK);
+ if (LK != RK)
+ return (LK > RK);
return std::nullopt;
}
@@ -438,8 +438,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
if (!UniqArgAttributes.try_emplace(Attrs, ID).second)
continue;
- assert(is_sorted(Attrs) &&
- "Argument attributes are not sorted");
+ assert(is_sorted(Attrs) && "Argument attributes are not sorted");
OS << " case " << ID << ":\n";
OS << " return AttributeSet::get(C, {\n";
@@ -473,8 +472,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
OS << " Attribute::get(C, Attribute::ImmArg),\n";
break;
case CodeGenIntrinsic::Alignment:
- OS << " Attribute::get(C, Attribute::Alignment, "
- << Attr.Value << "),\n";
+ OS << " Attribute::get(C, Attribute::Alignment, " << Attr.Value
+ << "),\n";
break;
case CodeGenIntrinsic::Dereferenceable:
OS << " Attribute::get(C, Attribute::Dereferenceable, "
@@ -489,7 +488,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
OS << "}\n\n";
// Compute unique function attribute sets.
- std::map<const CodeGenIntrinsic*, unsigned, FnAttributeComparator>
+ std::map<const CodeGenIntrinsic *, unsigned, FnAttributeComparator>
UniqFnAttributes;
OS << "static AttributeSet getIntrinsicFnAttributeSet("
<< "LLVMContext &C, unsigned ID) {\n"
@@ -542,17 +541,18 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
OS << "AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) {\n";
// Compute the maximum number of attribute arguments and the map
- typedef std::map<const CodeGenIntrinsic*, unsigned,
- AttributeComparator> UniqAttrMapTy;
+ typedef std::map<const CodeGenIntrinsic *, unsigned, AttributeComparator>
+ UniqAttrMapTy;
UniqAttrMapTy UniqAttributes;
unsigned maxArgAttrs = 0;
unsigned AttrNum = 0;
for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
const CodeGenIntrinsic &intrinsic = Ints[i];
maxArgAttrs =
- std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
+ std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
unsigned &N = UniqAttributes[&intrinsic];
- if (N) continue;
+ if (N)
+ continue;
N = ++AttrNum;
assert(N < 65536 && "Too many unique attributes for table!");
}
@@ -564,8 +564,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
const CodeGenIntrinsic &intrinsic = Ints[i];
- OS << " " << UniqAttributes[&intrinsic] << ", // "
- << intrinsic.Name << "\n";
+ OS << " " << UniqAttributes[&intrinsic] << ", // " << intrinsic.Name
+ << "\n";
}
OS << " };\n\n";
diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp
index 257cd44d9e80..0f081197b5b8 100644
--- a/llvm/utils/TableGen/OptParserEmitter.cpp
+++ b/llvm/utils/TableGen/OptParserEmitter.cpp
@@ -196,9 +196,9 @@ static MarshallingInfo createMarshallingInfo(const Record &R) {
/// working with those options when given an input command line.
static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
// Get the option groups and options.
- const std::vector<Record*> &Groups =
- Records.getAllDerivedDefinitions("OptionGroup");
- std::vector<Record*> Opts = Records.getAllDerivedDefinitions("Option");
+ const std::vector<Record *> &Groups =
+ Records.getAllDerivedDefinitions("OptionGroup");
+ std::vector<Record *> Opts = Records.getAllDerivedDefinitions("Option");
emitSourceFileHeader("Option Parsing Definitions", OS);
@@ -423,8 +423,7 @@ static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) {
write_cstring(OS, R.getValueAsString("Values"));
else if (!isa<UnsetInit>(R.getValueInit("ValuesCode"))) {
OS << getOptionName(R) << "_Values";
- }
- else
+ } else
OS << "nullptr";
};
diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp
index 0b9b6389fe38..d0a35ff82df6 100644
--- a/llvm/utils/TableGen/PredicateExpander.cpp
+++ b/llvm/utils/TableGen/PredicateExpander.cpp
@@ -101,7 +101,6 @@ void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
OS << Reg->getName();
}
-
void PredicateExpander::expandCheckRegOperandSimple(raw_ostream &OS,
int OpIndex,
StringRef FunctionMapper) {
@@ -487,7 +486,8 @@ void STIPredicateExpander::expandPrologue(raw_ostream &OS,
OS << "unsigned ProcessorID = getSchedModel().getProcessorID();\n";
}
-void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS, const OpcodeGroup &Group,
+void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS,
+ const OpcodeGroup &Group,
bool ShouldUpdateOpcodeMask) {
const OpcodeInfo &OI = Group.getOpcodeInfo();
for (const PredicateInfo &PI : OI.getPredicates()) {
diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index e07fb9188098..7f692f29192d 100644
--- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -27,19 +27,19 @@ class PseudoLoweringEmitter {
enum MapKind { Operand, Imm, Reg };
MapKind Kind;
union {
- unsigned Operand; // Operand number mapped to.
- uint64_t Imm; // Integer immedate value.
- Record *Reg; // Physical register.
+ unsigned Operand; // Operand number mapped to.
+ uint64_t Imm; // Integer immedate value.
+ Record *Reg; // Physical register.
} Data;
};
struct PseudoExpansion {
- CodeGenInstruction Source; // The source pseudo instruction definition.
- CodeGenInstruction Dest; // The destination instruction to lower to.
+ CodeGenInstruction Source; // The source pseudo instruction definition.
+ CodeGenInstruction Dest; // The destination instruction to lower to.
IndexedMap<OpData> OperandMap;
PseudoExpansion(CodeGenInstruction &s, CodeGenInstruction &d,
- IndexedMap<OpData> &m) :
- Source(s), Dest(d), OperandMap(m) {}
+ IndexedMap<OpData> &m)
+ : Source(s), Dest(d), OperandMap(m) {}
};
RecordKeeper &Records;
@@ -57,6 +57,7 @@ class PseudoLoweringEmitter {
unsigned BaseIdx);
void evaluateExpansion(Record *Pseudo);
void emitLoweringEmitter(raw_ostream &o);
+
public:
PseudoLoweringEmitter(RecordKeeper &R) : Records(R), Target(R) {}
@@ -69,9 +70,9 @@ public:
// The pseudo expansion really should take a list of dags, not just
// a single dag, so we can do fancier things.
-unsigned PseudoLoweringEmitter::
-addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
- IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
+unsigned PseudoLoweringEmitter::addDagOperandMapping(
+ Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
+ IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
unsigned OpsAdded = 0;
for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) {
if (DefInit *DI = dyn_cast<DefInit>(Dag->getArg(i))) {
@@ -92,9 +93,9 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
// FIXME: Are the message operand types backward?
if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec) {
PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
- "', operand type '" + DI->getDef()->getName() +
- "' does not match expansion operand type '" +
- Insn.Operands[BaseIdx + i].Rec->getName() + "'");
+ "', operand type '" + DI->getDef()->getName() +
+ "' does not match expansion operand type '" +
+ Insn.Operands[BaseIdx + i].Rec->getName() + "'");
PrintFatalNote(DI->getDef(),
"Value was assigned at the following location:");
}
@@ -118,7 +119,7 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
// Just add the operands recursively. This is almost certainly
// a constant value for a complex operand (> 1 MI operand).
unsigned NewOps =
- addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i);
+ addDagOperandMapping(Rec, SubDag, Insn, OperandMap, BaseIdx + i);
OpsAdded += NewOps;
// Since we added more than one, we also need to adjust the base.
BaseIdx += NewOps - 1;
@@ -140,15 +141,15 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
DefInit *OpDef = dyn_cast<DefInit>(Dag->getOperator());
if (!OpDef) {
PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
- "', result operator is not a record");
+ "', result operator is not a record");
PrintFatalNote(Rec->getValue("ResultInst"),
"Result was assigned at the following location:");
}
Record *Operator = OpDef->getDef();
if (!Operator->isSubClassOf("Instruction")) {
PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
- "', result operator '" + Operator->getName() +
- "' is not an instruction");
+ "', result operator '" + Operator->getName() +
+ "' is not an instruction");
PrintFatalNote(Rec->getValue("ResultInst"),
"Result was assigned at the following location:");
}
@@ -157,16 +158,16 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
if (Insn.isCodeGenOnly || Insn.isPseudo) {
PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
- "', result operator '" + Operator->getName() +
- "' cannot be a pseudo instruction");
+ "', result operator '" + Operator->getName() +
+ "' cannot be a pseudo instruction");
PrintFatalNote(Rec->getValue("ResultInst"),
"Result was assigned at the following location:");
}
if (Insn.Operands.size() != Dag->getNumArgs()) {
PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
- "', result operator '" + Operator->getName() +
- "' has the wrong number of operands");
+ "', result operator '" + Operator->getName() +
+ "' has the wrong number of operands");
PrintFatalNote(Rec->getValue("ResultInst"),
"Result was assigned at the following location:");
}
@@ -201,11 +202,11 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
if (OperandMap[Insn.Operands[i].MIOperandNo].Kind != OpData::Operand)
continue;
StringMap<unsigned>::iterator SourceOp =
- SourceOperands.find(Dag->getArgNameStr(i));
+ SourceOperands.find(Dag->getArgNameStr(i));
if (SourceOp == SourceOperands.end()) {
PrintError(Rec, "In pseudo instruction '" + Rec->getName() +
- "', output operand '" + Dag->getArgNameStr(i) +
- "' has no matching source operand");
+ "', output operand '" + Dag->getArgNameStr(i) +
+ "' has no matching source operand");
PrintFatalNote(Rec->getValue("ResultInst"),
"Value was assigned at the following location:");
}
@@ -213,7 +214,7 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
// MachineInstr operand.
for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I)
OperandMap[Insn.Operands[i].MIOperandNo + I].Data.Operand =
- SourceOp->getValue();
+ SourceOp->getValue();
LLVM_DEBUG(dbgs() << " " << SourceOp->getValue() << " ==> " << i
<< "\n");
@@ -226,7 +227,8 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
// Emit file header.
emitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
- o << "bool " << Target.getName() + "AsmPrinter" << "::\n"
+ o << "bool " << Target.getName() + "AsmPrinter"
+ << "::\n"
<< "emitPseudoExpansionLowering(MCStreamer &OutStreamer,\n"
<< " const MachineInstr *MI) {\n";
@@ -236,12 +238,12 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
for (auto &Expansion : Expansions) {
CodeGenInstruction &Source = Expansion.Source;
CodeGenInstruction &Dest = Expansion.Dest;
- o << " case " << Source.Namespace << "::"
- << Source.TheDef->getName() << ": {\n"
+ o << " case " << Source.Namespace << "::" << Source.TheDef->getName()
+ << ": {\n"
<< " MCInst TmpInst;\n"
<< " MCOperand MCOp;\n"
- << " TmpInst.setOpcode(" << Dest.Namespace << "::"
- << Dest.TheDef->getName() << ");\n";
+ << " TmpInst.setOpcode(" << Dest.Namespace
+ << "::" << Dest.TheDef->getName() << ");\n";
// Copy the operands from the source instruction.
// FIXME: Instruction operands with defaults values (predicates and cc_out
@@ -252,29 +254,29 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
o << " // Operand: " << DestOperand.Name << "\n";
for (unsigned i = 0, e = DestOperand.MINumOperands; i != e; ++i) {
switch (Expansion.OperandMap[MIOpNo + i].Kind) {
- case OpData::Operand:
+ case OpData::Operand:
o << " lowerOperand(MI->getOperand("
- << Source.Operands[Expansion.OperandMap[MIOpNo].Data
- .Operand].MIOperandNo + i
+ << Source.Operands[Expansion.OperandMap[MIOpNo].Data.Operand]
+ .MIOperandNo +
+ i
<< "), MCOp);\n"
<< " TmpInst.addOperand(MCOp);\n";
break;
- case OpData::Imm:
+ case OpData::Imm:
o << " TmpInst.addOperand(MCOperand::createImm("
<< Expansion.OperandMap[MIOpNo + i].Data.Imm << "));\n";
break;
- case OpData::Reg: {
- Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
- o << " TmpInst.addOperand(MCOperand::createReg(";
- // "zero_reg" is special.
- if (Reg->getName() == "zero_reg")
- o << "0";
- else
- o << Reg->getValueAsString("Namespace") << "::"
- << Reg->getName();
- o << "));\n";
- break;
- }
+ case OpData::Reg: {
+ Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
+ o << " TmpInst.addOperand(MCOperand::createReg(";
+ // "zero_reg" is special.
+ if (Reg->getName() == "zero_reg")
+ o << "0";
+ else
+ o << Reg->getValueAsString("Namespace") << "::" << Reg->getName();
+ o << "));\n";
+ break;
+ }
}
}
MIOpNo += DestOperand.MINumOperands;
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index f851d9a79870..8b59411c5bc3 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -46,7 +46,9 @@ public:
/// Get the human-readable name for the bank.
StringRef getName() const { return TheDef.getValueAsString("Name"); }
/// Get the name of the enumerator in the ID enumeration.
- std::string getEnumeratorName() const { return (TheDef.getName() + "ID").str(); }
+ std::string getEnumeratorName() const {
+ return (TheDef.getName() + "ID").str();
+ }
/// Get the name of the array holding the register class coverage data;
std::string getCoverageArrayName() const {
@@ -212,8 +214,7 @@ static void visitRegisterBankClasses(
}
void RegisterBankEmitter::emitBaseClassImplementation(
- raw_ostream &OS, StringRef TargetName,
- std::vector<RegisterBank> &Banks) {
+ raw_ostream &OS, StringRef TargetName, std::vector<RegisterBank> &Banks) {
const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank();
const CodeGenHwModes &CGH = Target.getHwModes();
@@ -229,7 +230,8 @@ void RegisterBankEmitter::emitBaseClassImplementation(
OS << "const uint32_t " << Bank.getCoverageArrayName() << "[] = {\n";
unsigned LowestIdxInWord = 0;
for (const auto &RCs : RCsGroupedByWord) {
- OS << " // " << LowestIdxInWord << "-" << (LowestIdxInWord + 31) << "\n";
+ OS << " // " << LowestIdxInWord << "-" << (LowestIdxInWord + 31)
+ << "\n";
for (const auto &RC : RCs) {
OS << " (1u << (" << RC->getQualifiedIdName() << " - "
<< LowestIdxInWord << ")) |\n";
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index cff97771b7a8..8919e07a7547 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -101,8 +101,8 @@ private:
} // end anonymous namespace
// runEnums - Print out enum values for all of the registers.
-void RegisterInfoEmitter::runEnums(raw_ostream &OS,
- CodeGenTarget &Target, CodeGenRegBank &Bank) {
+void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target,
+ CodeGenRegBank &Bank) {
const auto &Registers = Bank.getRegisters();
// Register enums are stored as uint16_t in the tables. Make sure we'll fit.
@@ -129,7 +129,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
OS << " " << Reg.getName() << " = " << Reg.EnumValue << ",\n";
assert(Registers.size() == Registers.back().EnumValue &&
"Register enum value mismatch!");
- OS << " NUM_TARGET_REGS // " << Registers.size()+1 << "\n";
+ OS << " NUM_TARGET_REGS // " << Registers.size() + 1 << "\n";
OS << "};\n";
if (!Namespace.empty())
OS << "} // end namespace " << Namespace << "\n";
@@ -152,7 +152,8 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
OS << "} // end namespace " << Namespace << "\n\n";
}
- const std::vector<Record*> &RegAltNameIndices = Target.getRegAltNameIndices();
+ const std::vector<Record *> &RegAltNameIndices =
+ Target.getRegAltNameIndices();
// If the only definition is the default NoRegAltName, we don't need to
// emit anything.
if (RegAltNameIndices.size() > 1) {
@@ -188,7 +189,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
OS << "namespace " << Namespace << " {\n";
OS << "enum RegisterPressureSets {\n";
unsigned NumSets = Bank.getNumRegPressureSets();
- for (unsigned i = 0; i < NumSets; ++i ) {
+ for (unsigned i = 0; i < NumSets; ++i) {
const RegUnitSet &RegUnits = Bank.getRegSetAt(i);
OS << " " << RegUnits.Name << " = " << i << ",\n";
}
@@ -201,13 +202,11 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
OS << "#endif // GET_REGINFO_ENUM\n\n";
}
-static void printInt(raw_ostream &OS, int Val) {
- OS << Val;
-}
+static void printInt(raw_ostream &OS, int Val) { OS << Val; }
-void RegisterInfoEmitter::
-EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
- const std::string &ClassName) {
+void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
+ const CodeGenRegBank &RegBank,
+ const std::string &ClassName) {
unsigned NumRCs = RegBank.getRegClasses().size();
unsigned NumSets = RegBank.getNumRegPressureSets();
@@ -254,8 +253,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
}
OS << "};\n"
<< " return RUWeightTable[RegUnit];\n";
- }
- else {
+ } else {
OS << " // All register units have unit weight.\n"
<< " return 1;\n";
}
@@ -271,7 +269,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
<< "getRegPressureSetName(unsigned Idx) const {\n"
<< " static const char *PressureNameTable[] = {\n";
unsigned MaxRegUnitWeight = 0;
- for (unsigned i = 0; i < NumSets; ++i ) {
+ for (unsigned i = 0; i < NumSets; ++i) {
const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
MaxRegUnitWeight = std::max(MaxRegUnitWeight, RegUnits.Weight);
OS << " \"" << RegUnits.Name << "\",\n";
@@ -287,10 +285,10 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
"{\n"
<< " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
<< " PressureLimitTable[] = {\n";
- for (unsigned i = 0; i < NumSets; ++i ) {
+ for (unsigned i = 0; i < NumSets; ++i) {
const RegUnitSet &RegUnits = RegBank.getRegSetAt(i);
- OS << " " << RegUnits.Weight << ", \t// " << i << ": "
- << RegUnits.Name << "\n";
+ OS << " " << RegUnits.Weight << ", \t// " << i << ": " << RegUnits.Name
+ << "\n";
}
OS << " };\n"
<< " return PressureLimitTable[Idx];\n"
@@ -353,7 +351,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
<< "}\n\n";
}
-using DwarfRegNumsMapPair = std::pair<Record*, std::vector<int64_t>>;
+using DwarfRegNumsMapPair = std::pair<Record *, std::vector<int64_t>>;
using DwarfRegNumsVecTy = std::vector<DwarfRegNumsMapPair>;
static void finalizeDwarfRegNumsKeys(DwarfRegNumsVecTy &DwarfRegNums) {
@@ -419,7 +417,7 @@ void RegisterInfoEmitter::EmitRegMappingTables(
// Store the mapping sorted by the LLVM reg num so lookup can be done
// with a binary search.
- std::map<uint64_t, Record*> Dwarf2LMap;
+ std::map<uint64_t, Record *> Dwarf2LMap;
for (auto &DwarfRegNum : DwarfRegNums) {
int DwarfRegNo = DwarfRegNum.second[I];
if (DwarfRegNo < 0)
@@ -531,8 +529,8 @@ void RegisterInfoEmitter::EmitRegMapping(
else
OS << "EHFlavour";
OS << ") {\n"
- << " default:\n"
- << " llvm_unreachable(\"Unknown DWARF flavour\");\n";
+ << " default:\n"
+ << " llvm_unreachable(\"Unknown DWARF flavour\");\n";
for (unsigned i = 0, e = maxLength; i != e; ++i) {
OS << " case " << i << ":\n";
@@ -540,14 +538,14 @@ void RegisterInfoEmitter::EmitRegMapping(
if (!isCtor)
OS << "RI->";
std::string Tmp;
- raw_string_ostream(Tmp) << Namespace
- << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
- << "Dwarf2L";
+ raw_string_ostream(Tmp)
+ << Namespace << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
+ << "Dwarf2L";
OS << "mapDwarfRegsToLLVMRegs(" << Tmp << ", " << Tmp << "Size, ";
if (j == 0)
- OS << "false";
- else
- OS << "true";
+ OS << "false";
+ else
+ OS << "true";
OS << ");\n";
OS << " break;\n";
}
@@ -571,14 +569,14 @@ void RegisterInfoEmitter::EmitRegMapping(
if (!isCtor)
OS << "RI->";
std::string Tmp;
- raw_string_ostream(Tmp) << Namespace
- << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
- << "L2Dwarf";
+ raw_string_ostream(Tmp)
+ << Namespace << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
+ << "L2Dwarf";
OS << "mapLLVMRegsToDwarfRegs(" << Tmp << ", " << Tmp << "Size, ";
if (j == 0)
- OS << "false";
- else
- OS << "true";
+ OS << "false";
+ else
+ OS << "true";
OS << ");\n";
OS << " break;\n";
}
@@ -588,8 +586,7 @@ void RegisterInfoEmitter::EmitRegMapping(
// Print a BitVector as a sequence of hex numbers using a little-endian mapping.
// Width is the number of bits per hex number.
-static void printBitVectorAsHex(raw_ostream &OS,
- const BitVector &Bits,
+static void printBitVectorAsHex(raw_ostream &OS, const BitVector &Bits,
unsigned Width) {
assert(Width <= 32 && "Width too large");
unsigned Digits = (Width + 3) / 4;
@@ -604,16 +601,15 @@ static void printBitVectorAsHex(raw_ostream &OS,
// Helper to emit a set of bits into a constant byte array.
class BitVectorEmitter {
BitVector Values;
+
public:
void add(unsigned v) {
if (v >= Values.size())
- Values.resize(((v/8)+1)*8); // Round up to the next byte.
+ Values.resize(((v / 8) + 1) * 8); // Round up to the next byte.
Values[v] = true;
}
- void print(raw_ostream &OS) {
- printBitVectorAsHex(OS, Values, 8);
- }
+ void print(raw_ostream &OS) { printBitVectorAsHex(OS, Values, 8); }
};
static void printSimpleValueType(raw_ostream &OS, MVT::SimpleValueType VT) {
@@ -650,9 +646,8 @@ static DiffVec &diffEncode(DiffVec &V, SparseBitVector<> List) {
return V;
}
-template<typename Iter>
-static
-DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) {
+template <typename Iter>
+static DiffVec &diffEncode(DiffVec &V, unsigned InitVal, Iter Begin, Iter End) {
assert(V.empty() && "Clear DiffVec before diffEncode.");
unsigned Val = InitVal;
for (Iter I = Begin; I != End; ++I) {
@@ -672,7 +667,7 @@ static void printMask(raw_ostream &OS, LaneBitmask Val) {
// Try to combine Idx's compose map into Vec if it is compatible.
// Return false if it's not possible.
static bool combine(const CodeGenSubRegIndex *Idx,
- SmallVectorImpl<CodeGenSubRegIndex*> &Vec) {
+ SmallVectorImpl<CodeGenSubRegIndex *> &Vec) {
const CodeGenSubRegIndex::CompMap &Map = Idx->getComposites();
for (const auto &I : Map) {
CodeGenSubRegIndex *&Entry = Vec[I.first->EnumValue - 1];
@@ -683,17 +678,15 @@ static bool combine(const CodeGenSubRegIndex *Idx,
// All entries are compatible. Make it so.
for (const auto &I : Map) {
auto *&Entry = Vec[I.first->EnumValue - 1];
- assert((!Entry || Entry == I.second) &&
- "Expected EnumValue to be unique");
+ assert((!Entry || Entry == I.second) && "Expected EnumValue to be unique");
Entry = I.second;
}
return true;
}
-void
-RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
- CodeGenRegBank &RegBank,
- const std::string &ClName) {
+void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
+ CodeGenRegBank &RegBank,
+ const std::string &ClName) {
const auto &SubRegIndices = RegBank.getSubRegIndices();
OS << "unsigned " << ClName
<< "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n";
@@ -707,7 +700,7 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
// Map each Sub-register index to a compatible table row.
SmallVector<unsigned, 4> RowMap;
- SmallVector<SmallVector<CodeGenSubRegIndex*, 4>, 4> Rows;
+ SmallVector<SmallVector<CodeGenSubRegIndex *, 4>, 4> Rows;
auto SubRegIndicesSize =
std::distance(SubRegIndices.begin(), SubRegIndices.end());
@@ -760,10 +753,8 @@ RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
OS << "}\n\n";
}
-void
-RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
- CodeGenRegBank &RegBank,
- const std::string &ClName) {
+void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(
+ raw_ostream &OS, CodeGenRegBank &RegBank, const std::string &ClName) {
// See the comments in computeSubRegLaneMasks() for our goal here.
const auto &SubRegIndices = RegBank.getSubRegIndices();
@@ -771,8 +762,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
SmallVector<unsigned, 4> SubReg2SequenceIndexMap;
SmallVector<SmallVector<MaskRolPair, 1>, 4> Sequences;
for (const auto &Idx : SubRegIndices) {
- const SmallVector<MaskRolPair, 1> &IdxSequence
- = Idx.CompositionLaneMaskTransform;
+ const SmallVector<MaskRolPair, 1> &IdxSequence =
+ Idx.CompositionLaneMaskTransform;
unsigned Found = ~0u;
unsigned SIdx = 0;
@@ -807,7 +798,7 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
OS << format(", %2u }, ", P.RotateLeft);
}
OS << "{ LaneBitmask::getNone(), 0 }";
- if (s+1 != se)
+ if (s + 1 != se)
OS << ", ";
OS << " // Sequence " << Idx << "\n";
Idx += Sequence.size() + 1;
@@ -820,7 +811,7 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
for (size_t i = 0, e = SubRegIndices.size(); i != e; ++i) {
OS << " ";
OS << SubReg2SequenceIndexMap[i];
- if (i+1 != e)
+ if (i + 1 != e)
OS << ",";
OS << " // to " << SubRegIndices[i].getName() << "\n";
}
@@ -829,15 +820,18 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
OS << "LaneBitmask " << ClName
<< "::composeSubRegIndexLaneMaskImpl(unsigned IdxA, LaneBitmask LaneMask)"
" const {\n"
- " --IdxA; assert(IdxA < " << SubRegIndices.size()
+ " --IdxA; assert(IdxA < "
+ << SubRegIndices.size()
<< " && \"Subregister index out of bounds\");\n"
" LaneBitmask Result;\n"
" for (const MaskRolOp *Ops =\n"
" &LaneMaskComposeSequences[CompositeSequences[IdxA]];\n"
" Ops->Mask.any(); ++Ops) {\n"
- " LaneBitmask::Type M = LaneMask.getAsInteger() & Ops->Mask.getAsInteger();\n"
+ " LaneBitmask::Type M = LaneMask.getAsInteger() & "
+ "Ops->Mask.getAsInteger();\n"
" if (unsigned S = Ops->RotateLeft)\n"
- " Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - S)));\n"
+ " Result |= LaneBitmask((M << S) | (M >> (LaneBitmask::BitWidth - "
+ "S)));\n"
" else\n"
" Result |= LaneBitmask(M);\n"
" }\n"
@@ -848,7 +842,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
<< "::reverseComposeSubRegIndexLaneMaskImpl(unsigned IdxA, "
" LaneBitmask LaneMask) const {\n"
" LaneMask &= getSubRegIndexLaneMask(IdxA);\n"
- " --IdxA; assert(IdxA < " << SubRegIndices.size()
+ " --IdxA; assert(IdxA < "
+ << SubRegIndices.size()
<< " && \"Subregister index out of bounds\");\n"
" LaneBitmask Result;\n"
" for (const MaskRolOp *Ops =\n"
@@ -856,7 +851,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
" Ops->Mask.any(); ++Ops) {\n"
" LaneBitmask::Type M = LaneMask.getAsInteger();\n"
" if (unsigned S = Ops->RotateLeft)\n"
- " Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - S)));\n"
+ " Result |= LaneBitmask((M >> S) | (M << (LaneBitmask::BitWidth - "
+ "S)));\n"
" else\n"
" Result |= LaneBitmask(M);\n"
" }\n"
@@ -867,9 +863,8 @@ RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
//
// runMCDesc - Print out MC register descriptions.
//
-void
-RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
- CodeGenRegBank &RegBank) {
+void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
+ CodeGenRegBank &RegBank) {
emitSourceFileHeader("MC Register Information", OS);
OS << "\n#ifdef GET_REGINFO_MC_DESC\n";
@@ -880,7 +875,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
auto &SubRegIndices = RegBank.getSubRegIndices();
// The lists of sub-registers and super-registers go in the same array. That
// allows us to share suffixes.
- typedef std::vector<const CodeGenRegister*> RegVec;
+ typedef std::vector<const CodeGenRegister *> RegVec;
// Differentially encoded lists.
SequenceToOffsetTable<DiffVec> DiffSeqs;
@@ -894,7 +889,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
// Keep track of sub-register names as well. These are not differentially
// encoded.
- typedef SmallVector<const CodeGenSubRegIndex*, 4> SubRegIdxVec;
+ typedef SmallVector<const CodeGenSubRegIndex *, 4> SubRegIdxVec;
SequenceToOffsetTable<SubRegIdxVec, deref<std::less<>>> SubRegIdxSeqs;
SmallVector<SubRegIdxVec, 4> SubRegIdxLists(Regs.size());
@@ -907,7 +902,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
RegStrings.add(std::string(Reg.getName()));
// Compute the ordered sub-register list.
- SetVector<const CodeGenRegister*> SR;
+ SetVector<const CodeGenRegister *> SR;
Reg.addSubRegsPreOrder(SR, RegBank);
diffEncode(SubRegLists[i], Reg.EnumValue, SR.begin(), SR.end());
DiffSeqs.add(SubRegLists[i]);
@@ -961,8 +956,8 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
OS << "};\n\n";
// Emit the table of sub-register index sizes.
- OS << "extern const MCRegisterInfo::SubRegCoveredBits "
- << TargetName << "SubRegIdxRanges[] = {\n";
+ OS << "extern const MCRegisterInfo::SubRegCoveredBits " << TargetName
+ << "SubRegIdxRanges[] = {\n";
OS << " { " << (uint16_t)-1 << ", " << (uint16_t)-1 << " },\n";
for (const auto &Idx : SubRegIndices) {
OS << " { " << Idx.Offset << ", " << Idx.Size << " },\t// "
@@ -995,13 +990,13 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
<< LaneMaskSeqs.get(RegUnitLaneMasks[i]) << " },\n";
++i;
}
- OS << "};\n\n"; // End of register descriptors...
+ OS << "};\n\n"; // End of register descriptors...
// Emit the table of register unit roots. Each regunit has one or two root
// registers.
OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2] = {\n";
for (unsigned i = 0, e = RegBank.getNumNativeRegUnits(); i != e; ++i) {
- ArrayRef<const CodeGenRegister*> Roots = RegBank.getRegUnit(i).getRoots();
+ ArrayRef<const CodeGenRegister *> Roots = RegBank.getRegUnit(i).getRoots();
assert(!Roots.empty() && "All regunits must have a root register.");
assert(Roots.size() <= 2 && "More than two roots not supported yet.");
OS << " { ";
@@ -1021,7 +1016,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
// Emit the register enum value arrays for each RegisterClass
for (const auto &RC : RegisterClasses) {
- ArrayRef<Record*> Order = RC.getOrder();
+ ArrayRef<Record *> Order = RC.getOrder();
// Give the register class a legal C name if it's anonymous.
const std::string &Name = RC.getName();
@@ -1092,7 +1087,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
}
OS << " " << Value << ",\n";
}
- OS << "};\n"; // End of HW encoding table
+ OS << "};\n"; // End of HW encoding table
// MCRegisterInfo initialization routine.
OS << "static inline void Init" << TargetName
@@ -1117,9 +1112,9 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
OS << "#endif // GET_REGINFO_MC_DESC\n\n";
}
-void
-RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
- CodeGenRegBank &RegBank) {
+void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS,
+ CodeGenTarget &Target,
+ CodeGenRegBank &RegBank) {
emitSourceFileHeader("Register Information Header Fragment", OS);
OS << "\n#ifdef GET_REGINFO_HEADER\n";
@@ -1175,8 +1170,10 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
<< " const MachineFunction &MF);\n";
const auto &RegisterClasses = RegBank.getRegClasses();
- if (llvm::any_of(RegisterClasses, [](const auto &RC) { return RC.getBaseClassOrder(); })) {
- OS << " const TargetRegisterClass *getPhysRegBaseClass(MCRegister Reg) const override;\n";
+ if (llvm::any_of(RegisterClasses,
+ [](const auto &RC) { return RC.getBaseClassOrder(); })) {
+ OS << " const TargetRegisterClass *getPhysRegBaseClass(MCRegister Reg) "
+ "const override;\n";
}
OS << "};\n\n";
@@ -1200,9 +1197,8 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
//
// runTargetDesc - Output the target register and register file descriptions.
//
-void
-RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
- CodeGenRegBank &RegBank){
+void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
+ CodeGenRegBank &RegBank) {
emitSourceFileHeader("Target Register and Register Classes Information", OS);
OS << "\n#ifdef GET_REGINFO_TARGET_DESC\n";
@@ -1219,11 +1215,11 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
const auto &SubRegIndices = RegBank.getSubRegIndices();
// Collect all registers belonging to any allocatable class.
- std::set<Record*> AllocatableRegs;
+ std::set<Record *> AllocatableRegs;
// Collect allocatable registers.
for (const auto &RC : RegisterClasses) {
- ArrayRef<Record*> Order = RC.getOrder();
+ ArrayRef<Record *> Order = RC.getOrder();
if (RC.Allocatable)
AllocatableRegs.insert(Order.begin(), Order.end());
@@ -1297,7 +1293,6 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
}
OS << "};\n";
-
OS << "\nstatic const TargetRegisterClass *const "
<< "NullRegClasses[] = { nullptr };\n\n";
@@ -1320,7 +1315,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
// Every bit mask present in the list has at least one bit set.
// Compress the sub-reg index lists.
- typedef std::vector<const CodeGenSubRegIndex*> IdxList;
+ typedef std::vector<const CodeGenSubRegIndex *> IdxList;
SmallVector<IdxList, 8> SuperRegIdxLists(RegisterClasses.size());
SequenceToOffsetTable<IdxList, deref<std::less<>>> SuperRegIdxSeqs;
BitVector MaskBV(RegisterClasses.size());
@@ -1354,14 +1349,14 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
// Emit NULL terminated super-class lists.
for (const auto &RC : RegisterClasses) {
- ArrayRef<CodeGenRegisterClass*> Supers = RC.getSuperClasses();
+ ArrayRef<CodeGenRegisterClass *> Supers = RC.getSuperClasses();
// Skip classes without supers. We can reuse NullRegClasses.
if (Supers.empty())
continue;
- OS << "static const TargetRegisterClass *const "
- << RC.getName() << "Superclasses[] = {\n";
+ OS << "static const TargetRegisterClass *const " << RC.getName()
+ << "Superclasses[] = {\n";
for (const auto *Super : Supers)
OS << " &" << Super->getQualifiedName() << "RegClass,\n";
OS << " nullptr\n};\n\n";
@@ -1371,12 +1366,12 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
for (const auto &RC : RegisterClasses) {
if (!RC.AltOrderSelect.empty()) {
OS << "\nstatic inline unsigned " << RC.getName()
- << "AltOrderSelect(const MachineFunction &MF) {"
- << RC.AltOrderSelect << "}\n\n"
+ << "AltOrderSelect(const MachineFunction &MF) {" << RC.AltOrderSelect
+ << "}\n\n"
<< "static ArrayRef<MCPhysReg> " << RC.getName()
<< "GetRawAllocationOrder(const MachineFunction &MF) {\n";
- for (unsigned oi = 1 , oe = RC.getNumOrders(); oi != oe; ++oi) {
- ArrayRef<Record*> Elems = RC.getOrder(oi);
+ for (unsigned oi = 1, oe = RC.getNumOrders(); oi != oe; ++oi) {
+ ArrayRef<Record *> Elems = RC.getOrder(oi);
if (!Elems.empty()) {
OS << " static const MCPhysReg AltOrder" << oi << "[] = {";
for (unsigned elem = 0; elem != Elems.size(); ++elem)
@@ -1556,8 +1551,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
EnumValue = SubRegClass->EnumValue + 1;
}
- OS << " " << EnumValue << ",\t// "
- << RC.getName() << ':' << Idx.getName();
+ OS << " " << EnumValue << ",\t// " << RC.getName() << ':'
+ << Idx.getName();
if (MatchingSubClass) {
CodeGenRegisterClass *SubRegClass = MatchingSubClass->second;
@@ -1581,7 +1576,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
// Emit register base class mapper
if (!RegisterClasses.empty()) {
// Collect base classes
- SmallVector<const CodeGenRegisterClass*> BaseClasses;
+ SmallVector<const CodeGenRegisterClass *> BaseClasses;
for (const auto &RC : RegisterClasses) {
if (RC.getBaseClassOrder())
BaseClasses.push_back(&RC);
@@ -1592,9 +1587,10 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
// Apply order
struct BaseClassOrdering {
- bool operator()(const CodeGenRegisterClass *LHS, const CodeGenRegisterClass *RHS) const {
- return std::pair(*LHS->getBaseClassOrder(), LHS->EnumValue)
- < std::pair(*RHS->getBaseClassOrder(), RHS->EnumValue);
+ bool operator()(const CodeGenRegisterClass *LHS,
+ const CodeGenRegisterClass *RHS) const {
+ return std::pair(*LHS->getBaseClassOrder(), LHS->EnumValue) <
+ std::pair(*RHS->getBaseClassOrder(), RHS->EnumValue);
}
};
llvm::stable_sort(BaseClasses, BaseClassOrdering());
@@ -1638,8 +1634,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
OS << "extern const char " << TargetName << "RegClassStrings[];\n";
OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2];\n";
OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[];\n";
- OS << "extern const MCRegisterInfo::SubRegCoveredBits "
- << TargetName << "SubRegIdxRanges[];\n";
+ OS << "extern const MCRegisterInfo::SubRegCoveredBits " << TargetName
+ << "SubRegIdxRanges[];\n";
OS << "extern const uint16_t " << TargetName << "RegEncodingTable[];\n";
EmitRegMappingTables(OS, Regs, true);
@@ -1673,16 +1669,15 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
OS << "}\n\n";
// Emit CalleeSavedRegs information.
- std::vector<Record*> CSRSets =
- Records.getAllDerivedDefinitions("CalleeSavedRegs");
+ std::vector<Record *> CSRSets =
+ Records.getAllDerivedDefinitions("CalleeSavedRegs");
for (unsigned i = 0, e = CSRSets.size(); i != e; ++i) {
Record *CSRSet = CSRSets[i];
const SetTheory::RecVec *Regs = RegBank.getSets().expand(CSRSet);
assert(Regs && "Cannot expand CalleeSavedRegs instance");
// Emit the *_SaveList list of callee-saved registers.
- OS << "static const MCPhysReg " << CSRSet->getName()
- << "_SaveList[] = { ";
+ OS << "static const MCPhysReg " << CSRSet->getName() << "_SaveList[] = { ";
for (unsigned r = 0, re = Regs->size(); r != re; ++r)
OS << getQualifiedName((*Regs)[r]) << ", ";
OS << "0 };\n";
@@ -1693,11 +1688,11 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
// Check for an optional OtherPreserved set.
// Add those registers to RegMask, but not to SaveList.
if (DagInit *OPDag =
- dyn_cast<DagInit>(CSRSet->getValueInit("OtherPreserved"))) {
+ dyn_cast<DagInit>(CSRSet->getValueInit("OtherPreserved"))) {
SetTheory::RecSet OPSet;
RegBank.getSets().evaluate(OPDag, OPSet, CSRSet->getLoc());
Covered |= RegBank.computeCoveredRegisters(
- ArrayRef<Record*>(OPSet.begin(), OPSet.end()));
+ ArrayRef<Record *>(OPSet.begin(), OPSet.end()));
}
// Add all constant physical registers to the preserved mask:
@@ -1709,8 +1704,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
Covered |= RegBank.computeCoveredRegisters(
ArrayRef<Record *>(ConstantSet.begin(), ConstantSet.end()));
- OS << "static const uint32_t " << CSRSet->getName()
- << "_RegMask[] = { ";
+ OS << "static const uint32_t " << CSRSet->getName() << "_RegMask[] = { ";
printBitVectorAsHex(OS, Covered, 32);
OS << "};\n";
}
@@ -1795,7 +1789,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
}
OS << "}\n\n";
- OS << "const " << TargetName << "FrameLowering *\n" << TargetName
+ OS << "const " << TargetName << "FrameLowering *\n"
+ << TargetName
<< "GenRegisterInfo::getFrameLowering(const MachineFunction &MF) {\n"
<< " return static_cast<const " << TargetName << "FrameLowering *>(\n"
<< " MF.getSubtarget().getFrameLowering());\n"
@@ -1827,7 +1822,7 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) {
CodeGenRegBank &RegBank = Target.getRegBank();
const CodeGenHwModes &CGH = Target.getHwModes();
unsigned NumModes = CGH.getNumModeIds();
- auto getModeName = [CGH] (unsigned M) -> StringRef {
+ auto getModeName = [CGH](unsigned M) -> StringRef {
if (M == 0)
return "Default";
return CGH.getMode(M).Name;
@@ -1883,9 +1878,10 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) {
OS << '\n';
OS << "\tCoveredBySubregs: " << R.CoveredBySubRegs << '\n';
OS << "\tHasDisjunctSubRegs: " << R.HasDisjunctSubRegs << '\n';
- for (std::pair<CodeGenSubRegIndex*,CodeGenRegister*> P : R.getSubRegs()) {
- OS << "\tSubReg " << P.first->getName()
- << " = " << P.second->getName() << '\n';
+ for (std::pair<CodeGenSubRegIndex *, CodeGenRegister *> P :
+ R.getSubRegs()) {
+ OS << "\tSubReg " << P.first->getName() << " = " << P.second->getName()
+ << '\n';
}
}
}
diff --git a/llvm/utils/TableGen/SDNodeProperties.h b/llvm/utils/TableGen/SDNodeProperties.h
index 66a04e63150c..571542328524 100644
--- a/llvm/utils/TableGen/SDNodeProperties.h
+++ b/llvm/utils/TableGen/SDNodeProperties.h
@@ -34,6 +34,6 @@ enum SDNP {
unsigned parseSDPatternOperatorProperties(Record *R);
-}
+} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp
index d75a9e95a7eb..0cce798a27b4 100644
--- a/llvm/utils/TableGen/SearchableTableEmitter.cpp
+++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp
@@ -134,7 +134,7 @@ private:
Twine("Entry for field '") + Field.Name + "' is null");
return std::string(Entry->first);
}
- PrintFatalError(Loc, Twine("invalid field type for field '") + Field.Name +
+ PrintFatalError(Loc, Twine("invalid field type for field '") + Field.Name +
"'; expected: bit, bits, string, or code");
}
@@ -173,7 +173,7 @@ private:
return "uint32_t";
if (NumBits <= 64)
return "uint64_t";
- PrintFatalError(Index.Loc, Twine("In table '") + Table.Name +
+ PrintFatalError(Index.Loc, Twine("In table '") + Table.Name +
"' lookup method '" + Index.Name +
"', key field '" + Field.Name +
"' of type bits is too large");
@@ -425,7 +425,7 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table,
OS << " struct KeyType {\n";
for (const auto &Field : Index.Fields) {
- OS << " " << searchableFieldType(Table, Index, Field, TypeInTempStruct)
+ OS << " " << searchableFieldType(Table, Index, Field, TypeInTempStruct)
<< " " << Field.Name << ";\n";
}
OS << " };\n";
@@ -436,7 +436,7 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table,
if (isa<StringRecTy>(Field.RecType)) {
OS << ".upper()";
if (IsPrimary)
- PrintFatalError(Index.Loc,
+ PrintFatalError(Index.Loc,
Twine("In table '") + Table.Name +
"', use a secondary lookup method for "
"case-insensitive comparison of field '" +
@@ -580,7 +580,7 @@ std::unique_ptr<SearchIndex> SearchableTableEmitter::parseSearchIndex(
Twine("In table '") + Table.Name +
"', 'PrimaryKey' or 'Key' refers to nonexistent field '" +
FieldName + "'");
-
+
Index->Fields.push_back(*Field);
}
@@ -643,11 +643,11 @@ void SearchableTableEmitter::collectTableEntries(
} else {
RecTy *Ty = resolveTypes(Field.RecType, TI->getType());
if (!Ty)
- PrintFatalError(EntryRec->getValue(Field.Name),
+ PrintFatalError(EntryRec->getValue(Field.Name),
Twine("Field '") + Field.Name + "' of table '" +
- Table.Name + "' entry has incompatible type: " +
- TI->getType()->getAsString() + " vs. " +
- Field.RecType->getAsString());
+ Table.Name + "' entry has incompatible type: " +
+ TI->getType()->getAsString() + " vs. " +
+ Field.RecType->getAsString());
Field.RecType = Ty;
}
}
@@ -702,7 +702,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) {
StringRef FilterClass = EnumRec->getValueAsString("FilterClass");
Enum->Class = Records.getClass(FilterClass);
if (!Enum->Class)
- PrintFatalError(EnumRec->getValue("FilterClass"),
+ PrintFatalError(EnumRec->getValue("FilterClass"),
Twine("Enum FilterClass '") + FilterClass +
"' does not exist");
@@ -723,11 +723,13 @@ void SearchableTableEmitter::run(raw_ostream &OS) {
for (const auto &FieldName : Fields) {
Table->Fields.emplace_back(FieldName); // Construct a GenericField.
- if (auto TypeOfRecordVal = TableRec->getValue(("TypeOf_" + FieldName).str())) {
- if (!parseFieldType(Table->Fields.back(), TypeOfRecordVal->getValue())) {
- PrintError(TypeOfRecordVal,
- Twine("Table '") + Table->Name +
- "' has invalid 'TypeOf_" + FieldName +
+ if (auto TypeOfRecordVal =
+ TableRec->getValue(("TypeOf_" + FieldName).str())) {
+ if (!parseFieldType(Table->Fields.back(),
+ TypeOfRecordVal->getValue())) {
+ PrintError(TypeOfRecordVal,
+ Twine("Table '") + Table->Name + "' has invalid 'TypeOf_" +
+ FieldName +
"': " + TypeOfRecordVal->getValue()->getAsString());
PrintFatalNote("The 'TypeOf_xxx' field must be a string naming a "
"GenericEnum record, or \"code\"");
@@ -737,9 +739,9 @@ void SearchableTableEmitter::run(raw_ostream &OS) {
StringRef FilterClass = TableRec->getValueAsString("FilterClass");
if (!Records.getClass(FilterClass))
- PrintFatalError(TableRec->getValue("FilterClass"),
- Twine("Table FilterClass '") +
- FilterClass + "' does not exist");
+ PrintFatalError(TableRec->getValue("FilterClass"),
+ Twine("Table FilterClass '") + FilterClass +
+ "' does not exist");
RecordVal *FilterClassFieldVal = TableRec->getValue("FilterClassField");
std::vector<Record *> Definitions =
@@ -779,14 +781,14 @@ void SearchableTableEmitter::run(raw_ostream &OS) {
Record *TableRec = IndexRec->getValueAsDef("Table");
auto It = TableMap.find(TableRec);
if (It == TableMap.end())
- PrintFatalError(IndexRec->getValue("Table"),
+ PrintFatalError(IndexRec->getValue("Table"),
Twine("SearchIndex '") + IndexRec->getName() +
"' refers to nonexistent table '" +
TableRec->getName());
GenericTable &Table = *It->second;
Table.Indices.push_back(
- parseSearchIndex(Table, IndexRec->getValue("Key"), IndexRec->getName(),
+ parseSearchIndex(Table, IndexRec->getValue("Key"), IndexRec->getName(),
IndexRec->getValueAsListOfStrings("Key"),
IndexRec->getValueAsBit("EarlyOut")));
}
diff --git a/llvm/utils/TableGen/SequenceToOffsetTable.h b/llvm/utils/TableGen/SequenceToOffsetTable.h
index 77a404d07b7d..7db39a9af720 100644
--- a/llvm/utils/TableGen/SequenceToOffsetTable.h
+++ b/llvm/utils/TableGen/SequenceToOffsetTable.h
@@ -44,7 +44,7 @@ static inline void printChar(raw_ostream &OS, char C) {
///
/// @tparam SeqT The sequence container. (vector or string).
/// @tparam Less A stable comparator for SeqT elements.
-template<typename SeqT, typename Less = std::less<typename SeqT::value_type> >
+template <typename SeqT, typename Less = std::less<typename SeqT::value_type>>
class SequenceToOffsetTable {
typedef typename SeqT::value_type ElemT;
@@ -53,8 +53,8 @@ class SequenceToOffsetTable {
struct SeqLess {
Less L;
bool operator()(const SeqT &A, const SeqT &B) const {
- return std::lexicographical_compare(A.rbegin(), A.rend(),
- B.rbegin(), B.rend(), L);
+ return std::lexicographical_compare(A.rbegin(), A.rend(), B.rbegin(),
+ B.rend(), L);
}
};
@@ -153,15 +153,15 @@ public:
/// emit - Print out the table as the body of an array initializer.
/// Use the Print function to print elements.
- void emit(raw_ostream &OS,
- void (*Print)(raw_ostream&, ElemT),
+ void emit(raw_ostream &OS, void (*Print)(raw_ostream &, ElemT),
const char *Term = "0") const {
assert((empty() || Entries) && "Call layout() before emit()");
for (typename SeqMap::const_iterator I = Seqs.begin(), E = Seqs.end();
I != E; ++I) {
OS << " /* " << I->second << " */ ";
for (typename SeqT::const_iterator SI = I->first.begin(),
- SE = I->first.end(); SI != SE; ++SI) {
+ SE = I->first.end();
+ SI != SE; ++SI) {
Print(OS, *SI);
OS << ", ";
}
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index 39225182a4c2..2707f54eed6e 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -51,9 +51,9 @@ struct LessRecordFieldFieldName {
};
class SubtargetEmitter {
- // Each processor has a SchedClassDesc table with an entry for each SchedClass.
- // The SchedClassDesc table indexes into a global write resource table, write
- // latency table, and read advance table.
+ // Each processor has a SchedClassDesc table with an entry for each
+ // SchedClass. The SchedClassDesc table indexes into a global write resource
+ // table, write latency table, and read advance table.
struct SchedClassTables {
std::vector<std::vector<MCSchedClassDesc>> ProcSchedClasses;
std::vector<MCWriteProcResEntry> WriteProcResources;
@@ -89,20 +89,18 @@ class SubtargetEmitter {
const DenseMap<Record *, unsigned> &FeatureMap);
unsigned CPUKeyValues(raw_ostream &OS,
const DenseMap<Record *, unsigned> &FeatureMap);
- void FormItineraryStageString(const std::string &Names,
- Record *ItinData, std::string &ItinString,
- unsigned &NStages);
- void FormItineraryOperandCycleString(Record *ItinData, std::string &ItinString,
+ void FormItineraryStageString(const std::string &Names, Record *ItinData,
+ std::string &ItinString, unsigned &NStages);
+ void FormItineraryOperandCycleString(Record *ItinData,
+ std::string &ItinString,
unsigned &NOperandCycles);
- void FormItineraryBypassString(const std::string &Names,
- Record *ItinData,
- std::string &ItinString, unsigned NOperandCycles);
- void EmitStageAndOperandCycleData(raw_ostream &OS,
- std::vector<std::vector<InstrItinerary>>
- &ProcItinLists);
+ void FormItineraryBypassString(const std::string &Names, Record *ItinData,
+ std::string &ItinString,
+ unsigned NOperandCycles);
+ void EmitStageAndOperandCycleData(
+ raw_ostream &OS, std::vector<std::vector<InstrItinerary>> &ProcItinLists);
void EmitItineraries(raw_ostream &OS,
- std::vector<std::vector<InstrItinerary>>
- &ProcItinLists);
+ std::vector<std::vector<InstrItinerary>> &ProcItinLists);
unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
raw_ostream &OS);
void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
@@ -153,15 +151,16 @@ public:
void SubtargetEmitter::Enumeration(raw_ostream &OS,
DenseMap<Record *, unsigned> &FeatureMap) {
// Get all records of class and sort
- std::vector<Record*> DefList =
- Records.getAllDerivedDefinitions("SubtargetFeature");
+ std::vector<Record *> DefList =
+ Records.getAllDerivedDefinitions("SubtargetFeature");
llvm::sort(DefList, LessRecord());
unsigned N = DefList.size();
if (N == 0)
return;
if (N + 1 > MAX_SUBTARGET_FEATURES)
- PrintFatalError("Too many subtarget features! Bump MAX_SUBTARGET_FEATURES.");
+ PrintFatalError(
+ "Too many subtarget features! Bump MAX_SUBTARGET_FEATURES.");
OS << "namespace " << Target << " {\n";
@@ -248,8 +247,8 @@ void SubtargetEmitter::EmitSubtargetInfoMacroCalls(raw_ostream &OS) {
unsigned SubtargetEmitter::FeatureKeyValues(
raw_ostream &OS, const DenseMap<Record *, unsigned> &FeatureMap) {
// Gather and sort all the features
- std::vector<Record*> FeatureList =
- Records.getAllDerivedDefinitions("SubtargetFeature");
+ std::vector<Record *> FeatureList =
+ Records.getAllDerivedDefinitions("SubtargetFeature");
if (FeatureList.empty())
return 0;
@@ -269,13 +268,14 @@ unsigned SubtargetEmitter::FeatureKeyValues(
StringRef CommandLineName = Feature->getValueAsString("Name");
StringRef Desc = Feature->getValueAsString("Desc");
- if (CommandLineName.empty()) continue;
+ if (CommandLineName.empty())
+ continue;
- // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in } }
+ // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in
+ // } }
OS << " { "
<< "\"" << CommandLineName << "\", "
- << "\"" << Desc << "\", "
- << Target << "::" << Name << ", ";
+ << "\"" << Desc << "\", " << Target << "::" << Name << ", ";
RecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
@@ -299,8 +299,8 @@ unsigned
SubtargetEmitter::CPUKeyValues(raw_ostream &OS,
const DenseMap<Record *, unsigned> &FeatureMap) {
// Gather and sort processor information
- std::vector<Record*> ProcessorList =
- Records.getAllDerivedDefinitions("Processor");
+ std::vector<Record *> ProcessorList =
+ Records.getAllDerivedDefinitions("Processor");
llvm::sort(ProcessorList, LessRecordFieldName());
// Begin processor table
@@ -324,7 +324,7 @@ SubtargetEmitter::CPUKeyValues(raw_ostream &OS,
// Emit the scheduler model pointer.
const std::string &ProcModelName =
- SchedModels.getModelForProc(Processor).ModelName;
+ SchedModels.getModelForProc(Processor).ModelName;
OS << ", &" << ProcModelName << " },\n";
}
@@ -363,7 +363,8 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name,
for (unsigned j = 0, M = UnitList.size(); j < M;) {
// Add name and bitwise or
ItinString += Name + "FU::" + UnitList[j]->getName().str();
- if (++j < M) ItinString += " | ";
+ if (++j < M)
+ ItinString += " | ";
}
int TimeInc = Stage->getValueAsInt("TimeInc");
@@ -374,7 +375,8 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name,
// Close off stage
ItinString += " }";
- if (++i < N) ItinString += ", ";
+ if (++i < N)
+ ItinString += ", ";
}
}
@@ -383,11 +385,11 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name,
// operand cycle initialization for the specified itinerary. N is the
// number of operands that has cycles specified.
//
-void SubtargetEmitter::FormItineraryOperandCycleString(Record *ItinData,
- std::string &ItinString, unsigned &NOperandCycles) {
+void SubtargetEmitter::FormItineraryOperandCycleString(
+ Record *ItinData, std::string &ItinString, unsigned &NOperandCycles) {
// Get operand cycle list
std::vector<int64_t> OperandCycleList =
- ItinData->getValueAsListOfInts("OperandCycles");
+ ItinData->getValueAsListOfInts("OperandCycles");
// For each operand cycle
NOperandCycles = OperandCycleList.size();
@@ -422,12 +424,10 @@ void SubtargetEmitter::FormItineraryBypassString(const std::string &Name,
// cycle tables. Create a list of InstrItinerary objects (ProcItinLists) indexed
// by CodeGenSchedClass::Index.
//
-void SubtargetEmitter::
-EmitStageAndOperandCycleData(raw_ostream &OS,
- std::vector<std::vector<InstrItinerary>>
- &ProcItinLists) {
+void SubtargetEmitter::EmitStageAndOperandCycleData(
+ raw_ostream &OS, std::vector<std::vector<InstrItinerary>> &ProcItinLists) {
// Multiple processor models may share an itinerary record. Emit it once.
- SmallPtrSet<Record*, 8> ItinsDefSet;
+ SmallPtrSet<Record *, 8> ItinsDefSet;
// Emit functional units for all the itineraries.
for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) {
@@ -452,30 +452,31 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
RecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP");
if (!BPs.empty()) {
OS << "\n// Pipeline forwarding paths for itineraries \"" << Name
- << "\"\n" << "namespace " << Name << "Bypass {\n";
+ << "\"\n"
+ << "namespace " << Name << "Bypass {\n";
OS << " const unsigned NoBypass = 0;\n";
for (unsigned j = 0, BPN = BPs.size(); j < BPN; ++j)
- OS << " const unsigned " << BPs[j]->getName()
- << " = 1 << " << j << ";\n";
+ OS << " const unsigned " << BPs[j]->getName() << " = 1 << " << j
+ << ";\n";
OS << "} // end namespace " << Name << "Bypass\n";
}
}
// Begin stages table
- std::string StageTable = "\nextern const llvm::InstrStage " + Target +
- "Stages[] = {\n";
+ std::string StageTable =
+ "\nextern const llvm::InstrStage " + Target + "Stages[] = {\n";
StageTable += " { 0, 0, 0, llvm::InstrStage::Required }, // No itinerary\n";
// Begin operand cycle table
- std::string OperandCycleTable = "extern const unsigned " + Target +
- "OperandCycles[] = {\n";
+ std::string OperandCycleTable =
+ "extern const unsigned " + Target + "OperandCycles[] = {\n";
OperandCycleTable += " 0, // No itinerary\n";
// Begin pipeline bypass table
- std::string BypassTable = "extern const unsigned " + Target +
- "ForwardingPaths[] = {\n";
+ std::string BypassTable =
+ "extern const unsigned " + Target + "ForwardingPaths[] = {\n";
BypassTable += " 0, // No itinerary\n";
// For each Itinerary across all processors, add a unique entry to the stages,
@@ -485,11 +486,10 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
std::map<std::string, unsigned> ItinStageMap, ItinOperandMap;
for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) {
// Add process itinerary to the list.
- ProcItinLists.resize(ProcItinLists.size()+1);
+ std::vector<InstrItinerary> &ItinList = ProcItinLists.emplace_back();
// If this processor defines no itineraries, then leave the itinerary list
// empty.
- std::vector<InstrItinerary> &ItinList = ProcItinLists.back();
if (!ProcModel.hasItineraries())
continue;
@@ -542,19 +542,20 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
// Check to see if operand cycle already exists and create if it doesn't
uint16_t FindOperandCycle = 0;
if (NOperandCycles > 0) {
- std::string ItinOperandString = ItinOperandCycleString+ItinBypassString;
+ std::string ItinOperandString =
+ ItinOperandCycleString + ItinBypassString;
FindOperandCycle = ItinOperandMap[ItinOperandString];
if (FindOperandCycle == 0) {
// Emit as cycle, // index
OperandCycleTable += ItinOperandCycleString + ", // ";
std::string OperandIdxComment = itostr(OperandCycleCount);
if (NOperandCycles > 1)
- OperandIdxComment += "-"
- + itostr(OperandCycleCount + NOperandCycles - 1);
+ OperandIdxComment +=
+ "-" + itostr(OperandCycleCount + NOperandCycles - 1);
OperandCycleTable += OperandIdxComment + "\n";
// Record Itin class number.
- ItinOperandMap[ItinOperandCycleString] =
- FindOperandCycle = OperandCycleCount;
+ ItinOperandMap[ItinOperandCycleString] = FindOperandCycle =
+ OperandCycleCount;
// Emit as bypass, // index
BypassTable += ItinBypassString + ", // " + OperandIdxComment + "\n";
OperandCycleCount += NOperandCycles;
@@ -599,17 +600,17 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
// Itineraries for each processor. The Itinerary lists are indexed on
// CodeGenSchedClass::Index.
//
-void SubtargetEmitter::
-EmitItineraries(raw_ostream &OS,
- std::vector<std::vector<InstrItinerary>> &ProcItinLists) {
+void SubtargetEmitter::EmitItineraries(
+ raw_ostream &OS, std::vector<std::vector<InstrItinerary>> &ProcItinLists) {
// Multiple processor models may share an itinerary record. Emit it once.
- SmallPtrSet<Record*, 8> ItinsDefSet;
+ SmallPtrSet<Record *, 8> ItinsDefSet;
// For each processor's machine model
- std::vector<std::vector<InstrItinerary>>::iterator
- ProcItinListsIter = ProcItinLists.begin();
+ std::vector<std::vector<InstrItinerary>>::iterator ProcItinListsIter =
+ ProcItinLists.begin();
for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
- PE = SchedModels.procModelEnd(); PI != PE; ++PI, ++ProcItinListsIter) {
+ PE = SchedModels.procModelEnd();
+ PI != PE; ++PI, ++ProcItinListsIter) {
Record *ItinsDef = PI->ItinsDef;
if (!ItinsDefSet.insert(ItinsDef).second)
@@ -636,13 +637,10 @@ EmitItineraries(raw_ostream &OS,
// Emit Itinerary in the form of
// { firstStage, lastStage, firstCycle, lastCycle } // index
- OS << " { " <<
- Intinerary.NumMicroOps << ", " <<
- Intinerary.FirstStage << ", " <<
- Intinerary.LastStage << ", " <<
- Intinerary.FirstOperandCycle << ", " <<
- Intinerary.LastOperandCycle << " }" <<
- ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n";
+ OS << " { " << Intinerary.NumMicroOps << ", " << Intinerary.FirstStage
+ << ", " << Intinerary.LastStage << ", " << Intinerary.FirstOperandCycle
+ << ", " << Intinerary.LastOperandCycle << " }"
+ << ", // " << j << " " << SchedModels.getSchedClass(j).Name << "\n";
}
// End processor itinerary table
OS << " { 0, uint16_t(~0U), uint16_t(~0U), uint16_t(~0U), uint16_t(~0U) }"
@@ -840,13 +838,11 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
NumUnits += RU->getValueAsInt("NumUnits");
SubUnitsOffset += RU->getValueAsInt("NumUnits");
}
- }
- else {
+ } else {
// Find the SuperIdx
if (PRDef->getValueInit("Super")->isComplete()) {
- SuperDef =
- SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"),
- ProcModel, PRDef->getLoc());
+ SuperDef = SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"),
+ ProcModel, PRDef->getLoc());
SuperIdx = ProcModel.getProcResourceIdx(SuperDef);
}
NumUnits = PRDef->getValueAsInt("NumUnits");
@@ -862,7 +858,7 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
} else {
OS << "nullptr";
}
- OS << "}, // #" << i+1;
+ OS << "}, // #" << i + 1;
if (SuperDef)
OS << ", Super=" << SuperDef->getName();
OS << "\n";
@@ -872,8 +868,9 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
// Find the WriteRes Record that defines processor resources for this
// SchedWrite.
-Record *SubtargetEmitter::FindWriteResources(
- const CodeGenSchedRW &SchedWrite, const CodeGenProcModel &ProcModel) {
+Record *
+SubtargetEmitter::FindWriteResources(const CodeGenSchedRW &SchedWrite,
+ const CodeGenProcModel &ProcModel) {
// Check if the SchedWrite is already subtarget-specific and directly
// specifies a set of processor resources.
@@ -883,16 +880,18 @@ Record *SubtargetEmitter::FindWriteResources(
Record *AliasDef = nullptr;
for (Record *A : SchedWrite.Aliases) {
const CodeGenSchedRW &AliasRW =
- SchedModels.getSchedRW(A->getValueAsDef("AliasRW"));
+ SchedModels.getSchedRW(A->getValueAsDef("AliasRW"));
if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) {
Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
if (&SchedModels.getProcModel(ModelDef) != &ProcModel)
continue;
}
if (AliasDef)
- PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases "
- "defined for processor " + ProcModel.ModelName +
- " Ensure only one SchedAlias exists per RW.");
+ PrintFatalError(AliasRW.TheDef->getLoc(),
+ "Multiple aliases "
+ "defined for processor " +
+ ProcModel.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
AliasDef = AliasRW.TheDef;
}
if (AliasDef && AliasDef->isSubClassOf("SchedWriteRes"))
@@ -903,12 +902,12 @@ Record *SubtargetEmitter::FindWriteResources(
for (Record *WR : ProcModel.WriteResDefs) {
if (!WR->isSubClassOf("WriteRes"))
continue;
- if (AliasDef == WR->getValueAsDef("WriteType")
- || SchedWrite.TheDef == WR->getValueAsDef("WriteType")) {
+ if (AliasDef == WR->getValueAsDef("WriteType") ||
+ SchedWrite.TheDef == WR->getValueAsDef("WriteType")) {
if (ResDef) {
PrintFatalError(WR->getLoc(), "Resources are defined for both "
- "SchedWrite and its alias on processor " +
- ProcModel.ModelName);
+ "SchedWrite and its alias on processor " +
+ ProcModel.ModelName);
}
ResDef = WR;
}
@@ -918,7 +917,7 @@ Record *SubtargetEmitter::FindWriteResources(
if (!ResDef) {
PrintFatalError(ProcModel.ModelDef->getLoc(),
Twine("Processor does not define resources for ") +
- SchedWrite.TheDef->getName());
+ SchedWrite.TheDef->getName());
}
return ResDef;
}
@@ -935,16 +934,18 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead,
Record *AliasDef = nullptr;
for (Record *A : SchedRead.Aliases) {
const CodeGenSchedRW &AliasRW =
- SchedModels.getSchedRW(A->getValueAsDef("AliasRW"));
+ SchedModels.getSchedRW(A->getValueAsDef("AliasRW"));
if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) {
Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
if (&SchedModels.getProcModel(ModelDef) != &ProcModel)
continue;
}
if (AliasDef)
- PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases "
- "defined for processor " + ProcModel.ModelName +
- " Ensure only one SchedAlias exists per RW.");
+ PrintFatalError(AliasRW.TheDef->getLoc(),
+ "Multiple aliases "
+ "defined for processor " +
+ ProcModel.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
AliasDef = AliasRW.TheDef;
}
if (AliasDef && AliasDef->isSubClassOf("SchedReadAdvance"))
@@ -955,12 +956,12 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead,
for (Record *RA : ProcModel.ReadAdvanceDefs) {
if (!RA->isSubClassOf("ReadAdvance"))
continue;
- if (AliasDef == RA->getValueAsDef("ReadType")
- || SchedRead.TheDef == RA->getValueAsDef("ReadType")) {
+ if (AliasDef == RA->getValueAsDef("ReadType") ||
+ SchedRead.TheDef == RA->getValueAsDef("ReadType")) {
if (ResDef) {
PrintFatalError(RA->getLoc(), "Resources are defined for both "
- "SchedRead and its alias on processor " +
- ProcModel.ModelName);
+ "SchedRead and its alias on processor " +
+ ProcModel.ModelName);
}
ResDef = RA;
}
@@ -970,7 +971,7 @@ Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead,
if (!ResDef && SchedRead.TheDef->getName() != "ReadDefault") {
PrintFatalError(ProcModel.ModelDef->getLoc(),
Twine("Processor does not define resources for ") +
- SchedRead.TheDef->getName());
+ SchedRead.TheDef->getName());
}
return ResDef;
}
@@ -994,11 +995,10 @@ void SubtargetEmitter::ExpandProcResources(
if (SubDef->isSubClassOf("ProcResGroup")) {
// Disallow this for simplicitly.
PrintFatalError(SubDef->getLoc(), "Processor resource group "
- " cannot be a super resources.");
+ " cannot be a super resources.");
}
- Record *SuperDef =
- SchedModels.findProcResUnits(SubDef->getValueAsDef("Super"), PM,
- SubDef->getLoc());
+ Record *SuperDef = SchedModels.findProcResUnits(
+ SubDef->getValueAsDef("Super"), PM, SubDef->getLoc());
PRVec.push_back(SuperDef);
ReleaseAtCycles.push_back(ReleaseAtCycles[i]);
AcquireAtCycles.push_back(AcquireAtCycles[i]);
@@ -1010,7 +1010,7 @@ void SubtargetEmitter::ExpandProcResources(
continue;
RecVec SuperResources = PR->getValueAsListOfDefs("Resources");
RecIter SubI = SubResources.begin(), SubE = SubResources.end();
- for( ; SubI != SubE; ++SubI) {
+ for (; SubI != SubE; ++SubI) {
if (!is_contained(SuperResources, *SubI)) {
break;
}
@@ -1028,17 +1028,16 @@ void SubtargetEmitter::ExpandProcResources(
// tables. Must be called for each processor in order.
void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
SchedClassTables &SchedTables) {
- SchedTables.ProcSchedClasses.resize(SchedTables.ProcSchedClasses.size() + 1);
+ std::vector<MCSchedClassDesc> &SCTab =
+ SchedTables.ProcSchedClasses.emplace_back();
if (!ProcModel.hasInstrSchedModel())
return;
- std::vector<MCSchedClassDesc> &SCTab = SchedTables.ProcSchedClasses.back();
LLVM_DEBUG(dbgs() << "\n+++ SCHED CLASSES (GenSchedClassTables) +++\n");
for (const CodeGenSchedClass &SC : SchedModels.schedClasses()) {
LLVM_DEBUG(SC.dump(&SchedModels));
- SCTab.resize(SCTab.size() + 1);
- MCSchedClassDesc &SCDesc = SCTab.back();
+ MCSchedClassDesc &SCDesc = SCTab.emplace_back();
// SCDesc.Name is guarded by NDEBUG
SCDesc.NumMicroOps = 0;
SCDesc.BeginGroup = false;
@@ -1051,7 +1050,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
// A Variant SchedClass has no resources of its own.
bool HasVariants = false;
for (const CodeGenSchedTransition &CGT :
- make_range(SC.Transitions.begin(), SC.Transitions.end())) {
+ make_range(SC.Transitions.begin(), SC.Transitions.end())) {
if (CGT.ProcIndex == ProcModel.Index) {
HasVariants = true;
break;
@@ -1114,8 +1113,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
std::vector<MCReadAdvanceEntry> ReadAdvanceEntries;
for (unsigned W : Writes) {
IdxVec WriteSeq;
- SchedModels.expandRWSeqForProc(W, WriteSeq, /*IsRead=*/false,
- ProcModel);
+ SchedModels.expandRWSeqForProc(W, WriteSeq, /*IsRead=*/false, ProcModel);
// For each operand, create a latency entry.
MCWriteLatencyEntry WLEntry;
@@ -1125,7 +1123,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
// If this Write is not referenced by a ReadAdvance, don't distinguish it
// from other WriteLatency entries.
if (!SchedModels.hasReadOfWrite(
- SchedModels.getSchedWrite(WriteID).TheDef)) {
+ SchedModels.getSchedWrite(WriteID).TheDef)) {
WriteID = 0;
}
WLEntry.WriteResourceID = WriteID;
@@ -1133,7 +1131,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
for (unsigned WS : WriteSeq) {
Record *WriteRes =
- FindWriteResources(SchedModels.getSchedWrite(WS), ProcModel);
+ FindWriteResources(SchedModels.getSchedWrite(WS), ProcModel);
// Mark the parent class as invalid for unsupported write types.
if (WriteRes->getValueAsBit("Unsupported")) {
@@ -1170,7 +1168,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
.concat(Twine(ReleaseAtCycles.size())));
}
- if (!AcquireAtCycles.empty() && AcquireAtCycles.size() != PRVec.size()) {
+ if (!AcquireAtCycles.empty() &&
+ AcquireAtCycles.size() != PRVec.size()) {
PrintFatalError(
WriteRes->getLoc(),
Twine("Inconsistent resource cycles: size(AcquireAtCycles) != "
@@ -1197,8 +1196,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
ExpandProcResources(PRVec, ReleaseAtCycles, AcquireAtCycles, ProcModel);
assert(AcquireAtCycles.size() == ReleaseAtCycles.size());
- for (unsigned PRIdx = 0, PREnd = PRVec.size();
- PRIdx != PREnd; ++PRIdx) {
+ for (unsigned PRIdx = 0, PREnd = PRVec.size(); PRIdx != PREnd;
+ ++PRIdx) {
MCWriteProcResEntry WPREntry;
WPREntry.ProcResourceIdx = ProcModel.getProcResourceIdx(PRVec[PRIdx]);
assert(WPREntry.ProcResourceIdx && "Bad ProcResourceIdx");
@@ -1220,9 +1219,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
// serially, rather than multiple parallel uses. This is important for
// in-order machine where the resource consumption is a hazard.
unsigned WPRIdx = 0, WPREnd = WriteProcResources.size();
- for( ; WPRIdx != WPREnd; ++WPRIdx) {
- if (WriteProcResources[WPRIdx].ProcResourceIdx
- == WPREntry.ProcResourceIdx) {
+ for (; WPRIdx != WPREnd; ++WPRIdx) {
+ if (WriteProcResources[WPRIdx].ProcResourceIdx ==
+ WPREntry.ProcResourceIdx) {
// TODO: multiple use of the same resources would
// require either 1. thinking of how to handle multiple
// intervals for the same resource in
@@ -1245,10 +1244,10 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
}
// Create an entry for each operand Read in this SchedClass.
// Entries must be sorted first by UseIdx then by WriteResourceID.
- for (unsigned UseIdx = 0, EndIdx = Reads.size();
- UseIdx != EndIdx; ++UseIdx) {
+ for (unsigned UseIdx = 0, EndIdx = Reads.size(); UseIdx != EndIdx;
+ ++UseIdx) {
Record *ReadAdvance =
- FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel);
+ FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel);
if (!ReadAdvance)
continue;
@@ -1267,7 +1266,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
}
}
llvm::sort(WriteIDs);
- for(unsigned W : WriteIDs) {
+ for (unsigned W : WriteIDs) {
MCReadAdvanceEntry RAEntry;
RAEntry.UseIdx = UseIdx;
RAEntry.WriteResourceID = W;
@@ -1288,9 +1287,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
SCDesc.NumWriteProcResEntries = WriteProcResources.size();
std::vector<MCWriteProcResEntry>::iterator WPRPos =
- std::search(SchedTables.WriteProcResources.begin(),
- SchedTables.WriteProcResources.end(),
- WriteProcResources.begin(), WriteProcResources.end());
+ std::search(SchedTables.WriteProcResources.begin(),
+ SchedTables.WriteProcResources.end(),
+ WriteProcResources.begin(), WriteProcResources.end());
if (WPRPos != SchedTables.WriteProcResources.end())
SCDesc.WriteProcResIdx = WPRPos - SchedTables.WriteProcResources.begin();
else {
@@ -1300,10 +1299,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
}
// Latency entries must remain in operand order.
SCDesc.NumWriteLatencyEntries = WriteLatencies.size();
- std::vector<MCWriteLatencyEntry>::iterator WLPos =
- std::search(SchedTables.WriteLatencies.begin(),
- SchedTables.WriteLatencies.end(),
- WriteLatencies.begin(), WriteLatencies.end());
+ std::vector<MCWriteLatencyEntry>::iterator WLPos = std::search(
+ SchedTables.WriteLatencies.begin(), SchedTables.WriteLatencies.end(),
+ WriteLatencies.begin(), WriteLatencies.end());
if (WLPos != SchedTables.WriteLatencies.end()) {
unsigned idx = WLPos - SchedTables.WriteLatencies.begin();
SCDesc.WriteLatencyIdx = idx;
@@ -1312,8 +1310,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
std::string::npos) {
SchedTables.WriterNames[idx + i] += std::string("_") + WriterNames[i];
}
- }
- else {
+ } else {
SCDesc.WriteLatencyIdx = SchedTables.WriteLatencies.size();
llvm::append_range(SchedTables.WriteLatencies, WriteLatencies);
llvm::append_range(SchedTables.WriterNames, WriterNames);
@@ -1321,9 +1318,9 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
// ReadAdvanceEntries must remain in operand order.
SCDesc.NumReadAdvanceEntries = ReadAdvanceEntries.size();
std::vector<MCReadAdvanceEntry>::iterator RAPos =
- std::search(SchedTables.ReadAdvanceEntries.begin(),
- SchedTables.ReadAdvanceEntries.end(),
- ReadAdvanceEntries.begin(), ReadAdvanceEntries.end());
+ std::search(SchedTables.ReadAdvanceEntries.begin(),
+ SchedTables.ReadAdvanceEntries.end(),
+ ReadAdvanceEntries.begin(), ReadAdvanceEntries.end());
if (RAPos != SchedTables.ReadAdvanceEntries.end())
SCDesc.ReadAdvanceIdx = RAPos - SchedTables.ReadAdvanceEntries.begin();
else {
@@ -1355,8 +1352,8 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
// Emit global WriteLatencyTable.
OS << "\n// {Cycles, WriteResourceID}\n"
- << "extern const llvm::MCWriteLatencyEntry "
- << Target << "WriteLatencyTable[] = {\n"
+ << "extern const llvm::MCWriteLatencyEntry " << Target
+ << "WriteLatencyTable[] = {\n"
<< " { 0, 0}, // Invalid\n";
for (unsigned WLIdx = 1, WLEnd = SchedTables.WriteLatencies.size();
WLIdx != WLEnd; ++WLIdx) {
@@ -1371,8 +1368,8 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
// Emit global ReadAdvanceTable.
OS << "\n// {UseIdx, WriteResourceID, Cycles}\n"
- << "extern const llvm::MCReadAdvanceEntry "
- << Target << "ReadAdvanceTable[] = {\n"
+ << "extern const llvm::MCReadAdvanceEntry " << Target
+ << "ReadAdvanceTable[] = {\n"
<< " {0, 0, 0}, // Invalid\n";
for (unsigned RAIdx = 1, RAEnd = SchedTables.ReadAdvanceEntries.size();
RAIdx != RAEnd; ++RAIdx) {
@@ -1388,22 +1385,23 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
// Emit a SchedClass table for each processor.
for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
- PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ PE = SchedModels.procModelEnd();
+ PI != PE; ++PI) {
if (!PI->hasInstrSchedModel())
continue;
std::vector<MCSchedClassDesc> &SCTab =
- SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())];
+ SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())];
OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO,"
<< " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n";
- OS << "static const llvm::MCSchedClassDesc "
- << PI->ModelName << "SchedClasses[] = {\n";
+ OS << "static const llvm::MCSchedClassDesc " << PI->ModelName
+ << "SchedClasses[] = {\n";
// The first class is always invalid. We no way to distinguish it except by
// name and position.
- assert(SchedModels.getSchedClass(0).Name == "NoInstrModel"
- && "invalid class not first");
+ assert(SchedModels.getSchedClass(0).Name == "NoInstrModel" &&
+ "invalid class not first");
OS << " {DBGFIELD(\"InvalidSchedClass\") "
<< MCSchedClassDesc::InvalidNumMicroOps
<< ", false, false, false, 0, 0, 0, 0, 0, 0},\n";
@@ -1414,17 +1412,15 @@ void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
OS << " {DBGFIELD(\"" << SchedClass.Name << "\") ";
if (SchedClass.Name.size() < 18)
OS.indent(18 - SchedClass.Name.size());
- OS << MCDesc.NumMicroOps
- << ", " << ( MCDesc.BeginGroup ? "true" : "false" )
- << ", " << ( MCDesc.EndGroup ? "true" : "false" )
- << ", " << ( MCDesc.RetireOOO ? "true" : "false" )
- << ", " << format("%2d", MCDesc.WriteProcResIdx)
- << ", " << MCDesc.NumWriteProcResEntries
- << ", " << format("%2d", MCDesc.WriteLatencyIdx)
- << ", " << MCDesc.NumWriteLatencyEntries
- << ", " << format("%2d", MCDesc.ReadAdvanceIdx)
- << ", " << MCDesc.NumReadAdvanceEntries
- << "}, // #" << SCIdx << '\n';
+ OS << MCDesc.NumMicroOps << ", " << (MCDesc.BeginGroup ? "true" : "false")
+ << ", " << (MCDesc.EndGroup ? "true" : "false") << ", "
+ << (MCDesc.RetireOOO ? "true" : "false") << ", "
+ << format("%2d", MCDesc.WriteProcResIdx) << ", "
+ << MCDesc.NumWriteProcResEntries << ", "
+ << format("%2d", MCDesc.WriteLatencyIdx) << ", "
+ << MCDesc.NumWriteLatencyEntries << ", "
+ << format("%2d", MCDesc.ReadAdvanceIdx) << ", "
+ << MCDesc.NumReadAdvanceEntries << "}, // #" << SCIdx << '\n';
}
OS << "}; // " << PI->ModelName << "SchedClasses\n";
}
@@ -1439,9 +1435,10 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
// Emit processor resource table.
if (PM.hasInstrSchedModel())
EmitProcessorResources(PM, OS);
- else if(!PM.ProcResourceDefs.empty())
- PrintFatalError(PM.ModelDef->getLoc(), "SchedMachineModel defines "
- "ProcResources without defining WriteRes SchedWriteRes");
+ else if (!PM.ProcResourceDefs.empty())
+ PrintFatalError(PM.ModelDef->getLoc(),
+ "SchedMachineModel defines "
+ "ProcResources without defining WriteRes SchedWriteRes");
// Begin processor itinerary properties
OS << "\n";
@@ -1454,13 +1451,13 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
EmitProcessorProp(OS, PM.ModelDef, "MispredictPenalty", ',');
bool PostRAScheduler =
- (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false);
+ (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false);
- OS << " " << (PostRAScheduler ? "true" : "false") << ", // "
+ OS << " " << (PostRAScheduler ? "true" : "false") << ", // "
<< "PostRAScheduler\n";
bool CompleteModel =
- (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false);
+ (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false);
OS << " " << (CompleteModel ? "true" : "false") << ", // "
<< "CompleteModel\n";
@@ -1473,11 +1470,14 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
OS << " " << PM.Index << ", // Processor ID\n";
if (PM.hasInstrSchedModel())
- OS << " " << PM.ModelName << "ProcResources" << ",\n"
- << " " << PM.ModelName << "SchedClasses" << ",\n"
- << " " << PM.ProcResourceDefs.size()+1 << ",\n"
- << " " << (SchedModels.schedClassEnd()
- - SchedModels.schedClassBegin()) << ",\n";
+ OS << " " << PM.ModelName << "ProcResources"
+ << ",\n"
+ << " " << PM.ModelName << "SchedClasses"
+ << ",\n"
+ << " " << PM.ProcResourceDefs.size() + 1 << ",\n"
+ << " "
+ << (SchedModels.schedClassEnd() - SchedModels.schedClassBegin())
+ << ",\n";
else
OS << " nullptr, nullptr, 0, 0,"
<< " // No instruction-level machine model.\n";
@@ -1647,7 +1647,7 @@ static void collectProcessorIndices(const CodeGenSchedClass &SC,
IdxVec PI;
std::set_union(&T.ProcIndex, &T.ProcIndex + 1, ProcIndices.begin(),
ProcIndices.end(), std::back_inserter(PI));
- ProcIndices.swap(PI);
+ ProcIndices = std::move(PI);
}
}
@@ -1669,8 +1669,8 @@ void SubtargetEmitter::emitSchedModelHelpersImpl(
// Construct a switch statement where the condition is a check on the
// scheduling class identifier. There is a `case` for every variant class
// defined by the processor models of this target.
- // Each `case` implements a number of rules to resolve (i.e. to transition from)
- // a variant scheduling class to another scheduling class. Rules are
+ // Each `case` implements a number of rules to resolve (i.e. to transition
+ // from) a variant scheduling class to another scheduling class. Rules are
// described by instances of CodeGenSchedTransition. Note that transitions may
// not be valid for all processors.
OS << " switch (SchedClass) {\n";
@@ -1781,8 +1781,8 @@ void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName,
OS << "unsigned " << ClassName << "::getHwMode() const {\n";
for (unsigned M = 1, NumModes = CGH.getNumModeIds(); M != NumModes; ++M) {
const HwMode &HM = CGH.getMode(M);
- OS << " if (checkFeatures(\"" << HM.Features
- << "\")) return " << M << ";\n";
+ OS << " if (checkFeatures(\"" << HM.Features << "\")) return " << M
+ << ";\n";
}
OS << " return 0;\n}\n";
}
@@ -1808,8 +1808,8 @@ void SubtargetEmitter::emitGetMacroFusions(const std::string &ClassName,
// Produces a subtarget specific function for parsing
// the subtarget features string.
void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) {
- std::vector<Record*> Features =
- Records.getAllDerivedDefinitions("SubtargetFeature");
+ std::vector<Record *> Features =
+ Records.getAllDerivedDefinitions("SubtargetFeature");
llvm::sort(Features, LessRecord());
OS << "// ParseSubtargetFeatures - Parses features string setting specified\n"
@@ -1836,15 +1836,12 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) {
StringRef Value = R->getValueAsString("Value");
StringRef FieldName = R->getValueAsString("FieldName");
- if (Value=="true" || Value=="false")
- OS << " if (Bits[" << Target << "::"
- << Instance << "]) "
- << FieldName << " = " << Value << ";\n";
+ if (Value == "true" || Value == "false")
+ OS << " if (Bits[" << Target << "::" << Instance << "]) " << FieldName
+ << " = " << Value << ";\n";
else
- OS << " if (Bits[" << Target << "::"
- << Instance << "] && "
- << FieldName << " < " << Value << ") "
- << FieldName << " = " << Value << ";\n";
+ OS << " if (Bits[" << Target << "::" << Instance << "] && " << FieldName
+ << " < " << Value << ") " << FieldName << " = " << Value << ";\n";
}
OS << "}\n";
@@ -1955,15 +1952,15 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "std::nullopt, ";
- OS << '\n'; OS.indent(22);
- OS << Target << "WriteProcResTable, "
- << Target << "WriteLatencyTable, "
+ OS << '\n';
+ OS.indent(22);
+ OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, "
<< Target << "ReadAdvanceTable, ";
- OS << '\n'; OS.indent(22);
+ OS << '\n';
+ OS.indent(22);
if (SchedModels.hasItineraries()) {
- OS << Target << "Stages, "
- << Target << "OperandCycles, "
- << Target << "ForwardingPaths";
+ OS << Target << "Stages, " << Target << "OperandCycles, " << Target
+ << "ForwardingPaths";
} else
OS << "nullptr, nullptr, nullptr";
OS << ");\n}\n\n";
@@ -2027,12 +2024,12 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "namespace llvm {\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "FeatureKV[];\n";
OS << "extern const llvm::SubtargetSubTypeKV " << Target << "SubTypeKV[];\n";
- OS << "extern const llvm::MCWriteProcResEntry "
- << Target << "WriteProcResTable[];\n";
- OS << "extern const llvm::MCWriteLatencyEntry "
- << Target << "WriteLatencyTable[];\n";
- OS << "extern const llvm::MCReadAdvanceEntry "
- << Target << "ReadAdvanceTable[];\n";
+ OS << "extern const llvm::MCWriteProcResEntry " << Target
+ << "WriteProcResTable[];\n";
+ OS << "extern const llvm::MCWriteLatencyEntry " << Target
+ << "WriteLatencyTable[];\n";
+ OS << "extern const llvm::MCReadAdvanceEntry " << Target
+ << "ReadAdvanceTable[];\n";
if (SchedModels.hasItineraries()) {
OS << "extern const llvm::InstrStage " << Target << "Stages[];\n";
@@ -2051,15 +2048,15 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "ArrayRef(" << Target << "SubTypeKV, " << NumProcs << "), ";
else
OS << "std::nullopt, ";
- OS << '\n'; OS.indent(24);
- OS << Target << "WriteProcResTable, "
- << Target << "WriteLatencyTable, "
+ OS << '\n';
+ OS.indent(24);
+ OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, "
<< Target << "ReadAdvanceTable, ";
- OS << '\n'; OS.indent(24);
+ OS << '\n';
+ OS.indent(24);
if (SchedModels.hasItineraries()) {
- OS << Target << "Stages, "
- << Target << "OperandCycles, "
- << Target << "ForwardingPaths";
+ OS << Target << "Stages, " << Target << "OperandCycles, " << Target
+ << "ForwardingPaths";
} else
OS << "nullptr, nullptr, nullptr";
OS << ") {}\n\n";
diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
index 52afb4d89162..819abfa965a5 100644
--- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
+++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp
@@ -81,7 +81,7 @@ void SubtargetFeatureInfo::emitNameTable(
uint64_t IndexUB = 0;
for (const auto &SF : SubtargetFeatures)
if (IndexUB <= SF.second.Index)
- IndexUB = SF.second.Index+1;
+ IndexUB = SF.second.Index + 1;
std::vector<std::string> Names;
if (IndexUB > 0)
diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.h b/llvm/utils/TableGen/SubtargetFeatureInfo.h
index 940100448435..b1016ff24e88 100644
--- a/llvm/utils/TableGen/SubtargetFeatureInfo.h
+++ b/llvm/utils/TableGen/SubtargetFeatureInfo.h
@@ -18,7 +18,8 @@
namespace llvm {
struct SubtargetFeatureInfo;
-using SubtargetFeatureInfoMap = std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
+using SubtargetFeatureInfoMap =
+ std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
/// Helper class for storing information on a subtarget feature which
/// participates in instruction matching.
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
index 3afe6b01467b..e0d12abaaa03 100644
--- a/llvm/utils/TableGen/TableGenBackends.h
+++ b/llvm/utils/TableGen/TableGenBackends.h
@@ -46,7 +46,6 @@
// backends, this means that the EmitFoo function is the only thing not in
// the anonymous namespace.
-
// FIXME: Reorganize TableGen so that build dependencies can be more
// accurately expressed. Currently, touching any of the emitters (or
// anything that they transitively depend on) causes everything dependent
@@ -57,7 +56,6 @@
// TableGen binary with as few dependencies as possible on the rest of
// LLVM.
-
namespace llvm {
class raw_ostream;
diff --git a/llvm/utils/TableGen/Types.cpp b/llvm/utils/TableGen/Types.cpp
index aca8e36b683d..35b79b320dc3 100644
--- a/llvm/utils/TableGen/Types.cpp
+++ b/llvm/utils/TableGen/Types.cpp
@@ -15,7 +15,9 @@
using namespace llvm;
-const char *llvm::getMinimalTypeForRange(uint64_t Range, unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
+const char *
+llvm::getMinimalTypeForRange(uint64_t Range,
+ unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
// TODO: The original callers only used 32 and 64 so these are the only
// values permitted. Rather than widen the supported values we should
// allow 64 for the callers that currently use 32 and remove the
diff --git a/llvm/utils/TableGen/Types.h b/llvm/utils/TableGen/Types.h
index f369d61785c4..74f0f9f2792c 100644
--- a/llvm/utils/TableGen/Types.h
+++ b/llvm/utils/TableGen/Types.h
@@ -16,6 +16,6 @@ namespace llvm {
/// MaxSize indicates the largest size of integer to consider (in bits) and only
/// supports values of at least 32.
const char *getMinimalTypeForRange(uint64_t Range, unsigned MaxSize = 64);
-}
+} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index dc037e4409ab..2cf86d38a212 100644
--- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -63,7 +63,8 @@ void emitWebAssemblyDisassemblerTables(
// should be the canonical one. This determines which variant gets
// printed in a disassembly. We want e.g. "call" not "i32.call", and
// "end" when we don't know if its "end_loop" or "end_block" etc.
- bool IsCanonicalExisting = CGIP.second->TheDef->getValueAsBit("IsCanonical");
+ bool IsCanonicalExisting =
+ CGIP.second->TheDef->getValueAsBit("IsCanonical");
// We already have one marked explicitly as canonical, so keep it.
if (IsCanonicalExisting)
continue;
@@ -126,7 +127,8 @@ void emitWebAssemblyDisassemblerTables(
++J) {
size_t K = 0;
for (; K < CurOperandList.size(); ++K) {
- if (OperandTable[J + K] != CurOperandList[K]) break;
+ if (OperandTable[J + K] != CurOperandList[K])
+ break;
}
if (K == CurOperandList.size()) {
OperandStart = J;
diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp
index fef8dc7236f5..e4db995d47f7 100644
--- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp
@@ -83,7 +83,8 @@ void X86CompressEVEXTablesEmitter::printTable(const std::vector<Entry> &Table,
void X86CompressEVEXTablesEmitter::printCheckPredicate(
const PredicateInstMap &PredicateInsts, raw_ostream &OS) {
- OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget *Subtarget) {\n"
+ OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget "
+ "*Subtarget) {\n"
<< " switch (Opc) {\n"
<< " default: return true;\n";
for (const auto &[Key, Val] : PredicateInsts) {
@@ -207,9 +208,9 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) {
NewInst = &TempInst;
}
} else {
- // For each pre-compression instruction look for a match in the appropriate
- // vector (instructions with the same opcode) using function object
- // IsMatch.
+ // For each pre-compression instruction look for a match in the
+ // appropriate vector (instructions with the same opcode) using function
+ // object IsMatch.
auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(Inst));
if (Match != CompressedInsts[Opcode].end())
NewInst = *Match;
@@ -225,7 +226,7 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) {
return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" ||
Name == "HasAVXIFMA";
});
- if(It!= Predicates.end())
+ if (It != Predicates.end())
PredicateInsts[*It].push_back(NewInst);
}
diff --git a/llvm/utils/TableGen/X86DisassemblerShared.h b/llvm/utils/TableGen/X86DisassemblerShared.h
index 093f220fda5e..f60fd477788a 100644
--- a/llvm/utils/TableGen/X86DisassemblerShared.h
+++ b/llvm/utils/TableGen/X86DisassemblerShared.h
@@ -49,9 +49,7 @@ struct OpcodeDecision {
struct ContextDecision {
OpcodeDecision opcodeDecisions[llvm::X86Disassembler::IC_max];
- ContextDecision() {
- memset(opcodeDecisions, 0, sizeof(opcodeDecisions));
- }
+ ContextDecision() { memset(opcodeDecisions, 0, sizeof(opcodeDecisions)); }
};
#endif
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 23886a346824..588d9b22a772 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -41,8 +41,9 @@ static inline const char *stringForContext(InstructionContext insnContext) {
break;
#define ENUM_ENTRY_K_B(n, r, d) \
ENUM_ENTRY(n, r, d) \
- ENUM_ENTRY(n##_K_B, r, d) ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) \
- ENUM_ENTRY(n##_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d)
+ ENUM_ENTRY(n##_K_B, r, d) \
+ ENUM_ENTRY(n##_KZ, r, d) \
+ ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d)
INSTRUCTION_CONTEXTS
#undef ENUM_ENTRY
#undef ENUM_ENTRY_K_B
@@ -595,8 +596,8 @@ static inline bool outranks(InstructionContext upper,
#define ENUM_ENTRY_K_B(n, r, d) \
ENUM_ENTRY(n, r, d) \
ENUM_ENTRY(n##_K_B, r, d) \
- ENUM_ENTRY(n##_KZ_B, r, d) ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) \
- ENUM_ENTRY(n##_B, r, d)
+ ENUM_ENTRY(n##_KZ_B, r, d) \
+ ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d)
static int ranks[IC_max] = {INSTRUCTION_CONTEXTS};
#undef ENUM_ENTRY
#undef ENUM_ENTRY_K_B
@@ -822,7 +823,8 @@ void DisassemblerTables::emitContextDecision(raw_ostream &o1, raw_ostream &o2,
}
i2--;
- o2.indent(i2) << "}};" << "\n";
+ o2.indent(i2) << "}};"
+ << "\n";
}
void DisassemblerTables::emitInstructionInfo(raw_ostream &o,
@@ -859,7 +861,8 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o,
}
o << " },\n";
}
- o << "};" << "\n\n";
+ o << "};"
+ << "\n\n";
o.indent(i * 2) << "static const struct InstructionSpecifier ";
o << INSTRUCTIONS_STR "[" << InstructionSpecifiers.size() << "] = {\n";
@@ -885,7 +888,8 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o,
}
i--;
- o.indent(i * 2) << "};" << "\n";
+ o.indent(i * 2) << "};"
+ << "\n";
}
void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
@@ -1004,7 +1008,8 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
}
i--;
- o.indent(i * 2) << "};" << "\n";
+ o.indent(i * 2) << "};"
+ << "\n";
}
void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.h b/llvm/utils/TableGen/X86DisassemblerTables.h
index 4fbc58bea338..0f382741530c 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.h
+++ b/llvm/utils/TableGen/X86DisassemblerTables.h
@@ -91,8 +91,8 @@ private:
/// @param ModRMTableNum - next table number for adding to ModRMTable.
/// @param decision - The ModR/M decision to emit. This decision has 256
/// entries - emitModRMDecision decides how to compact it.
- void emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
- unsigned &i1, unsigned &i2, unsigned &ModRMTableNum,
+ void emitModRMDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1,
+ unsigned &i2, unsigned &ModRMTableNum,
ModRMDecision &decision) const;
/// emitOpcodeDecision - Emits an OpcodeDecision and all its subsidiary ModR/M
@@ -119,8 +119,8 @@ private:
/// @param ModRMTableNum - next table number for adding to ModRMTable.
/// @param decision - The OpcodeDecision to emit along with its subsidiary
/// structures.
- void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2,
- unsigned &i1, unsigned &i2, unsigned &ModRMTableNum,
+ void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1,
+ unsigned &i2, unsigned &ModRMTableNum,
OpcodeDecision &decision) const;
/// emitContextDecision - Emits a ContextDecision and all its subsidiary
@@ -153,9 +153,9 @@ private:
/// @param decision - The ContextDecision to emit along with its subsidiary
/// structures.
/// @param name - The name for the ContextDecision.
- void emitContextDecision(raw_ostream &o1, raw_ostream &o2,
- unsigned &i1, unsigned &i2, unsigned &ModRMTableNum,
- ContextDecision &decision, const char* name) const;
+ void emitContextDecision(raw_ostream &o1, raw_ostream &o2, unsigned &i1,
+ unsigned &i2, unsigned &ModRMTableNum,
+ ContextDecision &decision, const char *name) const;
/// emitInstructionInfo - Prints the instruction specifier table, which has
/// one entry for each instruction, and contains name and operand
@@ -200,7 +200,8 @@ private:
/// IC is the context corresponding to the mask 0x00, and there are 256
/// possible masks.
///
- /// @param o - The output stream to which the context table should be written.
+ /// @param o - The output stream to which the context table should be
+ /// written.
/// @param i - The indent level for use with the stream.
void emitContextTable(raw_ostream &o, uint32_t &i) const;
@@ -213,9 +214,8 @@ private:
/// @param i1 - The indent level to use with stream o1.
/// @param i2 - The indent level to use with stream o2.
/// @param ModRMTableNum - next table number for adding to ModRMTable.
- void emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
- unsigned &i1, unsigned &i2,
- unsigned &ModRMTableNum) const;
+ void emitContextDecisions(raw_ostream &o1, raw_ostream &o2, unsigned &i1,
+ unsigned &i2, unsigned &ModRMTableNum) const;
/// setTableFields - Uses a ModRMFilter to set the appropriate entries in a
/// ModRMDecision to refer to a particular instruction ID.
@@ -224,10 +224,9 @@ private:
/// @param filter - The filter to use in deciding which entries to populate.
/// @param uid - The unique ID to set matching entries to.
/// @param opcode - The opcode of the instruction, for error reporting.
- void setTableFields(ModRMDecision &decision,
- const ModRMFilter &filter,
- InstrUID uid,
- uint8_t opcode);
+ void setTableFields(ModRMDecision &decision, const ModRMFilter &filter,
+ InstrUID uid, uint8_t opcode);
+
public:
/// Constructor - Allocates space for the class decisions and clears them.
DisassemblerTables();
@@ -247,7 +246,8 @@ public:
/// @param insnContext - The context to use (IC, IC_64BIT, etc.)
/// @param opcode - The last byte of the opcode (not counting any escape
/// or extended opcodes).
- /// @param filter - The ModRMFilter that decides which ModR/M byte values
+ /// @param filter - The ModRMFilter that decides which ModR/M byte
+ /// values
/// correspond to the desired instruction.
/// @param uid - The unique ID of the instruction.
/// @param is32bit - Instructon is only 32-bit
@@ -255,23 +255,17 @@ public:
/// @param ignoresVEX_L - Instruction ignores VEX.L
/// @param ignoresVEX_W - Instruction ignores VEX.W
/// @param AddrSize - Instructions address size 16/32/64. 0 is unspecified
- void setTableFields(OpcodeType type,
- InstructionContext insnContext,
- uint8_t opcode,
- const ModRMFilter &filter,
- InstrUID uid,
- bool is32bit,
- bool noPrefix,
- bool ignoresVEX_L,
- bool ignoresVEX_W,
- unsigned AddrSize);
+ void setTableFields(OpcodeType type, InstructionContext insnContext,
+ uint8_t opcode, const ModRMFilter &filter, InstrUID uid,
+ bool is32bit, bool noPrefix, bool ignoresVEX_L,
+ bool ignoresVEX_W, unsigned AddrSize);
/// specForUID - Returns the instruction specifier for a given unique
/// instruction ID. Used when resolving collisions.
///
/// @param uid - The unique ID of the instruction.
/// @return - A reference to the instruction specifier.
- InstructionSpecifier& specForUID(InstrUID uid) {
+ InstructionSpecifier &specForUID(InstrUID uid) {
if (uid >= InstructionSpecifiers.size())
InstructionSpecifiers.resize(uid + 1);
@@ -282,9 +276,7 @@ public:
// from any instructions added to the tables.
// @return - true if there were; false otherwise.
- bool hasConflicts() {
- return HasConflicts;
- }
+ bool hasConflicts() { return HasConflicts; }
};
} // namespace X86Disassembler
diff --git a/llvm/utils/TableGen/X86ModRMFilters.cpp b/llvm/utils/TableGen/X86ModRMFilters.cpp
index cf7507094fa7..9cfb91c42a17 100644
--- a/llvm/utils/TableGen/X86ModRMFilters.cpp
+++ b/llvm/utils/TableGen/X86ModRMFilters.cpp
@@ -10,14 +10,14 @@
using namespace llvm::X86Disassembler;
-void ModRMFilter::anchor() { }
+void ModRMFilter::anchor() {}
-void DumbFilter::anchor() { }
+void DumbFilter::anchor() {}
-void ModFilter::anchor() { }
+void ModFilter::anchor() {}
-void ExtendedFilter::anchor() { }
+void ExtendedFilter::anchor() {}
-void ExtendedRMFilter::anchor() { }
+void ExtendedRMFilter::anchor() {}
-void ExactFilter::anchor() { }
+void ExactFilter::anchor() {}
diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h
index d2169a8e879b..b579f229379b 100644
--- a/llvm/utils/TableGen/X86ModRMFilters.h
+++ b/llvm/utils/TableGen/X86ModRMFilters.h
@@ -27,9 +27,10 @@ namespace X86Disassembler {
/// ModR/M bytes.
class ModRMFilter {
virtual void anchor();
+
public:
/// Destructor - Override as necessary.
- virtual ~ModRMFilter() { }
+ virtual ~ModRMFilter() {}
/// isDumb - Indicates whether this filter returns the same value for
/// any value of the ModR/M byte.
@@ -50,14 +51,11 @@ public:
/// for operands.
class DumbFilter : public ModRMFilter {
void anchor() override;
+
public:
- bool isDumb() const override {
- return true;
- }
+ bool isDumb() const override { return true; }
- bool accepts(uint8_t modRM) const override {
- return true;
- }
+ bool accepts(uint8_t modRM) const override { return true; }
};
/// ModFilter - Filters based on the mod bits [bits 7-6] of the ModR/M byte.
@@ -66,6 +64,7 @@ public:
class ModFilter : public ModRMFilter {
void anchor() override;
bool R;
+
public:
/// Constructor
///
@@ -86,6 +85,7 @@ class ExtendedFilter : public ModRMFilter {
void anchor() override;
bool R;
uint8_t NNN;
+
public:
/// Constructor
///
@@ -95,9 +95,9 @@ public:
ExtendedFilter(bool r, uint8_t nnn) : R(r), NNN(nnn) {}
bool accepts(uint8_t modRM) const override {
- return (((R && ((modRM & 0xc0) == 0xc0)) ||
- (!R && ((modRM & 0xc0) != 0xc0))) &&
- (((modRM & 0x38) >> 3) == NNN));
+ return (
+ ((R && ((modRM & 0xc0) == 0xc0)) || (!R && ((modRM & 0xc0) != 0xc0))) &&
+ (((modRM & 0x38) >> 3) == NNN));
}
};
@@ -107,6 +107,7 @@ class ExtendedRMFilter : public ModRMFilter {
void anchor() override;
bool R;
uint8_t NNN;
+
public:
/// Constructor
///
@@ -116,8 +117,7 @@ public:
ExtendedRMFilter(bool r, uint8_t nnn) : R(r), NNN(nnn) {}
bool accepts(uint8_t modRM) const override {
- return ((R && ((modRM & 0xc0) == 0xc0)) &&
- ((modRM & 0x7) == NNN));
+ return ((R && ((modRM & 0xc0) == 0xc0)) && ((modRM & 0x7) == NNN));
}
};
/// ExactFilter - The occasional extended opcode (such as VMCALL or MONITOR)
@@ -125,15 +125,14 @@ public:
class ExactFilter : public ModRMFilter {
void anchor() override;
uint8_t ModRM;
+
public:
/// Constructor
///
/// \param modRM The required value of the full ModR/M byte.
ExactFilter(uint8_t modRM) : ModRM(modRM) {}
- bool accepts(uint8_t modRM) const override {
- return (ModRM == modRM);
- }
+ bool accepts(uint8_t modRM) const override { return (ModRM == modRM); }
};
} // namespace X86Disassembler
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index 18f961065c23..873f3aea053c 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -1,4 +1,4 @@
-//===- X86RecognizableInstr.cpp - Disassembler instruction spec --*- C++ -*-===//
+//===- X86RecognizableInstr.cpp - Disassembler instruction spec -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h
index 007c700cdfaf..549fc5b4d543 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -1,4 +1,4 @@
-//===- X86RecognizableInstr.h - Disassembler instruction spec ----*- C++ -*-===//
+//===- X86RecognizableInstr.h - Disassembler instruction spec ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py
index e2b84aeca3ad..ccef274c4c1f 100755
--- a/llvm/utils/git/github-automation.py
+++ b/llvm/utils/git/github-automation.py
@@ -298,6 +298,55 @@ If you don't get any reports, no action is required from you. Your changes are w
return True
+class PRMergeOnBehalfInformation:
+ COMMENT_TAG = "<!--LLVM MERGE ON BEHALF INFORMATION COMMENT-->\n"
+
+ def __init__(
+ self, token: str, repo: str, pr_number: int, author: str, reviewer: str
+ ):
+ self.repo = github.Github(token).get_repo(repo)
+ self.pr = self.repo.get_issue(pr_number).as_pull_request()
+ self.author = author
+ self.reviewer = reviewer
+
+ def can_merge(self, user: str) -> bool:
+ try:
+ return self.repo.get_collaborator_permission(user) in ["admin", "write"]
+ # There is a UnknownObjectException for this scenario, but this method
+ # does not use it.
+ except github.GithubException as e:
+ # 404 means the author was not found in the collaborator list, so we
+ # know they don't have push permissions. Anything else is a real API
+ # issue, raise it so it is visible.
+ if e.status != 404:
+ raise e
+ return False
+
+ def run(self) -> bool:
+ # Check this first because it only costs 1 API point.
+ if self.can_merge(self.author):
+ return
+
+ # A review can be approved more than once, only comment the first time.
+ for comment in self.pr.as_issue().get_comments():
+ if self.COMMENT_TAG in comment.body:
+ return
+
+ # This text is using Markdown formatting.
+ if self.can_merge(self.reviewer):
+ comment = f"""\
+{self.COMMENT_TAG}
+@{self.reviewer} the PR author does not have permission to merge their own PRs yet. Please merge on their behalf."""
+ else:
+ comment = f"""\
+{self.COMMENT_TAG}
+@{self.reviewer} the author of this PR does not have permission to merge and neither do you.
+Please find someone who has merge permissions who can merge it on the author's behalf. This could be one of the other reviewers or you can ask on [Discord](https://discord.com/invite/xS7Z362)."""
+
+ self.pr.as_issue().create_comment(comment)
+ return True
+
+
def setup_llvmbot_git(git_dir="."):
"""
Configure the git repo in `git_dir` with the llvmbot account so
@@ -343,6 +392,7 @@ class ReleaseWorkflow:
branch_repo_name: str,
branch_repo_token: str,
llvm_project_dir: str,
+ requested_by: str,
) -> None:
self._token = token
self._repo_name = repo
@@ -353,6 +403,7 @@ class ReleaseWorkflow:
else:
self._branch_repo_token = self.token
self._llvm_project_dir = llvm_project_dir
+ self._requested_by = requested_by
@property
def token(self) -> str:
@@ -383,6 +434,10 @@ class ReleaseWorkflow:
return self._llvm_project_dir
@property
+ def requested_by(self) -> str:
+ return self._requested_by
+
+ @property
def repo(self) -> github.Repository.Repository:
return github.Github(self.token).get_repo(self.repo_name)
@@ -536,7 +591,7 @@ class ReleaseWorkflow:
self.issue_remove_cherry_pick_failed_label()
return self.create_pull_request(
- self.branch_repo_owner, self.repo_name, branch_name
+ self.branch_repo_owner, self.repo_name, branch_name, commits
)
def check_if_pull_request_exists(
@@ -545,7 +600,9 @@ class ReleaseWorkflow:
pulls = repo.get_pulls(head=head)
return pulls.totalCount != 0
- def create_pull_request(self, owner: str, repo_name: str, branch: str) -> bool:
+ def create_pull_request(
+ self, owner: str, repo_name: str, branch: str, commits: List[str]
+ ) -> bool:
"""
Create a pull request in `self.repo_name`. The base branch of the
pull request will be chosen based on the the milestone attached to
@@ -567,9 +624,15 @@ class ReleaseWorkflow:
print("PR already exists...")
return True
try:
+ commit_message = repo.get_commit(commits[-1]).commit.message
+ message_lines = commit_message.splitlines()
+ title = "{}: {}".format(release_branch_for_issue, message_lines[0])
+ body = "Backport {}\n\nRequested by: @{}".format(
+ " ".join(commits), self.requested_by
+ )
pull = repo.create_pull(
- title=f"PR for {issue_ref}",
- body="resolves {}".format(issue_ref),
+ title=title,
+ body=body,
base=release_branch_for_issue,
head=head,
maintainer_can_modify=False,
@@ -651,6 +714,17 @@ pr_buildbot_information_parser = subparsers.add_parser("pr-buildbot-information"
pr_buildbot_information_parser.add_argument("--issue-number", type=int, required=True)
pr_buildbot_information_parser.add_argument("--author", type=str, required=True)
+pr_merge_on_behalf_information_parser = subparsers.add_parser(
+ "pr-merge-on-behalf-information"
+)
+pr_merge_on_behalf_information_parser.add_argument(
+ "--issue-number", type=int, required=True
+)
+pr_merge_on_behalf_information_parser.add_argument("--author", type=str, required=True)
+pr_merge_on_behalf_information_parser.add_argument(
+ "--reviewer", type=str, required=True
+)
+
release_workflow_parser = subparsers.add_parser("release-workflow")
release_workflow_parser.add_argument(
"--llvm-project-dir",
@@ -683,6 +757,12 @@ llvmbot_git_config_parser = subparsers.add_parser(
"setup-llvmbot-git",
help="Set the default user and email for the git repo in LLVM_PROJECT_DIR to llvmbot",
)
+release_workflow_parser.add_argument(
+ "--requested-by",
+ type=str,
+ required=True,
+ help="The user that requested this backport",
+)
args = parser.parse_args()
@@ -704,6 +784,11 @@ elif args.command == "pr-buildbot-information":
args.token, args.repo, args.issue_number, args.author
)
pr_buildbot_information.run()
+elif args.command == "pr-merge-on-behalf-information":
+ pr_merge_on_behalf_information = PRMergeOnBehalfInformation(
+ args.token, args.repo, args.issue_number, args.author, args.reviewer
+ )
+ pr_merge_on_behalf_information.run()
elif args.command == "release-workflow":
release_workflow = ReleaseWorkflow(
args.token,
@@ -712,6 +797,7 @@ elif args.command == "release-workflow":
args.branch_repo,
args.branch_repo_token,
args.llvm_project_dir,
+ args.requested_by,
)
if not release_workflow.release_branch_for_issue:
release_workflow.issue_notify_no_milestone(sys.stdin.readlines())
diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
index 4babd37e9485..d484ff9cddd9 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
@@ -100,6 +100,10 @@ clang_tablegen("BuiltinsBPF") {
args = [ "-gen-clang-builtins" ]
}
+clang_tablegen("BuiltinsRISCV") {
+ args = [ "-gen-clang-builtins" ]
+}
+
# ARM CDE, MVE, and NEON.
clang_tablegen("arm_neon") {
diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn
index ff793190bd28..14cf759266e5 100644
--- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn
@@ -141,6 +141,7 @@ static_library("AST") {
"StmtCXX.cpp",
"StmtIterator.cpp",
"StmtObjC.cpp",
+ "StmtOpenACC.cpp",
"StmtOpenMP.cpp",
"StmtPrinter.cpp",
"StmtProfile.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
index 1486d16a6461..bbe937301620 100644
--- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
@@ -27,6 +27,7 @@ static_library("Basic") {
"//clang/include/clang/Basic:AttrSubMatchRulesList",
"//clang/include/clang/Basic:Builtins",
"//clang/include/clang/Basic:BuiltinsBPF",
+ "//clang/include/clang/Basic:BuiltinsRISCV",
"//clang/include/clang/Basic:DiagnosticGroups",
"//clang/include/clang/Basic:RegularKeywordAttrInfo",
"//clang/include/clang/Basic:arm_cde_builtins",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index d7fee7da8479..45125ce25085 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -505,7 +505,6 @@ if (current_toolchain == default_toolchain) {
"__fwd/array.h",
"__fwd/bit_reference.h",
"__fwd/fstream.h",
- "__fwd/get.h",
"__fwd/hash.h",
"__fwd/ios.h",
"__fwd/istream.h",
diff --git a/llvm/utils/gn/secondary/lld/ELF/BUILD.gn b/llvm/utils/gn/secondary/lld/ELF/BUILD.gn
index bd4a4f5d984b..d9037256e142 100644
--- a/llvm/utils/gn/secondary/lld/ELF/BUILD.gn
+++ b/llvm/utils/gn/secondary/lld/ELF/BUILD.gn
@@ -39,6 +39,7 @@ static_library("ELF") {
"Arch/PPC64.cpp",
"Arch/RISCV.cpp",
"Arch/SPARCV9.cpp",
+ "Arch/SystemZ.cpp",
"Arch/X86.cpp",
"Arch/X86_64.cpp",
"CallGraphSort.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
index 60562eff89dd..6c667b280aa7 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
@@ -37,6 +37,7 @@ static_library("CPlusPlus") {
"LibCxxSpan.cpp",
"LibCxxTuple.cpp",
"LibCxxUnorderedMap.cpp",
+ "LibCxxValarray.cpp",
"LibCxxVariant.cpp",
"LibCxxVector.cpp",
"LibStdcpp.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
index d8292df8c0e7..98c2068f6da2 100644
--- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
@@ -38,6 +38,7 @@ executable("lldb-dap") {
# FIXME: rpath/install_name stuff on macOS for framework on macOS
sources = [
+ "Breakpoint.cpp",
"BreakpointBase.cpp",
"DAP.cpp",
"ExceptionBreakpoint.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn
index a594d2afbf7d..87e58608617a 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/IR/BUILD.gn
@@ -85,6 +85,10 @@ gen_arch_intrinsics("IntrinsicsS390") {
intrinsic_prefix = "s390"
}
+gen_arch_intrinsics("IntrinsicsSPIRV") {
+ intrinsic_prefix = "spv"
+}
+
gen_arch_intrinsics("IntrinsicsVE") {
intrinsic_prefix = "ve"
}
@@ -128,6 +132,7 @@ group("public_tablegen") {
":IntrinsicsR600",
":IntrinsicsRISCV",
":IntrinsicsS390",
+ ":IntrinsicsSPIRV",
":IntrinsicsVE",
":IntrinsicsWebAssembly",
":IntrinsicsX86",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
index 0cce6558e009..00e1888da64d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
@@ -23,6 +23,7 @@ static_library("Instrumentation") {
"InstrProfiling.cpp",
"Instrumentation.cpp",
"KCFI.cpp",
+ "PGOForceFunctionAttrs.cpp",
"MemProfiler.cpp",
"MemorySanitizer.cpp",
"PGOInstrumentation.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
index bed26df94e2c..f080c06f10df 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
@@ -35,6 +35,7 @@ static_library("Scalar") {
"InferAddressSpaces.cpp",
"InferAlignment.cpp",
"InstSimplifyPass.cpp",
+ "JumpTableToSwitch.cpp",
"JumpThreading.cpp",
"LICM.cpp",
"LoopAccessAnalysisPrinter.cpp",
@@ -50,7 +51,6 @@ static_library("Scalar") {
"LoopLoadElimination.cpp",
"LoopPassManager.cpp",
"LoopPredication.cpp",
- "LoopRerollPass.cpp",
"LoopRotation.cpp",
"LoopSimplifyCFG.cpp",
"LoopSink.cpp",
diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md
index 9819843c563c..82bc61dd8c3a 100644
--- a/mlir/docs/Diagnostics.md
+++ b/mlir/docs/Diagnostics.md
@@ -119,6 +119,14 @@ op->emitError() << anotherOp;
op->emitRemark() << anotherOp;
```
+To make a custom type compatible with Diagnostics, one must implement the
+following friend function.
+
+```c++
+friend mlir::Diagnostic &operator<<(
+ mlir::Diagnostic &diagnostic, const MyType &foo);
+```
+
### Attaching notes
Unlike many other compiler frameworks, notes in MLIR cannot be emitted directly.
diff --git a/mlir/docs/Tutorials/transform/Ch1.md b/mlir/docs/Tutorials/transform/Ch1.md
index 7a299a48600b..b0fdf085854c 100644
--- a/mlir/docs/Tutorials/transform/Ch1.md
+++ b/mlir/docs/Tutorials/transform/Ch1.md
@@ -6,7 +6,7 @@ The Transform dialect allows one to precisely target transformations at specific
Transform IR operations operate on values that may be associated with payload IR operations, values or attributes. We call the first two kinds of values operation and value handles, respectively. We call the last kind of values parameters.
-The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation merely applies other transform operations listed in its body one after the other.
+The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation, `transform.named_sequence` merely applies other transform operations listed in its body one after the other, similarly to a function or a macro.
Let us illustrate this with a simple sequence of transformations on the common “fully connected + bias + ReLU†ML layer, which boils down to performing a matrix multiplication, followed by an (elementwise) matrix addition and taking an elementwise maximum with 0. This can be expressed using the following IR:
@@ -14,7 +14,7 @@ Let us illustrate this with a simple sequence of transformations on the common â
func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>,
%bias: tensor<512x512xf32>, %output: tensor<512x512xf32>)
-> tensor<512x512xf32> {
- // Matrix-matrix multiplication.
+ // Matrix-matrix multiplication.
%matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>)
outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32>
@@ -22,7 +22,7 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>,
%biased = linalg.elemwise_binary { fun = #linalg.binary_fn<add> }
ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>)
outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32>
-
+
// Elementwise max with 0 (ReLU).
%c0f = arith.constant 0.0 : f32
%relued = linalg.elemwise_binary { fun = #linalg.binary_fn<max_signed> }
@@ -37,30 +37,34 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>,
For performance reasons, we would like to tile and fuse these operations to exploit cache locality. This is a sequence of transformations that need to be performed one after another, so we naturally start with the corresponding top-level transform operation.
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">):
+ transform.yield
+ }
}
```
There are several aspects worth noticing in this operation.
-The first entry block argument is mandatory for top-level transform operations and is associated with the top-level payload operation that sequence is applied to, for example, a module or a function. This operation is specified when calling `applyTransforms`.
+Its special name, `@__transform_main` and the first argument are mandated by the interpreter pass, similarly to how the entry point of C programs needs to be called `main` and may have the `int (int argc, char** argv)` signature. This argument will be associated with the top-level payload operation, most often the operation that the pass is applied to. Note that none of this is required when applying the transformation _programmatically_ via `applyTransforms` or `applyNamedSequence`.
The remaining entry block arguments are optional and can be associated with payload attributes, operations or values that are useful in the sequence. These are also specified when calling `applyTransforms`. In our case, we are interested in the matrix multiplication and elementwise operations that we are going to tile and fuse.
All value handles have Transform dialect types. These types specify certain properties of the payload IR entities associated with them. In this example, `transform.any_op` indicates that the handle is associated with arbitrary payload operations. On the contrary, `transform.op<"X">` indicates that the handle is associated _only_ with payload operations of kind `X`. These constraints are verified when the handle/payload association is created. For entry block arguments of top-level transform operations, this happens early in the `applyTransforms` function. If the constraints are not satisfied, the transform application fails and produces diagnostics for the user.
+Finally, the operation is wrapped in a module with the `transform.with_named_sequence` attribute that triggers all necessary verifications if multiple named sequences exist.
+
## Failure Propagation
-Speaking about diagnostics, the `sequence` operation itself has a mandatory attribute specifying the failure propagation mode. There are two options:
+The Transform dialect infrastructure has a particular mechanism for handling diagnostics that supports recoverable errors. It is best understood by considering the (unnamed) sequence operation that has a mandatory attribute specifying the failure propagation mode. There are two options:
* “propagate†makes the sequence transformation fail if any of the nested transformation fails;
* “suppress†makes the sequence succeed even if one of the nested transformations fails, but without attempting to perform the transformations following the failed one in the sequence.
-This latter allows the transformation to continue despite (recoverable) errors. As we are only building the transformation, it is preferable to propagate failures so we know when something did not apply.
+This latter allows the transformation script surrounding the sequence to continue despite errors within the sequence, assuming they are recoverable. As we are only building the transformation script, it is preferable to propagate failures so we know when something did not apply.
To check or debug a transform sequence, it is possible to print various entities associated with the transform IR values. For example, we can print the operations associated with the handles:
@@ -83,27 +87,26 @@ Since we don’t want to recompile the compiler every time we change a transform
```sh
-$ mlir-opt matmul.mlir --pass-pipeline="
- builtin.module(test-transform-dialect-interpreter{
- bind-first-extra-to-ops=linalg.matmul
- bind-second-extra-to-ops=linalg.elemwise_binary})"
+$ mlir-opt sequence.mlir --pass-pipeline="
+ builtin.module(transform-interpreter{
+ debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary})"
```
-The `matmul.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter will find the first top-level transform operation in the root operation of the pass (the module in our case) and apply it to that root operation. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. Running this pass results in the expected remarks:
+The `sequence.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter pass will apply the `@__transform_main` named sequence to the anchor operation of the pass. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. Running this pass results in the expected remarks:
```sh
-matmul.mlir:7:13: remark: matmul
+sequence.mlir:7:13: remark: matmul
%matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>)
^
-matmul.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
-matmul.mlir:10:13: remark: elemwise_binaries
+sequence.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
+sequence.mlir:10:13: remark: elemwise_binaries
%biased = linalg.elemwise_binary { fun = #linalg.binary_fn<add> }
^
-matmul.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
-matmul.mlir:14:13: remark: elemwise_binaries
+sequence.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
+sequence.mlir:14:13: remark: elemwise_binaries
%relued = linalg.elemwise_binary { fun = #linalg.binary_fn<max_signed> }
^
-matmul.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn<max_signed>} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
+sequence.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn<max_signed>} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
```
Note that `%arg2` is associated with both elementwise payload operations. Any handle is associated with a list of entities. Individual transformations may or may not care about the order of elements in that list.
@@ -114,26 +117,33 @@ Note that `%arg2` is associated with both elementwise payload operations. Any ha
Now that we have handles to the operations we want to transform, we are ready to apply the transformations. Let us first try tiling the matmul operation itself.
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // The actual tiling transformation takes tile sizes as attributes.
- %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
- : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // The actual tiling transformation takes tile sizes as attributes.
+ %loop, %tiled = transform.structured.tile_using_forall %arg1
+ tile_sizes [4, 32]
+ : (!transform.op<"linalg.matmul">)
+ -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
}
```
-The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_using_forall-transformtiletoforallop):
+The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_using_forall-transformtileusingforallop):
-* A handle to the `scf.forall` “multi-for” loop around tensors.
* A handle to `linalg.generic` operating on the subset of the original data.
+* A handle to the `scf.forall` “multi-for” loop around tensors.
Running this transformation with the same command as above expectedly produces the tiled code.
```mlir
-func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2: tensor<512x512xf32>, %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> {
+func.func @fc_relu(%arg0: tensor<512x512xf32>,
+ %arg1: tensor<512x512xf32>,
+ %arg2: tensor<512x512xf32>,
+ %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = scf.forall (%arg4, %arg5) in (128, 16) shared_outs(%arg6 = %arg3) -> (tensor<512x512xf32>) {
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
@@ -144,7 +154,7 @@ func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2
: tensor<512x512xf32> to tensor<512x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg6[%3, %4] [4, 32] [1, 1]
: tensor<512x512xf32> to tensor<4x32xf32>
- %5 = linalg.matmul
+ %5 = linalg.matmul
ins(%extracted_slice, %extracted_slice_0
: tensor<4x512xf32>, tensor<512x32xf32>)
outs(%extracted_slice_1 : tensor<4x32xf32>) -> tensor<4x32xf32>
@@ -168,78 +178,79 @@ Besides producing new handles, the tiling transform operation _consumes_ the ope
## Handle Invalidation and Expensive Checks Mode
-Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter provides a set of additional expensive checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode.
+Undefined behavior is difficult to grapple with when it does happen, so the Transform dialect interpreter defaults to performing a set of additional, potentially expensive, checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode.
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // The actual tiling transformation takes tile sizes as attributes.
- %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
- : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
-
- // This is trying to use an invalidated handle leading to undefined behavior.
- transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul">
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // The actual tiling transformation takes tile sizes as attributes.
+ %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
+ : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
+
+ // This is trying to use an invalidated handle leading to undefined behavior.
+ transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul">
+ transform.yield
+ }
}
```
However, with the expensive checks enabled in the interpreter, a nice diagnostic is produced:
```sh
-$ mlir-opt matmul.mlir --pass-pipeline="
- builtin.module(test-transform-dialect-interpreter{
- bind-first-extra-to-ops=linalg.matmul
- bind-second-extra-to-ops=linalg.elemwise_binary
- enable-expensive-checks})"
-```
-
-```sh
-matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
+sequence.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
transform.debug.emit_remark_at %mm, "elemwise_binaries" : !transform.any_op
^
-matmul.mlir:26:9: note: handle to invalidated ops
+sequence.mlir:26:9: note: handle to invalidated ops
%mm = transform.cast %matmul : !transform.op<"linalg.matmul"> to !transform.any_op
^
-matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
+sequence.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
%loop, %tiled = transform.structured.tile_using_forall %mm tile_sizes [4, 32]
```
-One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead?
+When compile-time performance is a concern, and the transformation sequence is sufficiently stable, it is possible to disable expensive checks in the interpreter for improved performance by providing the `disable-expensive-checks` option to the pass or by setting the corresponding flag in the `TransformOptions` passed into `applyTransforms`.
+
+One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead?
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // We can cast one type to another as long as operations are compatible
- // with both types. This creates "aliasing" handles.
- %casted = transform.cast %arg1 : !transform.op<"linalg.matmul">
- to !transform.any_op
-
- // The actual tiling transformation takes tile sizes as attributes.
- %loop, %tiled = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
- : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
-
- // Consuming an operand invalidates the consumed handle and any other handle that is
- // associated with the same payload operations, or payload operations nested in them.
- transform.debug.emit_remark_at %casted, "remark"
- : !transform.any_op
- transform.yield
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // We can cast one type to another as long as operations are compatible
+ // with both types. This creates "aliasing" handles.
+ %casted = transform.cast %arg1 : !transform.op<"linalg.matmul">
+ to !transform.any_op
+
+ // The actual tiling transformation takes tile sizes as attributes.
+ %loop, %tiled = transform.structured.tile_using_forall %arg1
+ tile_sizes [4, 32]
+ : (!transform.op<"linalg.matmul">)
+ -> (!transform.any_op, !transform.any_op)
+
+ // Consuming an operand invalidates the consumed handle and any other handle
+ // that is associated with the same payload operations, or payload
+ // operations nested in them.
+ transform.debug.emit_remark_at %casted, "remark"
+ : !transform.any_op
+ transform.yield
+ }
}
```
Both `%arg1` and `%casted` reference the same payload operation. Extending the reference analogy, these references alias. Naturally, when the payload operation is erased, all references to it become dangling. This is also the case for handles. In fact, consuming an operand invalidates the operand handle as well as any other handle that is associated with any of the same payload operations. The payload IR consideration is recursive: a handle associated with a payload operation _nested_ in the erased one is also invalidated (because erasing the operation also erases its regions and all contained operations). The expensive-checks mode can also handle this case.
```sh
-matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
+sequence.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op
transform.debug.emit_remark_at %matmul, "elemwise_binaries" : !transform.op<"linalg.matmul">
^
-matmul.mlir:21:29: note: handle to invalidated ops
+sequence.mlir:21:29: note: handle to invalidated ops
^bb0(%root: !transform.any_op, %matmul: !transform.op<"linalg.matmul">, %elemwise: !transform.op<"linalg.elemwise_binary">):
^
-matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
+sequence.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them
%loop, %tiled = transform.structured.tile_using_forall %mm tile_sizes [4, 32]
```
@@ -248,39 +259,41 @@ matmul.mlir:27:19: note: invalidated by this transform op that consumes its oper
Going back to the transformation sequence, we have tiled the matrix multiplication, but we also want to tile and fuse the elementwise operations. The typical way of doing this in the structured operations paradigm is to tile the last operation in some acyclic dataflow graph, and then progressively fuse the operations that produce its operands. This removes the need to explicitly tile all operations as fusion can adapt their sizes and inject recomputation if desired. So instead of tiling the matmul operation, we are going to tile the last operation in the chain, and then fuse the preceding operations into the loops produced by tiling.
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // Since the %arg2 handle is associated with both elementwise operations,
- // we need to split it into two handles so we can target only the second
- // elementwise operation.
- %add, %max = transform.split_handle %arg2
- : (!transform.op<"linalg.elemwise_binary">)
- -> (!transform.any_op, !transform.any_op)
-
- // The actual tiling transformation takes tile sizes as attributes. It
- // produces a handle to the loop generated during tiling.
- %tiled_max, %loop =
- transform.structured.tile_using_forall %max tile_sizes [8, 32]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // We can now fuse the other operations into the loop. Here, we fuse
- // operations one by one. This requires the operation that is being fused to
- // define the value used within the loop, so the order of such fusions is
- // important. We could also use "transform.merge_handles" to obtain a single
- // handle to all operations and give it to `fuse_into_containing_op` that
- // would take care of the ordering in this case.
- %add_fused, %loop_0 =
- transform.structured.fuse_into_containing_op %add into %loop
- : (!transform.any_op, !transform.any_op)
- -> (!transform.any_op, !transform.any_op)
- %matmul_fused, %loop_1 =
- transform.structured.fuse_into_containing_op %arg1 into %loop_0
- : (!transform.op<"linalg.matmul">, !transform.any_op)
- -> (!transform.any_op, !transform.any_op)
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // Since the %arg2 handle is associated with both elementwise operations,
+ // we need to split it into two handles so we can target only the second
+ // elementwise operation.
+ %add, %max = transform.split_handle %arg2
+ : (!transform.op<"linalg.elemwise_binary">)
+ -> (!transform.any_op, !transform.any_op)
- transform.yield
+ // The actual tiling transformation takes tile sizes as attributes. It
+ // produces a handle to the loop generated during tiling.
+ %tiled_max, %loop =
+ transform.structured.tile_using_forall %max tile_sizes [8, 32]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // We can now fuse the other operations into the loop. Here, we fuse
+ // operations one by one. This requires the operation that is being fused to
+ // define the value used within the loop, so the order of such fusions is
+ // important. We could also use "transform.merge_handles" to obtain a single
+ // handle to all operations and give it to `fuse_into_containing_op` that
+ // would take care of the ordering in this case.
+ %add_fused, %loop_0 =
+ transform.structured.fuse_into_containing_op %add into %loop
+ : (!transform.any_op, !transform.any_op)
+ -> (!transform.any_op, !transform.any_op)
+ %matmul_fused, %loop_1 =
+ transform.structured.fuse_into_containing_op %arg1 into %loop_0
+ : (!transform.op<"linalg.matmul">, !transform.any_op)
+ -> (!transform.any_op, !transform.any_op)
+
+ transform.yield
+ }
}
```
@@ -291,64 +304,68 @@ This achieves the desired tiling and fusion.
Finally, let us assume there exists an efficient microkernel, or a hardware instruction expressed as an intrinsic function, for a 4x4 matrix multiplication. For this purpose, we need to tile the fused operation to the desired size, and then outline it. The resulting function call can then be replaced with a call to the microkernel.
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // Since the %arg2 handle is associated with both elementwise operations,
- // we need to split it into two handles so we can target only the second
- // elementwise operation.
- %add, %max = transform.split_handle %arg2
- : (!transform.op<"linalg.elemwise_binary">)
- -> (!transform.any_op, !transform.any_op)
-
- // The actual tiling transformation takes tile sizes as attributes. It
- // produces a handle to the loop generated during tiling.
- %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // We can now fuse the other operations into the loop. Here, we fuse
- // operations one by one. This requires the operation that is being fused to
- // define the value used within the loop, so the order of such fusions is
- // important. We could also use "transform.merge_handles" to obtain a single
- // handle to all operations and give it to `fuse_into_containing_op` that
- // would take care of the ordering in this case.
- %add_fused, %loop_0 =
- transform.structured.fuse_into_containing_op %add into %loop
- : (!transform.any_op, !transform.any_op)
- -> (!transform.any_op, !transform.any_op)
- %matmul_fused, %loop_1 =
- transform.structured.fuse_into_containing_op %arg1 into %loop_0
- : (!transform.op<"linalg.matmul">, !transform.any_op)
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // Since the %arg2 handle is associated with both elementwise operations,
+ // we need to split it into two handles so we can target only the second
+ // elementwise operation.
+ %add, %max = transform.split_handle %arg2
+ : (!transform.op<"linalg.elemwise_binary">)
-> (!transform.any_op, !transform.any_op)
- // Tile again to get the desired size. Note that this time this tiles the
- // "add" operation and fuses matmul into the loop, but doesn't affect the
- // "max" operation. This illustrates the precise targeting with the transform
- // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
- // of which having the same kind.
- %tiled_2, %loop_2 =
- transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
+ // The actual tiling transformation takes tile sizes as attributes. It
+ // produces a handle to the loop generated during tiling.
+ %tiled, %loop = transform.structured.tile_using_forall %max
+ tile_sizes [8, 32]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused_2, %loop_3 =
- transform.structured.fuse_into_containing_op %matmul_fused into %loop_2
- : (!transform.any_op, !transform.any_op)
- -> (!transform.any_op, !transform.any_op)
- // Since outlining is currently only implemented for region-holding operations
- // such as loops, use tiling to size 1 to materialize the outer loop that is
- // going to be outlined.
- %_, %outline_target =
- transform.structured.tile_using_forall %tiled_2 tile_sizes [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %matmul_fused_2
- into %outline_target
- : (!transform.any_op, !transform.any_op)
- -> (!transform.any_op, !transform.any_op)
- %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
- : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">)
-
- transform.yield
+ // We can now fuse the other operations into the loop. Here, we fuse
+ // operations one by one. This requires the operation that is being fused to
+ // define the value used within the loop, so the order of such fusions is
+ // important. We could also use "transform.merge_handles" to obtain a single
+ // handle to all operations and give it to `fuse_into_containing_op` that
+ // would take care of the ordering in this case.
+ %add_fused, %loop_0 =
+ transform.structured.fuse_into_containing_op %add into %loop
+ : (!transform.any_op, !transform.any_op)
+ -> (!transform.any_op, !transform.any_op)
+ %matmul_fused, %loop_1 =
+ transform.structured.fuse_into_containing_op %arg1 into %loop_0
+ : (!transform.op<"linalg.matmul">, !transform.any_op)
+ -> (!transform.any_op, !transform.any_op)
+
+ // Tile again to get the desired size. Note that this time this tiles the
+ // "add" operation and fuses matmul into the loop, but doesn't affect the
+ // "max" operation. This illustrates the precise targeting with the
+ // transform dialect. Otherwise, it is difficult to differentiate "add" and
+ // "max", both of which having the same kind.
+ %tiled_2, %loop_2 =
+ transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused_2, %loop_3 =
+ transform.structured.fuse_into_containing_op %matmul_fused into %loop_2
+ : (!transform.any_op, !transform.any_op)
+ -> (!transform.any_op, !transform.any_op)
+
+ // Since outlining is currently only implemented for region-holding
+ // operations such as loops, use tiling to size 1 to materialize the outer
+ // loop that is going to be outlined.
+ %_, %outline_target =
+ transform.structured.tile_using_forall %tiled_2 tile_sizes [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %matmul_fused_2
+ into %outline_target
+ : (!transform.any_op, !transform.any_op)
+ -> (!transform.any_op, !transform.any_op)
+ %func, %call = transform.loop.outline %outline_target
+ {func_name = "outlined"}
+ : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">)
+
+ transform.yield
+ }
}
```
diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md
index ac6d7d42523e..1aaefd2f2c30 100644
--- a/mlir/docs/Tutorials/transform/Ch2.md
+++ b/mlir/docs/Tutorials/transform/Ch2.md
@@ -10,37 +10,40 @@ The Transform dialect uses the dialect extension mechanism to allow additional o
// In MyExtension.cpp.
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-// Define a new Transform dialect extension. This uses the CRTP idiom to identify
-// extensions.
+// Define a new Transform dialect extension. This uses the CRTP idiom to
+// identify extensions.
class MyExtension : public ::mlir::transform::TransformDialectExtension<MyExtension> {
public:
// The extension must derive the base constructor.
using Base::Base;
- // This function initializes the extension, similarly to `initialize` in dialect
- // definitions. List individual operations and dependent dialects here.
+ // This function initializes the extension, similarly to `initialize` in
+ // dialect definitions. List individual operations and dependent dialects
+ // here.
void init();
};
void MyExtension::init() {
- // Similarly to dialects, an extension can declare a dependent dialect. This dialect
- // will be loaded along with the extension and, therefore, along with the Transform
- // dialect. Only declare as dependent the dialects that contain the attributes or
- // types used by transform operations. Do NOT declare as dependent the dialects
- // produced during the transformation.
+ // Similarly to dialects, an extension can declare a dependent dialect. This
+ // dialect will be loaded along with the extension and, therefore, along with
+ // the Transform dialect. Only declare as dependent the dialects that contain
+ // the attributes or types used by transform operations. Do NOT declare as
+ // dependent the dialects produced during the transformation.
+ //
// declareDependentDialect<MyDialect>();
- // When transformations are applied, they may produce new operations from previously
- // unloaded dialects. Typically, a pass would need to declare itself dependent on
- // the dialects containing such new operations. To avoid confusion with the dialects
- // the extension itself depends on, the Transform dialects differentiates between:
+ // When transformations are applied, they may produce new operations from
+ // previously unloaded dialects. Typically, a pass would need to declare
+ // itself dependent on the dialects containing such new operations. To avoid
+  // confusion with the dialects the extension itself depends on, the Transform
+  // dialect differentiates between:
// - dependent dialects, which are used by the transform operations, and
- // - generated dialects, which contain the entities (attributes, operations,
- // types) that may be produced by applying the transformation even when not
- // present in the original payload IR.
- // In the following chapter, we will be add operations that generate function calls
- // and structured control flow operations, so let's declare the corresponding
- // dialects as generated.
+ // - generated dialects, which contain the entities (attributes, operations,
+ // types) that may be produced by applying the transformation even when
+ // not present in the original payload IR.
+  // In the following chapter, we will add operations that generate function
+ // calls and structured control flow operations, so let's declare the
+ // corresponding dialects as generated.
declareGeneratedDialect<::mlir::scf::SCFDialect>();
declareGeneratedDialect<::mlir::func::FuncDialect>();
@@ -89,7 +92,7 @@ mlir_tablegen(MyExtension.cpp.inc -gen-op-defs)
# Add a CMakeTarget we can depend on to ensure the generation happens before the compilation.
add_public_tablegen_target(MyExtensionIncGen)
-# Don't forget to generate the documentation, this will produce a MyExtension.md under
+# Don't forget to generate the documentation, this will produce a MyExtension.md under
# Dialects.
add_mlir_doc(MyExtension MyExtension Dialects/ -gen-op-doc)
```
@@ -103,7 +106,8 @@ add_mlir_library(
# Built from the following source files.
MyExtension.cpp
- # Make sure ODS declaration and definitions are generated before compiling this.
+ # Make sure ODS declaration and definitions are generated before compiling
+ # this.
DEPENDS
MyExtensionIncGen
@@ -136,10 +140,10 @@ This will generate two files, `MyExtension.h.inc` and `MyExtension.cpp.inc`, tha
void MyExtension::init() {
// …
- // Finally, we register the additional transform operations with the dialect. List all
- // operations generated from ODS. This call will perform additional checks that the
- // operations implement the transform and memory effect interfaces required by the
- // dialect interpreter and assert if they do not.
+ // Finally, we register the additional transform operations with the dialect.
+ // List all operations generated from ODS. This call will perform additional
+ // checks that the operations implement the transform and memory effect
+ // interfaces required by the dialect interpreter and assert if they do not.
registerTransformOps<
#define GET_OP_LIST
#include "MyExtension.cpp.inc"
@@ -154,34 +158,36 @@ With this setup, we are now ready to define the new transform operation to rewri
```tablegen
// In MyExtension.td.
-// Define the new operation. By convention, prefix its name with the name of the dialect
-// extension, "my.". The full operation name will be further prefixed with "transform.".
+// Define the new operation. By convention, prefix its name with the name of the
+// dialect extension, "my.". The full operation name will be further prefixed
+// with "transform.".
def ChangeCallTargetOp : Op<Transform_Dialect, "my.change_call_target",
- // Indicate that the operation implements the required TransformOpInterface and
- // MemoryEffectsOpInterface.
+ // Indicate that the operation implements the required TransformOpInterface
+ // and MemoryEffectsOpInterface.
[DeclareOpInterfaceMethods<TransformOpInterface>,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
- // Provide a brief and a full description. It is recommended that the latter describes
- // the effects on the operands and how the operation processes various failure modes.
+ // Provide a brief and a full description. It is recommended that the latter
+ // describes the effects on the operands and how the operation processes
+ // various failure modes.
let summary = "Changes the callee of a call operation to the specified one";
let description = [{
- For each `func.call` payload operation associated with the handle, changes its
- callee to be the symbol whose name is provided as an attribute to this operation.
+ For each `func.call` payload operation associated with the handle, changes
+ its callee to be the symbol whose name is provided as an attribute to this operation.
- Generates a silenceable failure if the operand is associated with payload operations
- that are not `func.call`.
- Only reads the operand.
+ Generates a silenceable failure if the operand is associated with payload operations that are not `func.call`. Only reads the operand.
}];
- // The arguments include the handle to the payload operations and the attribute that
- // specifies the new callee. The handle must implement TransformHandleTypeInterface.
- // We use a string attribute as the symbol may not exist in the transform IR so the
- // verification may fail.
+ // The arguments include the handle to the payload operations and the
+ // attribute that specifies the new callee. The handle must implement
+ // TransformHandleTypeInterface.
+ // We use a string attribute as the symbol may not exist in the transform IR
+ // so the verification may fail.
let arguments = (ins
TransformHandleTypeInterface:$call,
StrAttr:$new_target);
- // The results are empty as the transformation does not produce any new payload.
+ // The results are empty as the transformation does not produce any new
+ // payload.
let results = (outs);
// Provide nice syntax.
@@ -224,8 +230,8 @@ must be modified with the provided rewriter.
// It can also carry additional user-defined state.
::mlir::transform::TransformState &state) {
- // First, we need to obtain the list of payload operations that are associated with
- // the operand handle.
+ // First, we need to obtain the list of payload operations that are associated
+ // with the operand handle.
auto payload = state.getPayloadOps(getCall());
// Then, we iterate over the list of operands and call the actual IR-mutating
@@ -280,56 +286,66 @@ void registerMyExtension(::mlir::DialectRegistry &registry) {
After registering the extension, it becomes possible to use our new operation in the Transform dialect interpreter. The upstream testing pass can be used as is.
```mlir
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // Since the %arg2 handle is associated with both elementwise operations,
- // we need to split it into two handles so we can target only the second
- // elementwise operation.
- %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
- -> (!transform.any_op, !transform.any_op)
-
- // The actual tiling transformation takes tile sizes as attributes. It produces a
- // handle to the loop generated during tiling.
- %loop, %tiled = transform.structured.tile_using_forall %max tile_sizes [8, 32]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // We can now fuse the other operations into the loop. Here, we fuse
- // operations one-by-one. This requires the operation that is being fused
- // to define the value used within the loop, so the order of such fusions
- // is important. We could also use "transform.merge_handles" to obtain
- // a single handle to all operations and give it to `fuse_into_containing_op`
- // that would take care of the ordering in this case.
- %add_fused = transform.structured.fuse_into_containing_op %add into %loop
- : (!transform.any_op, !transform.any_op) -> !transform.any_op
- %matmul_fused = transform.structured.fuse_into_containing_op %arg1 into %loop
- : (!transform.op<"linalg.matmul">, !transform.any_op) -> !transform.any_op
-
- // Tile again to get the desired size. Note that this time this tiles the
- // "add" operation and fuses matmul into the loop, but doesn't affect the
- // "max" operation. This illustrates the precise targeting with the transform
- // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
- // of which having the same kind.
- %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused into %loop_2
- : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
- // Since outlining is currently only implemented for region-holding operations
- // such as loops, use tiling to size 1 to materialize the outer loop that is
- // going to be outlined.
- %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target
- : (!transform.any_op, !transform.any_op) -> !transform.any_op
- %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Rewrite the call target.
- transform.my.change_call_target %call, "microkernel" : !transform.any_op
-
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // Since the %arg2 handle is associated with both elementwise operations,
+ // we need to split it into two handles so we can target only the second
+ // elementwise operation.
+ %add, %max = transform.split_handle %arg2
+ : (!transform.op<"linalg.elemwise_binary">)
+ -> (!transform.any_op, !transform.any_op)
+
+ // The actual tiling transformation takes tile sizes as attributes. It
+ // produces a handle to the loop generated during tiling.
+ %loop, %tiled = transform.structured.tile_using_forall %max
+ tile_sizes [8, 32]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // We can now fuse the other operations into the loop. Here, we fuse
+ // operations one-by-one. This requires the operation that is being fused
+ // to define the value used within the loop, so the order of such fusions
+ // is important. We could also use "transform.merge_handles" to obtain
+ // a single handle to all operations and give it to
+ // `fuse_into_containing_op` that would take care of the ordering in this
+ // case.
+ %add_fused = transform.structured.fuse_into_containing_op %add into %loop
+ : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ %matmul_fused = transform.structured.fuse_into_containing_op %arg1
+ into %loop
+ : (!transform.op<"linalg.matmul">, !transform.any_op)
+ -> !transform.any_op
+
+ // Tile again to get the desired size. Note that this time this tiles the
+ // "add" operation and fuses matmul into the loop, but doesn't affect the
+ // "max" operation. This illustrates the precise targeting with the
+ // transform dialect. Otherwise, it is difficult to differentiate "add" and
+ // "max", both of which have the same kind.
+ %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused
+ tile_sizes [4, 4]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused
+ into %loop_2
+ : (!transform.any_op, !transform.any_op) -> !transform.any_op
+
+ // Since outlining is currently only implemented for region-holding
+ // operations such as loops, use tiling to size 1 to materialize the outer
+ // loop that is going to be outlined.
+ %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target
+ : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ %func, %call = transform.loop.outline %outline_target
+ {func_name = "outlined"}
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Rewrite the call target.
+ transform.my.change_call_target %call, "microkernel" : !transform.any_op
+
+ transform.yield
+ }
}
```
diff --git a/mlir/docs/Tutorials/transform/Ch3.md b/mlir/docs/Tutorials/transform/Ch3.md
index 84251df383d8..fa788d13e205 100644
--- a/mlir/docs/Tutorials/transform/Ch3.md
+++ b/mlir/docs/Tutorials/transform/Ch3.md
@@ -79,7 +79,7 @@ def CallOpInterfaceHandle
// The type must implement `TransformHandleTypeInterface`.
[DeclareTypeInterfaceMethods<TransformHandleTypeInterface>]> {
- // The usual components of a type such as description, mnemonic and assembly format
+ // The usual components of a type such as description, mnemonic and assembly format
// should be provided.
let summary = "handle to payload operations implementing CallOpInterface";
let mnemonic = "my.call_op_interface";
@@ -87,7 +87,7 @@ def CallOpInterfaceHandle
}
```
-We will omit the generation of declaration and definitions using Tablegen for brevity as it is identical to the regular case.
+We will omit the generation of declaration and definitions using Tablegen for brevity as it is identical to the regular case.
To finalize the definition of a transform type, one must implement the interface methods.
@@ -109,9 +109,9 @@ mlir::transform::CallOpInterfaceHandleType::checkPayload(
if (llvm::isa<mlir::CallOpInterface>(op))
continue;
- // By convention, these verifiers always emit a silenceable failure since they are
+ // By convention, these verifiers always emit a silenceable failure since they are
// checking a precondition.
- DiagnosedSilenceableFailure diag = emitSilenceableError(loc)
+ DiagnosedSilenceableFailure diag = emitSilenceableError(loc)
<< "expected the payload operation to implement CallOpInterface";
diag.attachNote(op->getLoc()) << "offending operation";
return diag;
@@ -129,8 +129,8 @@ Additional attributes and types need to be registered in the extension, next to
// In MyExtension.cpp.
void MyExtension::init() {
- // …
-
+ // ...
+
registerTypes<
#define GET_TYPEDEF_LIST
#include "MyExtensionTypes.cpp.inc"
diff --git a/mlir/docs/Tutorials/transform/Ch4.md b/mlir/docs/Tutorials/transform/Ch4.md
index 9c9aba19d574..ad5221c6f6cc 100644
--- a/mlir/docs/Tutorials/transform/Ch4.md
+++ b/mlir/docs/Tutorials/transform/Ch4.md
@@ -205,7 +205,7 @@ transform.named_sequence @__transform_main(
%root: !transform.any_op {transform.readonly}) {
// Collect groups of operations that match the criteria specified in the
// named sequence.
- %matmul, %el1, %el2 = transform.collect_matching @match_matmul_elemwise in %root
+ %matmul, %el1, %el2 = transform.collect_matching @match_matmul_elemwise in %root
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
%elemwise = transform.merge_handles %el1, %el2 : !transform.any_op
diff --git a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp
index 3a975313f93f..874ad78c7e83 100644
--- a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp
+++ b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp
@@ -12,6 +12,7 @@
#include "MyExtension.h"
+#include "mlir/Dialect/Transform/Transforms/Passes.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
@@ -20,14 +21,6 @@
#include "mlir/Transforms/Passes.h"
#include <cstdlib>
-// Forward declarations of test passes that used in this chapter for
-// illustrative purposes. Test passes are not directly exposed for use in
-// binaries other than mlir-opt, which is too big to serve as an example.
-namespace mlir::test {
-void registerTestTransformDialectEraseSchedulePass();
-void registerTestTransformDialectInterpreterPass();
-} // namespace mlir::test
-
namespace test {
void registerTestTransformDialectExtension(mlir::DialectRegistry &);
} // namespace test
@@ -39,22 +32,15 @@ int main(int argc, char **argv) {
mlir::registerAllExtensions(registry);
registerMyExtension(registry);
+ // Register transform interpreter pass.
+ mlir::transform::registerInterpreterPass();
+
// Register a handful of cleanup passes that we can run to make the output IR
// look nicer.
mlir::registerCanonicalizerPass();
mlir::registerCSEPass();
mlir::registerSymbolDCEPass();
- // Register the test passes.
-#ifdef MLIR_INCLUDE_TESTS
- mlir::test::registerTestTransformDialectEraseSchedulePass();
- mlir::test::registerTestTransformDialectInterpreterPass();
- test::registerTestTransformDialectExtension(registry);
-#else
- llvm::errs() << "warning: MLIR built without test passes, interpreter "
- "testing will not be available\n";
-#endif // MLIR_INCLUDE_TESTS
-
// Delegate to the MLIR utility for parsing and pass management.
return mlir::MlirOptMain(argc, argv, "transform-opt-ch2", registry)
.succeeded()
diff --git a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp
index 3c348c663aba..c9150c64a716 100644
--- a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp
+++ b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp
@@ -12,6 +12,7 @@
#include "MyExtension.h"
+#include "mlir/Dialect/Transform/Transforms/Passes.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
@@ -20,18 +21,6 @@
#include "mlir/Transforms/Passes.h"
#include <cstdlib>
-// Forward declarations of test passes that used in this chapter for
-// illustrative purposes. Test passes are not directly exposed for use in
-// binaries other than mlir-opt, which is too big to serve as an example.
-namespace mlir::test {
-void registerTestTransformDialectEraseSchedulePass();
-void registerTestTransformDialectInterpreterPass();
-} // namespace mlir::test
-
-namespace test {
-void registerTestTransformDialectExtension(mlir::DialectRegistry &);
-} // namespace test
-
int main(int argc, char **argv) {
// Register all "core" dialects and our transform dialect extension.
mlir::DialectRegistry registry;
@@ -39,22 +28,15 @@ int main(int argc, char **argv) {
mlir::registerAllExtensions(registry);
registerMyExtension(registry);
+ // Register the interpreter pass.
+ mlir::transform::registerInterpreterPass();
+
// Register a handful of cleanup passes that we can run to make the output IR
// look nicer.
mlir::registerCanonicalizerPass();
mlir::registerCSEPass();
mlir::registerSymbolDCEPass();
- // Register the test passes.
-#ifdef MLIR_INCLUDE_TESTS
- mlir::test::registerTestTransformDialectEraseSchedulePass();
- mlir::test::registerTestTransformDialectInterpreterPass();
- test::registerTestTransformDialectExtension(registry);
-#else
- llvm::errs() << "warning: MLIR built without test passes, interpreter "
- "testing will not be available\n";
-#endif // MLIR_INCLUDE_TESTS
-
// Delegate to the MLIR utility for parsing and pass management.
return mlir::MlirOptMain(argc, argv, "transform-opt-ch3", registry)
.succeeded()
diff --git a/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp
index 10190664b51c..03c84bdbccb8 100644
--- a/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp
+++ b/mlir/examples/transform/Ch4/transform-opt/transform-opt.cpp
@@ -21,10 +21,6 @@
#include "mlir/Transforms/Passes.h"
#include <cstdlib>
-namespace test {
-void registerTestTransformDialectExtension(mlir::DialectRegistry &);
-} // namespace test
-
int main(int argc, char **argv) {
// Register all "core" dialects and our transform dialect extension.
mlir::DialectRegistry registry;
@@ -39,14 +35,6 @@ int main(int argc, char **argv) {
mlir::registerSymbolDCEPass();
mlir::transform::registerInterpreterPass();
- // Register the test passes.
-#ifdef MLIR_INCLUDE_TESTS
- test::registerTestTransformDialectExtension(registry);
-#else
- llvm::errs() << "warning: MLIR built without test extension, interpreter "
- "testing will not be available\n";
-#endif // MLIR_INCLUDE_TESTS
-
// Delegate to the MLIR utility for parsing and pass management.
return mlir::MlirOptMain(argc, argv, "transform-opt-ch4", registry)
.succeeded()
diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h
index 42d8400cb5e9..d549f5dddc13 100644
--- a/mlir/include/mlir-c/Dialect/SparseTensor.h
+++ b/mlir/include/mlir-c/Dialect/SparseTensor.h
@@ -28,20 +28,20 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor);
typedef uint64_t MlirSparseTensorLevelType;
enum MlirBaseSparseTensorLevelType {
- MLIR_SPARSE_TENSOR_LEVEL_DENSE = 4, // 0b00001_00
- MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 8, // 0b00010_00
- MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU = 9, // 0b00010_01
- MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO = 10, // 0b00010_10
- MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO = 11, // 0b00010_11
- MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 16, // 0b00100_00
- MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU = 17, // 0b00100_01
- MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO = 18, // 0b00100_10
- MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO = 19, // 0b00100_11
- MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 32, // 0b01000_00
- MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU = 33, // 0b01000_01
- MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO = 34, // 0b01000_10
- MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO = 35, // 0b01000_11
- MLIR_SPARSE_TENSOR_LEVEL_TWO_OUT_OF_FOUR = 64, // 0b10000_00
+ MLIR_SPARSE_TENSOR_LEVEL_DENSE = 0x000000010000,
+ MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000020000,
+ MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU = 0x000000020001,
+ MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO = 0x000000020002,
+ MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO = 0x000000020003,
+ MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000040000,
+ MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU = 0x000000040001,
+ MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO = 0x000000040002,
+ MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO = 0x000000040003,
+ MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000080000,
+ MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU = 0x000000080001,
+ MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO = 0x000000080002,
+ MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO = 0x000000080003,
+ MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M = 0x000000100000,
};
//===----------------------------------------------------------------------===//
@@ -84,6 +84,16 @@ mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr);
MLIR_CAPI_EXPORTED int
mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr);
+MLIR_CAPI_EXPORTED unsigned
+mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType);
+
+MLIR_CAPI_EXPORTED unsigned
+mlirSparseTensorEncodingAttrGetStructuredM(MlirSparseTensorLevelType lvlType);
+
+MLIR_CAPI_EXPORTED MlirSparseTensorLevelType
+mlirSparseTensorEncodingAttrBuildLvlType(
+ enum MlirBaseSparseTensorLevelType lvlType, unsigned n, unsigned m);
+
#ifdef __cplusplus
}
#endif
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index 225e4d3194e2..edcfcfd830c4 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -67,7 +67,8 @@ def AffineApplyOp : Affine_Op<"apply", [Pure]> {
OpBuilder<(ins "ArrayRef<AffineExpr> ":$exprList,"ValueRange":$mapOperands),
[{
build($_builder, $_state, $_builder.getIndexType(),
- AffineMap::inferFromExprList(exprList).front(), mapOperands);
+ AffineMap::inferFromExprList(exprList, $_builder.getContext())
+ .front(), mapOperands);
}]>
];
diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
index e3f3d9e62e8f..f237f232487e 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
+++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
@@ -59,14 +59,15 @@ class ArmSVE_Op<string mnemonic, list<Trait> traits = []> :
class ArmSVE_IntrOp<string mnemonic,
list<Trait> traits = [],
list<int> overloadedOperands = [],
- list<int> overloadedResults = []> :
+ list<int> overloadedResults = [],
+ int numResults = 1> :
LLVM_IntrOpBase</*Dialect dialect=*/ArmSVE_Dialect,
/*string opName=*/"intr." # mnemonic,
/*string enumName=*/"aarch64_sve_" # !subst(".", "_", mnemonic),
/*list<int> overloadedResults=*/overloadedResults,
/*list<int> overloadedOperands=*/overloadedOperands,
/*list<Trait> traits=*/traits,
- /*int numResults=*/1>;
+ /*int numResults=*/numResults>;
class ArmSVE_IntrBinaryOverloadedOp<string mnemonic,
list<Trait> traits = []>:
@@ -410,4 +411,24 @@ def ConvertToSvboolIntrOp :
/*overloadedResults=*/[]>,
Arguments<(ins SVEPredicate:$mask)>;
+// Note: This multi-vector intrinsic requires SME2.
+def ZipX2IntrOp : ArmSVE_IntrOp<"zip.x2",
+ /*traits=*/[],
+ /*overloadedOperands=*/[0],
+ /*overloadedResults=*/[],
+ /*numResults=*/2>,
+ Arguments<(ins Arg<AnyScalableVector, "v1">:$v1,
+ Arg<AnyScalableVector, "v2">:$v2)>;
+
+// Note: This multi-vector intrinsic requires SME2.
+def ZipX4IntrOp : ArmSVE_IntrOp<"zip.x4",
+ /*traits=*/[],
+ /*overloadedOperands=*/[0],
+ /*overloadedResults=*/[],
+ /*numResults=*/4>,
+ Arguments<(ins Arg<AnyScalableVector, "v1">:$v1,
+ Arg<AnyScalableVector, "v2">:$v2,
+ Arg<AnyScalableVector, "v3">:$v3,
+ Arg<AnyScalableVector, "v4">:$v4)>;
+
#endif // ARMSVE_OPS
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 226a2fbd0856..d8cfeee24663 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -257,6 +257,9 @@ struct BufferizationOptions {
/// Parameters: Value, memory space, bufferization options
using UnknownTypeConverterFn = std::function<BaseMemRefType(
Value, Attribute memorySpace, const BufferizationOptions &)>;
+ // Produce a MemorySpace attribute from a tensor type
+ using DefaultMemorySpaceFn =
+ std::function<std::optional<Attribute>(TensorType t)>;
BufferizationOptions();
@@ -296,11 +299,6 @@ struct BufferizationOptions {
/// bufferized or not.
bool bufferizeFunctionBoundaries = false;
- /// The default memory space that should be used when it cannot be inferred
- /// from the context. If case of std::nullopt, bufferization fails when the
- /// memory space cannot be inferred at any point.
- std::optional<Attribute> defaultMemorySpace = Attribute();
-
/// Certain ops have aliasing OpOperand/OpResult invariants (e.g., scf.for).
/// If this flag is set to `false`, those invariants are no longer enforced
/// with buffer copies.
@@ -351,6 +349,13 @@ struct BufferizationOptions {
/// used.
UnknownTypeConverterFn unknownTypeConverterFn = nullptr;
+ // Use during type conversion to determine the memory space for memref based
+ // on the original tensor type if the memory space cannot be inferred.
+ // Returning std::nullopt will cause bufferization to fail (useful to indicate
+ // failure to determine memory space for a tensor type).
+ DefaultMemorySpaceFn defaultMemorySpaceFn =
+ [](TensorType t) -> std::optional<Attribute> { return Attribute(); };
+
/// Seed for the analysis fuzzer. If set to `0`, the fuzzer is deactivated.
/// Should be used only with `testAnalysisOnly = true`.
unsigned analysisFuzzerSeed = 0;
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index 39cc360cef41..c50fdf397a0f 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -122,6 +122,19 @@ def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", []> {
Variadic<AnyType>:$operands
);
let results = (outs Variadic<AnyType>);
+ let builders = [
+ OpBuilder<(ins
+ "::mlir::TypeRange":$resultTypes,
+ "::llvm::StringRef":$callee,
+ "::mlir::ValueRange":$operands,
+ CArg<"::mlir::ArrayAttr", "{}">:$args,
+ CArg<"::mlir::ArrayAttr", "{}">:$template_args), [{
+ build($_builder, $_state, resultTypes, callee, args, template_args,
+ operands);
+ }]
+ >
+ ];
+
let assemblyFormat = [{
$callee `(` $operands `)` attr-dict `:` functional-type($operands, results)
}];
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 37e525a139d4..5a75944af0a4 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -59,6 +59,19 @@ def NVVM_Dialect : Dialect {
/// Get the name of the attribute used to annotate max number of
/// registers that can be allocated per thread.
static StringRef getMaxnregAttrName() { return "nvvm.maxnreg"; }
+
+ /// Get the name of the attribute used to annotate kernel arguments that
+ /// are grid constants.
+ static StringRef getGridConstantAttrName() { return "nvvm.grid_constant"; }
+
+ /// Verify an attribute from this dialect on the argument at 'argIndex' for
+ /// the region at 'regionIndex' on the given operation. Returns failure if
+ /// the verification failed, success otherwise. This hook may optionally be
+ /// invoked from any operation containing a region.
+ LogicalResult verifyRegionArgAttribute(Operation *op,
+ unsigned regionIndex,
+ unsigned argIndex,
+ NamedAttribute argAttr) override;
}];
let useDefaultAttributePrinterParser = 1;
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 638e46a2f9c7..962c159e68a2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -192,6 +192,22 @@ def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
let assemblyFormat = "attr-dict";
}
+def ROCDL_SetPrioOp : ROCDL_IntrOp<"s.setprio", [], [], [], 0>,
+ Arguments<(ins I16Attr:$priority)> {
+ let results = (outs);
+ let assemblyFormat = "$priority attr-dict";
+ string llvmBuilder =
+ "createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_setprio,builder.getInt16(op.getPriority()));";
+}
+
+def ROCDL_SchedBarrier : ROCDL_IntrOp<"sched.barrier", [], [], [], 0>,
+ Arguments<(ins I32Attr:$mask)> {
+ let results = (outs);
+ let assemblyFormat = "$mask attr-dict";
+ string llvmBuilder =
+ "createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_sched_barrier,builder.getInt32(op.getMask()));";
+}
+
//===---------------------------------------------------------------------===//
// Xdlops intrinsics
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 751edd022883..272bc3116c5f 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -531,6 +531,7 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [
let hasCustomAssemblyFormat = 1;
let hasVerifier = 1;
+ let hasCanonicalizer = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
index 921c3c3e8c7d..186e83a57580 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -10,10 +10,8 @@
#define MLIR_DIALECT_LINALG_TRANSFORMS_HOISTING_H_
namespace mlir {
+class Operation;
class RewriterBase;
-namespace func {
-class FuncOp;
-} // namespace func
namespace scf {
class ForOp;
} // namespace scf
@@ -43,7 +41,7 @@ namespace linalg {
///
/// WARNING: This hoisting does not model parallelism and is generally incorrect
/// when used on distributed loops with memref semantics!
-void hoistRedundantVectorTransfers(func::FuncOp func);
+void hoistRedundantVectorTransfers(Operation *root);
} // namespace linalg
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index a0c0d4cfd871..dda8f31e688f 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -609,14 +609,16 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
phase. Suspended thread resumes execution when the specified phase completes
OR before the phase completes following a system-dependent time limit.
+ The `$phaseParity` specifies either even phase (0) or odd phase (1) to
+ wait.
+
Example:
```mlir
- nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
```
-
}];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$phase, Index:$ticks, Index:$mbarId);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phase `,` $ticks attr-dict `:` type($barriers)";
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
}
def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index ca3635054857..c7a32de256e2 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -134,6 +134,97 @@ def DeclareTargetAttr : OpenMP_Attr<"DeclareTarget", "declaretarget"> {
}
//===----------------------------------------------------------------------===//
+// 2.19.4 Data-Sharing Attribute Clauses
+//===----------------------------------------------------------------------===//
+
+def DataSharingTypePrivate : I32EnumAttrCase<"Private", 0, "private">;
+def DataSharingTypeFirstPrivate : I32EnumAttrCase<"FirstPrivate", 1, "firstprivate">;
+
+def DataSharingClauseType : I32EnumAttr<
+ "DataSharingClauseType",
+ "Type of a data-sharing clause",
+ [DataSharingTypePrivate, DataSharingTypeFirstPrivate]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::omp";
+}
+
+def DataSharingClauseTypeAttr : EnumAttr<
+ OpenMP_Dialect, DataSharingClauseType, "data_sharing_type"> {
+ let assemblyFormat = "`{` `type` `=` $value `}`";
+}
+
+def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove]> {
+ let summary = "Provides declaration of [first]private logic.";
+ let description = [{
+ This operation provides a declaration of how to implement the
+ [first]privatization of a variable. The dialect users should provide
+ information about how to create an instance of the type in the alloc region
+ and how to initialize the copy from the original item in the copy region.
+
+ Examples:
+ ---------
+ * `private(x)` would be emitted as:
+ ```mlir
+ omp.private {type = private} @x.privatizer : !fir.ref<i32> alloc {
+ ^bb0(%arg0: !fir.ref<i32>):
+ %0 = ... allocate proper memory for the private clone ...
+ omp.yield(%0 : !fir.ref<i32>)
+ }
+ ```
+
+ * `firstprivate(x)` would be emitted as:
+ ```mlir
+ omp.private {type = firstprivate} @x.privatizer : !fir.ref<i32> alloc {
+ ^bb0(%arg0: !fir.ref<i32>):
+ %0 = ... allocate proper memory for the private clone ...
+ omp.yield(%0 : !fir.ref<i32>)
+ } copy {
+ ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+ // %arg0 is the original host variable. Same as for `alloc`.
+ // %arg1 represents the memory allocated in `alloc`.
+ ... copy from host to the privatized clone ....
+ omp.yield(%arg1 : !fir.ref<i32>)
+ }
+ ```
+
+ There are no restrictions on the body except for:
+ - The `alloc` region has a single argument.
+ - The `copy` region has 2 arguments.
+ - Both regions are terminated by `omp.yield` ops.
+ The above restrictions and other obvious restrictions (e.g. verifying the
+ type of yielded values) are verified by the custom op verifier. The actual
+ contents of the blocks inside both regions are not verified.
+
+ Instances of this op would then be used by ops that model directives that
+ accept data-sharing attribute clauses.
+
+ The $sym_name attribute provides a symbol by which the privatizer op can be
+ referenced by other dialect ops.
+
+ The $type attribute is the type of the value being privatized.
+
+ The $data_sharing_type attribute specifies whether the privatizer corresponds
+ to a `private` or a `firstprivate` clause.
+ }];
+
+ let arguments = (ins SymbolNameAttr:$sym_name,
+ TypeAttrOf<AnyType>:$type,
+ DataSharingClauseTypeAttr:$data_sharing_type);
+
+ let regions = (region MinSizedRegion<1>:$alloc_region,
+ AnyRegion:$copy_region);
+
+ let assemblyFormat = [{
+ $data_sharing_type $sym_name `:` $type
+ `alloc` $alloc_region
+ (`copy` $copy_region^)?
+ attr-dict
+ }];
+
+ let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
// 2.6 parallel Construct
//===----------------------------------------------------------------------===//
@@ -191,11 +282,8 @@ def ParallelOp : OpenMP_Op<"parallel", [
unsigned getNumReductionVars() { return getReductionVars().size(); }
}];
let assemblyFormat = [{
- oilist( `reduction` `(`
- custom<ReductionVarList>(
- $reduction_vars, type($reduction_vars), $reductions
- ) `)`
- | `if` `(` $if_expr_var `:` type($if_expr_var) `)`
+ oilist(
+ `if` `(` $if_expr_var `:` type($if_expr_var) `)`
| `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`
| `allocate` `(`
custom<AllocateAndAllocator>(
@@ -203,7 +291,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
$allocators_vars, type($allocators_vars)
) `)`
| `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
- ) $region attr-dict
+ ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars), $reductions) attr-dict
}];
let hasVerifier = 1;
}
@@ -612,7 +700,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments,
def YieldOp : OpenMP_Op<"yield",
[Pure, ReturnLike, Terminator,
ParentOneOf<["WsLoopOp", "ReductionDeclareOp",
- "AtomicUpdateOp", "SimdLoopOp"]>]> {
+ "AtomicUpdateOp", "SimdLoopOp", "PrivateClauseOp"]>]> {
let summary = "loop yield and termination operation";
let description = [{
"omp.yield" yields SSA values from the OpenMP dialect op region and
@@ -693,7 +781,7 @@ def ClauseTaskDependInOut : I32EnumAttrCase<"taskdependinout", 2>;
def ClauseTaskDepend : I32EnumAttr<
"ClauseTaskDepend",
- "task depend clause",
+ "depend clause in a target or task construct",
[ClauseTaskDependIn, ClauseTaskDependOut, ClauseTaskDependInOut]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::omp";
@@ -1359,11 +1447,17 @@ def Target_EnterDataOp: OpenMP_Op<"target_enter_data",
The $map_types specifies the types and modifiers for the map clause.
- TODO: depend clause and map_type_modifier values iterator and mapper.
+ The `depends` and `depend_vars` arguments are variadic lists of values
+ that specify the dependencies of this particular target task in relation to
+ other tasks.
+
+ TODO: map_type_modifier values iterator and mapper.
}];
let arguments = (ins Optional<I1>:$if_expr,
Optional<AnyInteger>:$device,
+ OptionalAttr<TaskDependArrayAttr>:$depends,
+ Variadic<OpenMP_PointerLikeType>:$depend_vars,
UnitAttr:$nowait,
Variadic<AnyType>:$map_operands);
@@ -1372,6 +1466,7 @@ def Target_EnterDataOp: OpenMP_Op<"target_enter_data",
| `device` `(` $device `:` type($device) `)`
| `nowait` $nowait
| `map_entries` `(` $map_operands `:` type($map_operands) `)`
+ | `depend` `(` custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
) attr-dict
}];
@@ -1406,11 +1501,17 @@ def Target_ExitDataOp: OpenMP_Op<"target_exit_data",
The $map_types specifies the types and modifiers for the map clause.
- TODO: depend clause and map_type_modifier values iterator and mapper.
+ The `depends` and `depend_vars` arguments are variadic lists of values
+ that specify the dependencies of this particular target task in relation to
+ other tasks.
+
+ TODO: map_type_modifier values iterator and mapper.
}];
let arguments = (ins Optional<I1>:$if_expr,
Optional<AnyInteger>:$device,
+ OptionalAttr<TaskDependArrayAttr>:$depends,
+ Variadic<OpenMP_PointerLikeType>:$depend_vars,
UnitAttr:$nowait,
Variadic<AnyType>:$map_operands);
@@ -1419,6 +1520,7 @@ def Target_ExitDataOp: OpenMP_Op<"target_exit_data",
| `device` `(` $device `:` type($device) `)`
| `nowait` $nowait
| `map_entries` `(` $map_operands `:` type($map_operands) `)`
+ | `depend` `(` custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
) attr-dict
}];
@@ -1457,11 +1559,16 @@ def Target_UpdateDataOp: OpenMP_Op<"target_update_data",
during verification to make sure the restrictions for target update are
respected.
- TODO: depend clause
+ The `depends` and `depend_vars` arguments are variadic lists of values
+ that specify the dependencies of this particular target task in relation to
+ other tasks.
+
}];
let arguments = (ins Optional<I1>:$if_expr,
Optional<AnyInteger>:$device,
+ OptionalAttr<TaskDependArrayAttr>:$depends,
+ Variadic<OpenMP_PointerLikeType>:$depend_vars,
UnitAttr:$nowait,
Variadic<OpenMP_PointerLikeType>:$map_operands);
@@ -1470,6 +1577,7 @@ def Target_UpdateDataOp: OpenMP_Op<"target_update_data",
| `device` `(` $device `:` type($device) `)`
| `nowait` $nowait
| `motion_entries` `(` $map_operands `:` type($map_operands) `)`
+ | `depend` `(` custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
) attr-dict
}];
@@ -1499,13 +1607,19 @@ def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, MapClauseOwningOpInterface
The optional $nowait elliminates the implicit barrier so the parent task can make progress
even if the target task is not yet completed.
- TODO: is_device_ptr, depend, defaultmap, in_reduction
+ The `depends` and `depend_vars` arguments are variadic lists of values
+ that specify the dependencies of this particular target task in relation to
+ other tasks.
+
+ TODO: is_device_ptr, defaultmap, in_reduction
}];
let arguments = (ins Optional<I1>:$if_expr,
Optional<AnyInteger>:$device,
Optional<AnyInteger>:$thread_limit,
+ OptionalAttr<TaskDependArrayAttr>:$depends,
+ Variadic<OpenMP_PointerLikeType>:$depend_vars,
UnitAttr:$nowait,
Variadic<AnyType>:$map_operands);
@@ -1517,6 +1631,7 @@ def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, MapClauseOwningOpInterface
| `thread_limit` `(` $thread_limit `:` type($thread_limit) `)`
| `nowait` $nowait
| `map_entries` `(` custom<MapEntries>($map_operands, type($map_operands)) `)`
+ | `depend` `(` custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
) $region attr-dict
}];
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
index e91f9e4469ab..690cd146c606 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
@@ -30,6 +30,7 @@ namespace scf {
class IfOp;
class ForOp;
class ParallelOp;
+class WhileOp;
/// Fuses all adjacent scf.parallel operations with identical bounds and step
/// into one scf.parallel operations. Uses a naive aliasing and dependency
@@ -181,6 +182,46 @@ FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
const PipeliningOption &options,
bool *modifiedIR = nullptr);
+/// Create zero-trip-check around a `while` op and return the new loop op in the
+/// check. The while loop is rotated to avoid evaluating the condition twice.
+///
+/// By default the check won't be created for a do-while loop as it is not
+/// required. `forceCreateCheck` can force the creation.
+///
+/// It turns:
+///
+/// scf.while (%arg0 = %init) : (i32) -> i64 {
+/// %val = .., %arg0 : i64
+/// %cond = arith.cmpi .., %arg0 : i32
+/// scf.condition(%cond) %val : i64
+/// } do {
+/// ^bb0(%arg1: i64):
+/// %next = .., %arg1 : i32
+/// scf.yield %next : i32
+/// }
+///
+/// into:
+///
+/// %pre_val = .., %init : i64
+/// %pre_cond = arith.cmpi .., %init : i32
+/// scf.if %pre_cond -> i64 {
+/// %res = scf.while (%arg1 = %pre_val) : (i64) -> i64 {
+/// %next = .., %arg1 : i32
+/// %val = .., %next : i64
+/// %cond = arith.cmpi .., %next : i32
+/// scf.condition(%cond) %val : i64
+/// } do {
+/// ^bb0(%arg2: i64):
+/// scf.yield %arg2 : i32
+/// }
+/// scf.yield %res : i64
+/// } else {
+/// scf.yield %pre_val : i64
+/// }
+FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp,
+ RewriterBase &rewriter,
+ bool forceCreateCheck = false);
+
} // namespace scf
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 86c52bfc651e..e940d203be9e 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -154,9 +154,10 @@ enum class Action : uint32_t {
/// This enum defines all the sparse representations supportable by
/// the SparseTensor dialect. We use a lightweight encoding to encode
-/// both the "format" per se (dense, compressed, singleton, loose_compressed,
-/// two-out-of-four) as well as the "properties" (ordered, unique). The
-/// encoding is chosen for performance of the runtime library, and thus may
+/// the "format" per se (dense, compressed, singleton, loose_compressed,
+/// n-out-of-m), the "properties" (ordered, unique) as well as n and m when
+/// the format is NOutOfM.
+/// The encoding is chosen for performance of the runtime library, and thus may
/// change in future versions; consequently, client code should use the
/// predicate functions defined below, rather than relying on knowledge
/// about the particular binary encoding.
@@ -165,41 +166,75 @@ enum class Action : uint32_t {
/// where we need to store an undefined or indeterminate `LevelType`.
/// It should not be used externally, since it does not indicate an
/// actual/representable format.
+///
+/// Bit manipulations for LevelType:
+///
+/// | 8-bit n | 8-bit m | 16-bit LevelFormat | 16-bit LevelProperty |
+///
enum class LevelType : uint64_t {
- Undef = 0, // 0b00000_00
- Dense = 4, // 0b00001_00
- Compressed = 8, // 0b00010_00
- CompressedNu = 9, // 0b00010_01
- CompressedNo = 10, // 0b00010_10
- CompressedNuNo = 11, // 0b00010_11
- Singleton = 16, // 0b00100_00
- SingletonNu = 17, // 0b00100_01
- SingletonNo = 18, // 0b00100_10
- SingletonNuNo = 19, // 0b00100_11
- LooseCompressed = 32, // 0b01000_00
- LooseCompressedNu = 33, // 0b01000_01
- LooseCompressedNo = 34, // 0b01000_10
- LooseCompressedNuNo = 35, // 0b01000_11
- TwoOutOfFour = 64, // 0b10000_00
+ Undef = 0x000000000000,
+ Dense = 0x000000010000,
+ Compressed = 0x000000020000,
+ CompressedNu = 0x000000020001,
+ CompressedNo = 0x000000020002,
+ CompressedNuNo = 0x000000020003,
+ Singleton = 0x000000040000,
+ SingletonNu = 0x000000040001,
+ SingletonNo = 0x000000040002,
+ SingletonNuNo = 0x000000040003,
+ LooseCompressed = 0x000000080000,
+ LooseCompressedNu = 0x000000080001,
+ LooseCompressedNo = 0x000000080002,
+ LooseCompressedNuNo = 0x000000080003,
+ NOutOfM = 0x000000100000,
};
/// This enum defines all supported storage format without the level properties.
enum class LevelFormat : uint64_t {
- Dense = 4, // 0b00001_00
- Compressed = 8, // 0b00010_00
- Singleton = 16, // 0b00100_00
- LooseCompressed = 32, // 0b01000_00
- TwoOutOfFour = 64, // 0b10000_00
+ Dense = 0x00010000,
+ Compressed = 0x00020000,
+ Singleton = 0x00040000,
+ LooseCompressed = 0x00080000,
+ NOutOfM = 0x00100000,
};
/// This enum defines all the nondefault properties for storage formats.
enum class LevelPropertyNondefault : uint64_t {
- Nonunique = 1, // 0b00000_01
- Nonordered = 2, // 0b00000_10
+ Nonunique = 0x0001,
+ Nonordered = 0x0002,
};
+/// Get N of NOutOfM level type.
+constexpr uint64_t getN(LevelType lt) {
+ return (static_cast<uint64_t>(lt) >> 32) & 0xff;
+}
+
+/// Get M of NOutOfM level type.
+constexpr uint64_t getM(LevelType lt) {
+ return (static_cast<uint64_t>(lt) >> 40) & 0xff;
+}
+
+/// Convert N of NOutOfM level type to the stored bits.
+constexpr uint64_t nToBits(uint64_t n) { return n << 32; }
+
+/// Convert M of NOutOfM level type to the stored bits.
+constexpr uint64_t mToBits(uint64_t m) { return m << 40; }
+
+/// Check if the `LevelType` is NOutOfM (regardless of
+/// properties and block sizes).
+constexpr bool isNOutOfMLT(LevelType lt) {
+ return ((static_cast<uint64_t>(lt) & 0x100000) ==
+ static_cast<uint64_t>(LevelType::NOutOfM));
+}
+
+/// Check if the `LevelType` is NOutOfM with the correct block sizes.
+constexpr bool isValidNOutOfMLT(LevelType lt, uint64_t n, uint64_t m) {
+ return isNOutOfMLT(lt) && getN(lt) == n && getM(lt) == m;
+}
+
/// Returns string representation of the given dimension level type.
-constexpr const char *toMLIRString(LevelType lt) {
+constexpr const char *toMLIRString(LevelType lvlType) {
+ auto lt = static_cast<LevelType>(static_cast<uint64_t>(lvlType) & 0xffffffff);
switch (lt) {
case LevelType::Undef:
return "undef";
@@ -229,21 +264,22 @@ constexpr const char *toMLIRString(LevelType lt) {
return "loose_compressed(nonordered)";
case LevelType::LooseCompressedNuNo:
return "loose_compressed(nonunique, nonordered)";
- case LevelType::TwoOutOfFour:
- return "block2_4";
+ case LevelType::NOutOfM:
+ return "structured";
}
return "";
}
/// Check that the `LevelType` contains a valid (possibly undefined) value.
constexpr bool isValidLT(LevelType lt) {
- const uint64_t formatBits = static_cast<uint64_t>(lt) >> 2;
- const uint64_t propertyBits = static_cast<uint64_t>(lt) & 3;
- // If undefined or dense, then must be unique and ordered.
+ const uint64_t formatBits = static_cast<uint64_t>(lt) & 0xffff0000;
+ const uint64_t propertyBits = static_cast<uint64_t>(lt) & 0xffff;
+ // If undefined/dense/NOutOfM, then must be unique and ordered.
// Otherwise, the format must be one of the known ones.
- return (formatBits <= 1 || formatBits == 16)
+ return (formatBits <= 0x10000 || formatBits == 0x100000)
? (propertyBits == 0)
- : (formatBits == 2 || formatBits == 4 || formatBits == 8);
+ : (formatBits == 0x20000 || formatBits == 0x40000 ||
+ formatBits == 0x80000);
}
/// Check if the `LevelType` is the special undefined value.
@@ -251,34 +287,28 @@ constexpr bool isUndefLT(LevelType lt) { return lt == LevelType::Undef; }
/// Check if the `LevelType` is dense (regardless of properties).
constexpr bool isDenseLT(LevelType lt) {
- return (static_cast<uint64_t>(lt) & ~3) ==
+ return (static_cast<uint64_t>(lt) & ~0xffff) ==
static_cast<uint64_t>(LevelType::Dense);
}
/// Check if the `LevelType` is compressed (regardless of properties).
constexpr bool isCompressedLT(LevelType lt) {
- return (static_cast<uint64_t>(lt) & ~3) ==
+ return (static_cast<uint64_t>(lt) & ~0xffff) ==
static_cast<uint64_t>(LevelType::Compressed);
}
/// Check if the `LevelType` is singleton (regardless of properties).
constexpr bool isSingletonLT(LevelType lt) {
- return (static_cast<uint64_t>(lt) & ~3) ==
+ return (static_cast<uint64_t>(lt) & ~0xffff) ==
static_cast<uint64_t>(LevelType::Singleton);
}
/// Check if the `LevelType` is loose compressed (regardless of properties).
constexpr bool isLooseCompressedLT(LevelType lt) {
- return (static_cast<uint64_t>(lt) & ~3) ==
+ return (static_cast<uint64_t>(lt) & ~0xffff) ==
static_cast<uint64_t>(LevelType::LooseCompressed);
}
-/// Check if the `LevelType` is 2OutOf4 (regardless of properties).
-constexpr bool is2OutOf4LT(LevelType lt) {
- return (static_cast<uint64_t>(lt) & ~3) ==
- static_cast<uint64_t>(LevelType::TwoOutOfFour);
-}
-
/// Check if the `LevelType` needs positions array.
constexpr bool isWithPosLT(LevelType lt) {
return isCompressedLT(lt) || isLooseCompressedLT(lt);
@@ -287,17 +317,19 @@ constexpr bool isWithPosLT(LevelType lt) {
/// Check if the `LevelType` needs coordinates array.
constexpr bool isWithCrdLT(LevelType lt) {
return isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) ||
- is2OutOf4LT(lt);
+ isNOutOfMLT(lt);
}
/// Check if the `LevelType` is ordered (regardless of storage format).
constexpr bool isOrderedLT(LevelType lt) {
 return !(static_cast<uint64_t>(lt) & 2);
}
/// Check if the `LevelType` is unique (regardless of storage format).
constexpr bool isUniqueLT(LevelType lt) {
 return !(static_cast<uint64_t>(lt) & 1);
}
/// Convert a LevelType to its corresponding LevelFormat.
@@ -305,21 +337,25 @@ constexpr bool isUniqueLT(LevelType lt) {
constexpr std::optional<LevelFormat> getLevelFormat(LevelType lt) {
if (lt == LevelType::Undef)
return std::nullopt;
- return static_cast<LevelFormat>(static_cast<uint64_t>(lt) & ~3);
+ return static_cast<LevelFormat>(static_cast<uint64_t>(lt) & 0xffff0000);
}
/// Convert a LevelFormat to its corresponding LevelType with the given
/// properties. Returns std::nullopt when the properties are not applicable
/// for the input level format.
constexpr std::optional<LevelType> buildLevelType(LevelFormat lf, bool ordered,
- bool unique) {
- auto lt = static_cast<LevelType>(static_cast<uint64_t>(lf) |
- (ordered ? 0 : 2) | (unique ? 0 : 1));
+ bool unique, uint64_t n = 0,
+ uint64_t m = 0) {
+ uint64_t newN = n << 32;
+ uint64_t newM = m << 40;
+ auto lt =
+ static_cast<LevelType>(static_cast<uint64_t>(lf) | (ordered ? 0 : 2) |
+ (unique ? 0 : 1) | newN | newM);
return isValidLT(lt) ? std::optional(lt) : std::nullopt;
}
//
-// Ensure the above methods work as indended.
+// Ensure the above methods work as intended.
//
static_assert(
@@ -341,7 +377,7 @@ static_assert(
LevelFormat::LooseCompressed &&
*getLevelFormat(LevelType::LooseCompressedNuNo) ==
LevelFormat::LooseCompressed &&
- *getLevelFormat(LevelType::TwoOutOfFour) == LevelFormat::TwoOutOfFour),
+ *getLevelFormat(LevelType::NOutOfM) == LevelFormat::NOutOfM),
"getLevelFormat conversion is broken");
static_assert(
@@ -373,14 +409,29 @@ static_assert(
LevelType::LooseCompressedNo &&
*buildLevelType(LevelFormat::LooseCompressed, false, false) ==
LevelType::LooseCompressedNuNo &&
- buildLevelType(LevelFormat::TwoOutOfFour, false, true) == std::nullopt &&
- buildLevelType(LevelFormat::TwoOutOfFour, true, false) == std::nullopt &&
- buildLevelType(LevelFormat::TwoOutOfFour, false, false) == std::nullopt &&
- *buildLevelType(LevelFormat::TwoOutOfFour, true, true) ==
- LevelType::TwoOutOfFour),
+ buildLevelType(LevelFormat::NOutOfM, false, true) == std::nullopt &&
+ buildLevelType(LevelFormat::NOutOfM, true, false) == std::nullopt &&
+ buildLevelType(LevelFormat::NOutOfM, false, false) == std::nullopt &&
+ *buildLevelType(LevelFormat::NOutOfM, true, true) == LevelType::NOutOfM),
"buildLevelType conversion is broken");
static_assert(
+ (getN(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4)) == 2 &&
+ getM(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4)) == 4 &&
+ getN(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10)) == 8 &&
+ getM(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10)) == 10),
+ "getN/M conversion is broken");
+
+static_assert(
+ (isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 2, 4),
+ 2, 4) &&
+ isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 8, 10),
+ 8, 10) &&
+ !isValidNOutOfMLT(*buildLevelType(LevelFormat::NOutOfM, true, true, 3, 4),
+ 2, 4)),
+ "isValidNOutOfMLT definition is broken");
+
+static_assert(
(isValidLT(LevelType::Undef) && isValidLT(LevelType::Dense) &&
isValidLT(LevelType::Compressed) && isValidLT(LevelType::CompressedNu) &&
isValidLT(LevelType::CompressedNo) &&
@@ -391,7 +442,7 @@ static_assert(
isValidLT(LevelType::LooseCompressedNu) &&
isValidLT(LevelType::LooseCompressedNo) &&
isValidLT(LevelType::LooseCompressedNuNo) &&
- isValidLT(LevelType::TwoOutOfFour)),
+ isValidLT(LevelType::NOutOfM)),
"isValidLT definition is broken");
static_assert((isDenseLT(LevelType::Dense) &&
@@ -407,7 +458,7 @@ static_assert((isDenseLT(LevelType::Dense) &&
!isDenseLT(LevelType::LooseCompressedNu) &&
!isDenseLT(LevelType::LooseCompressedNo) &&
!isDenseLT(LevelType::LooseCompressedNuNo) &&
- !isDenseLT(LevelType::TwoOutOfFour)),
+ !isDenseLT(LevelType::NOutOfM)),
"isDenseLT definition is broken");
static_assert((!isCompressedLT(LevelType::Dense) &&
@@ -423,7 +474,7 @@ static_assert((!isCompressedLT(LevelType::Dense) &&
!isCompressedLT(LevelType::LooseCompressedNu) &&
!isCompressedLT(LevelType::LooseCompressedNo) &&
!isCompressedLT(LevelType::LooseCompressedNuNo) &&
- !isCompressedLT(LevelType::TwoOutOfFour)),
+ !isCompressedLT(LevelType::NOutOfM)),
"isCompressedLT definition is broken");
static_assert((!isSingletonLT(LevelType::Dense) &&
@@ -439,7 +490,7 @@ static_assert((!isSingletonLT(LevelType::Dense) &&
!isSingletonLT(LevelType::LooseCompressedNu) &&
!isSingletonLT(LevelType::LooseCompressedNo) &&
!isSingletonLT(LevelType::LooseCompressedNuNo) &&
- !isSingletonLT(LevelType::TwoOutOfFour)),
+ !isSingletonLT(LevelType::NOutOfM)),
"isSingletonLT definition is broken");
static_assert((!isLooseCompressedLT(LevelType::Dense) &&
@@ -455,24 +506,24 @@ static_assert((!isLooseCompressedLT(LevelType::Dense) &&
isLooseCompressedLT(LevelType::LooseCompressedNu) &&
isLooseCompressedLT(LevelType::LooseCompressedNo) &&
isLooseCompressedLT(LevelType::LooseCompressedNuNo) &&
- !isLooseCompressedLT(LevelType::TwoOutOfFour)),
+ !isLooseCompressedLT(LevelType::NOutOfM)),
"isLooseCompressedLT definition is broken");
-static_assert((!is2OutOf4LT(LevelType::Dense) &&
- !is2OutOf4LT(LevelType::Compressed) &&
- !is2OutOf4LT(LevelType::CompressedNu) &&
- !is2OutOf4LT(LevelType::CompressedNo) &&
- !is2OutOf4LT(LevelType::CompressedNuNo) &&
- !is2OutOf4LT(LevelType::Singleton) &&
- !is2OutOf4LT(LevelType::SingletonNu) &&
- !is2OutOf4LT(LevelType::SingletonNo) &&
- !is2OutOf4LT(LevelType::SingletonNuNo) &&
- !is2OutOf4LT(LevelType::LooseCompressed) &&
- !is2OutOf4LT(LevelType::LooseCompressedNu) &&
- !is2OutOf4LT(LevelType::LooseCompressedNo) &&
- !is2OutOf4LT(LevelType::LooseCompressedNuNo) &&
- is2OutOf4LT(LevelType::TwoOutOfFour)),
- "is2OutOf4LT definition is broken");
+static_assert((!isNOutOfMLT(LevelType::Dense) &&
+ !isNOutOfMLT(LevelType::Compressed) &&
+ !isNOutOfMLT(LevelType::CompressedNu) &&
+ !isNOutOfMLT(LevelType::CompressedNo) &&
+ !isNOutOfMLT(LevelType::CompressedNuNo) &&
+ !isNOutOfMLT(LevelType::Singleton) &&
+ !isNOutOfMLT(LevelType::SingletonNu) &&
+ !isNOutOfMLT(LevelType::SingletonNo) &&
+ !isNOutOfMLT(LevelType::SingletonNuNo) &&
+ !isNOutOfMLT(LevelType::LooseCompressed) &&
+ !isNOutOfMLT(LevelType::LooseCompressedNu) &&
+ !isNOutOfMLT(LevelType::LooseCompressedNo) &&
+ !isNOutOfMLT(LevelType::LooseCompressedNuNo) &&
+ isNOutOfMLT(LevelType::NOutOfM)),
+ "isNOutOfMLT definition is broken");
static_assert((isOrderedLT(LevelType::Dense) &&
isOrderedLT(LevelType::Compressed) &&
@@ -487,7 +538,7 @@ static_assert((isOrderedLT(LevelType::Dense) &&
isOrderedLT(LevelType::LooseCompressedNu) &&
!isOrderedLT(LevelType::LooseCompressedNo) &&
!isOrderedLT(LevelType::LooseCompressedNuNo) &&
- isOrderedLT(LevelType::TwoOutOfFour)),
+ isOrderedLT(LevelType::NOutOfM)),
"isOrderedLT definition is broken");
static_assert((isUniqueLT(LevelType::Dense) &&
@@ -503,7 +554,7 @@ static_assert((isUniqueLT(LevelType::Dense) &&
!isUniqueLT(LevelType::LooseCompressedNu) &&
isUniqueLT(LevelType::LooseCompressedNo) &&
!isUniqueLT(LevelType::LooseCompressedNuNo) &&
- isUniqueLT(LevelType::TwoOutOfFour)),
+ isUniqueLT(LevelType::NOutOfM)),
"isUniqueLT definition is broken");
/// Bit manipulations for affine encoding.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index 12c1068ae1f5..5b3b971f9a7f 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -145,7 +145,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
- **compressed** : only nonzeros along this level are stored
- **loose_compressed** : as compressed, but allows for free space between regions
- **singleton** : a variant of the compressed format, where coordinates have no siblings
- - **block2_4** : the compression uses a 2:4 encoding per 1x4 block
+ - **structured[n, m]** : the compression uses a n:m encoding
+ (viz. n out of m consecutive elements are nonzero)
For a compressed level, each position interval is represented in a compact
way with a lowerbound `pos(i)` and an upperbound `pos(i+1) - 1`, which implies
@@ -374,7 +375,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
bool isCompressedLvl(::mlir::sparse_tensor::Level l) const { return isCompressedLT(getLvlType(l)); }
bool isSingletonLvl(::mlir::sparse_tensor::Level l) const { return isSingletonLT(getLvlType(l)); }
bool isLooseCompressedLvl(::mlir::sparse_tensor::Level l) const { return isLooseCompressedLT(getLvlType(l)); }
- bool isTwoOutOfFourLvl(::mlir::sparse_tensor::Level l) const { return is2OutOf4LT(getLvlType(l)); }
+ bool isNOutOfMLvl(::mlir::sparse_tensor::Level l) const { return isNOutOfMLT(getLvlType(l)); }
bool isOrderedLvl(::mlir::sparse_tensor::Level l) const { return isOrderedLT(getLvlType(l)); }
bool isUniqueLvl(::mlir::sparse_tensor::Level l) const { return isUniqueLT(getLvlType(l)); }
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
index 4c98129744bc..4e2b85d35c1a 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
@@ -291,7 +291,7 @@ public:
return isLooseCompressedLT(getLvlType(l));
}
bool isSingletonLvl(Level l) const { return isSingletonLT(getLvlType(l)); }
- bool is2OutOf4Lvl(Level l) const { return is2OutOf4LT(getLvlType(l)); }
+ bool isNOutOfMLvl(Level l) const { return isNOutOfMLT(getLvlType(l)); }
bool isOrderedLvl(Level l) const { return isOrderedLT(getLvlType(l)); }
bool isUniqueLvl(Level l) const { return isUniqueLT(getLvlType(l)); }
bool isWithPos(Level l) const { return isWithPosLT(getLvlType(l)); }
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 4a34bb2e003e..490ef3071af1 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -510,7 +510,7 @@ public:
if (isLvlWithNonTrivialIdxExp(b)) {
auto lt = getLoopDependentLevelType(b);
return isCompressedLT(lt) || isSingletonLT(lt) ||
- isLooseCompressedLT(lt) || is2OutOf4LT(lt);
+ isLooseCompressedLT(lt) || isNOutOfMLT(lt);
}
return false;
}
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
index c3436fd68242..1d6eb24156e3 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
@@ -75,6 +75,10 @@ def InterpreterPass : Pass<"transform-interpreter"> {
"Select the operation with 'transform.target_tag' attribute having "
"the given value as payload IR root. If empty select the pass "
"anchor operation as the payload IR root.">,
+ ListOption<"debugBindTrailingArgs", "debug-bind-trailing-args",
+ "std::string",
+ "Binds trailing arguments of the entry point to the payload "
+ "operations with specified names.">,
Option<"disableExpensiveChecks", "disable-expensive-checks", "bool",
"false",
"Disable expensive checks in the interpreter for a faster run.">,
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
index 1737d72838e9..738e0c533c6b 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
@@ -84,6 +84,11 @@ LogicalResult applyTransformNamedSequence(Operation *payload,
ModuleOp transformModule,
const TransformOptions &options);
+LogicalResult applyTransformNamedSequence(RaggedArray<MappedValue> bindings,
+ TransformOpInterface transformRoot,
+ ModuleOp transformModule,
+ const TransformOptions &options);
+
} // namespace transform
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h b/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h
index 0ee23914fa4e..3d4083bdd073 100644
--- a/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h
+++ b/mlir/include/mlir/Dialect/Transform/Utils/RaggedArray.h
@@ -150,6 +150,9 @@ public:
slices.resize(slices.size() + num, std::pair<size_t, size_t>(-1, 0));
}
+ /// Removes the first subarray in-place. Invalidates iterators to all rows.
+ void removeFront() { slices.erase(slices.begin()); }
+
private:
/// Appends the given elements to the storage and returns an ArrayRef
/// pointing to them in the storage.
diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
index 134c5569fbb2..929a2a7d3964 100644
--- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
@@ -121,7 +121,9 @@ public:
}
bool layout(MapList l) {
- auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+ auto infer = [&](MapList m) {
+ return AffineMap::inferFromExprList(m, ctx);
+ };
return maps == infer(l);
}
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index bc08f8d07fb0..6d50b0654bc5 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -478,6 +478,69 @@ def Vector_ShuffleOp :
let hasCanonicalizer = 1;
}
+def Vector_InterleaveOp :
+ Vector_Op<"interleave", [Pure,
+ AllTypesMatch<["lhs", "rhs"]>,
+ TypesMatchWith<
+ "type of 'result' is double the width of the inputs",
+ "lhs", "result",
+ [{
+ [&]() -> ::mlir::VectorType {
+ auto vectorType = ::llvm::cast<mlir::VectorType>($_self);
+ ::mlir::VectorType::Builder builder(vectorType);
+ if (vectorType.getRank() == 0) {
+ static constexpr int64_t v2xty_shape[] = { 2 };
+ return builder.setShape(v2xty_shape);
+ }
+ auto lastDim = vectorType.getRank() - 1;
+ return builder.setDim(lastDim, vectorType.getDimSize(lastDim) * 2);
+ }()
+ }]>]> {
+ let summary = "constructs a vector by interleaving two input vectors";
+ let description = [{
+ The interleave operation constructs a new vector by interleaving the
+ elements from the trailing (or final) dimension of two input vectors,
+ returning a new vector where the trailing dimension is twice the size.
+
+ Note that for the n-D case this differs from the interleaving possible with
+ `vector.shuffle`, which would only operate on the leading dimension.
+
+ Another key difference is this operation supports scalable vectors, though
+ currently a general LLVM lowering is limited to the case where only the
+ trailing dimension is scalable.
+
+ Example:
+ ```mlir
+ %0 = vector.interleave %a, %b
+ : vector<[4]xi32> ; yields vector<[8]xi32>
+ %1 = vector.interleave %c, %d
+ : vector<8xi8> ; yields vector<16xi8>
+ %2 = vector.interleave %e, %f
+ : vector<f16> ; yields vector<2xf16>
+ %3 = vector.interleave %g, %h
+ : vector<2x4x[2]xf64> ; yields vector<2x4x[4]xf64>
+ %4 = vector.interleave %i, %j
+ : vector<6x3xf32> ; yields vector<6x6xf32>
+ ```
+ }];
+
+ let arguments = (ins AnyVectorOfAnyRank:$lhs, AnyVectorOfAnyRank:$rhs);
+ let results = (outs AnyVector:$result);
+
+ let assemblyFormat = [{
+ $lhs `,` $rhs attr-dict `:` type($lhs)
+ }];
+
+ let extraClassDeclaration = [{
+ VectorType getSourceVectorType() {
+ return ::llvm::cast<VectorType>(getLhs().getType());
+ }
+ VectorType getResultVectorType() {
+ return ::llvm::cast<VectorType>(getResult().getType());
+ }
+ }];
+}
+
def Vector_ExtractElementOp :
Vector_Op<"extractelement", [Pure,
TypesMatchWith<"result type matches element type of vector operand",
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
index f5941d32e683..7c943f07066c 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
@@ -20,7 +20,9 @@
#include "mlir/Dialect/Vector/Transforms/VectorTransformsEnums.h.inc"
namespace mlir {
+class ConversionTarget;
class RewritePatternSet;
+class TypeConverter;
namespace arith {
class AndIOp;
@@ -375,6 +377,13 @@ void populateVectorNarrowTypeRewritePatterns(RewritePatternSet &patterns,
void populateVectorTransposeNarrowTypeRewritePatterns(
RewritePatternSet &patterns, PatternBenefit benefit = 1);
+/// Populates patterns for ND vectors (N >= 2) linearization and sets up the
+/// provided ConversionTarget with the appropriate legality configuration for
+/// the ops to get converted properly.
+void populateVectorLinearizeTypeConversionsAndLegality(
+ TypeConverter &typeConverter, RewritePatternSet &patterns,
+ ConversionTarget &target);
+
} // namespace vector
} // namespace mlir
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 01c5f2382ffe..14182172f4f6 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -123,8 +123,8 @@ public:
/// Safely checks if the level uses singleton storage.
bool isSingletonLvl(uint64_t l) const { return isSingletonLT(getLvlType(l)); }
- /// Safely checks if the level uses 2 out of 4 storage.
- bool is2OutOf4Lvl(uint64_t l) const { return is2OutOf4LT(getLvlType(l)); }
+ /// Safely checks if the level uses n out of m storage.
+ bool isNOutOfMLvl(uint64_t l) const { return isNOutOfMLT(getLvlType(l)); }
/// Safely checks if the level is ordered.
bool isOrderedLvl(uint64_t l) const { return isOrderedLT(getLvlType(l)); }
@@ -450,7 +450,7 @@ private:
void appendCrd(uint64_t lvl, uint64_t full, uint64_t crd) {
if (!isDenseLvl(lvl)) {
assert(isCompressedLvl(lvl) || isLooseCompressedLvl(lvl) ||
- isSingletonLvl(lvl) || is2OutOf4Lvl(lvl));
+ isSingletonLvl(lvl) || isNOutOfMLvl(lvl));
coordinates[lvl].push_back(detail::checkOverflowCast<C>(crd));
} else { // Dense level.
assert(crd >= full && "Coordinate was already filled");
@@ -473,7 +473,7 @@ private:
return positions[l][parentSz];
if (isLooseCompressedLvl(l))
return positions[l][2 * parentSz - 1];
- if (isSingletonLvl(l) || is2OutOf4Lvl(l))
+ if (isSingletonLvl(l) || isNOutOfMLvl(l))
return parentSz; // new size same as the parent
assert(isDenseLvl(l));
return parentSz * getLvlSize(l);
@@ -527,7 +527,7 @@ private:
uint64_t pos = coordinates[l].size();
positions[l].insert(positions[l].end(), 2 * count,
detail::checkOverflowCast<P>(pos));
- } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) {
+ } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) {
return; // Nothing to finalize.
} else { // Dense dimension.
assert(isDenseLvl(l));
@@ -624,7 +624,7 @@ private:
lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
toCOO(pos, l + 1, dimCoords);
}
- } else if (isSingletonLvl(l) || is2OutOf4Lvl(l)) {
+ } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) {
assert(parentPos < coordinates[l].size());
lvlCursor[l] = static_cast<uint64_t>(coordinates[l][parentPos]);
toCOO(parentPos, l + 1, dimCoords);
@@ -721,8 +721,8 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
} else if (isSingletonLvl(l)) {
coordinates[l].reserve(sz);
sz = 1;
- } else if (is2OutOf4Lvl(l)) {
- assert(l == lvlRank - 1 && "unexpected 2:4 usage");
+ } else if (isNOutOfMLvl(l)) {
+ assert(l == lvlRank - 1 && "unexpected n:m usage");
sz = detail::checkedMul(sz, lvlSizes[l]) / 2;
coordinates[l].reserve(sz);
values.reserve(sz);
@@ -791,8 +791,8 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
}
} else if (isSingletonLvl(l)) {
assert(0 && "general singleton not supported yet");
- } else if (is2OutOf4Lvl(l)) {
- assert(0 && "2Out4 not supported yet");
+ } else if (isNOutOfMLvl(l)) {
+ assert(0 && "n ouf of m not supported yet");
} else {
assert(isDenseLvl(l));
}
diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h
index cd751af5bb25..cce141253989 100644
--- a/mlir/include/mlir/IR/AffineMap.h
+++ b/mlir/include/mlir/IR/AffineMap.h
@@ -122,9 +122,11 @@ public:
/// `exprs.size()`, as many dims as the largest dim in `exprs` and as many
/// symbols as the largest symbol in `exprs`.
static SmallVector<AffineMap, 4>
- inferFromExprList(ArrayRef<ArrayRef<AffineExpr>> exprsList);
+ inferFromExprList(ArrayRef<ArrayRef<AffineExpr>> exprsList,
+ MLIRContext *context);
static SmallVector<AffineMap, 4>
- inferFromExprList(ArrayRef<SmallVector<AffineExpr, 4>> exprsList);
+ inferFromExprList(ArrayRef<SmallVector<AffineExpr, 4>> exprsList,
+ MLIRContext *context);
MLIRContext *getContext() const;
diff --git a/mlir/include/mlir/Interfaces/CallInterfaces.td b/mlir/include/mlir/Interfaces/CallInterfaces.td
index 3e9c002b22ed..752de74e6e4d 100644
--- a/mlir/include/mlir/Interfaces/CallInterfaces.td
+++ b/mlir/include/mlir/Interfaces/CallInterfaces.td
@@ -68,7 +68,7 @@ def CallOpInterface : OpInterface<"CallOpInterface"> {
/// `symbolTable` is an optional parameter that will allow for using a
/// cached symbol table for symbol lookups instead of performing an O(N)
/// scan.
- Operation *resolveCallable(SymbolTableCollection *symbolTable = nullptr);
+ ::mlir::Operation *resolveCallable(::mlir::SymbolTableCollection *symbolTable = nullptr);
}];
}
diff --git a/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h b/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h
index 19991a6f89d8..4a8ee06f5dcc 100644
--- a/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h
+++ b/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h
@@ -13,6 +13,7 @@
#ifndef MLIR_TARGET_LLVMIR_LLVMTRANSLATIONINTERFACE_H
#define MLIR_TARGET_LLVMIR_LLVMTRANSLATIONINTERFACE_H
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/DialectInterface.h"
#include "mlir/Support/LogicalResult.h"
@@ -25,6 +26,7 @@ class IRBuilderBase;
namespace mlir {
namespace LLVM {
class ModuleTranslation;
+class LLVMFuncOp;
} // namespace LLVM
/// Base class for dialect interfaces providing translation to LLVM IR.
@@ -58,6 +60,16 @@ public:
LLVM::ModuleTranslation &moduleTranslation) const {
return success();
}
+
+ /// Hook for derived dialect interface to translate or act on a derived
+ /// dialect attribute that appears on a function parameter. This gets called
+ /// after the function operation has been translated.
+ virtual LogicalResult
+ convertParameterAttr(LLVM::LLVMFuncOp function, int argIdx,
+ NamedAttribute attr,
+ LLVM::ModuleTranslation &moduleTranslation) const {
+ return success();
+ }
};
/// Interface collection for translation to LLVM IR, dispatches to a concrete
@@ -90,6 +102,22 @@ public:
}
return success();
}
+
+ /// Acts on the given function operation using the interface implemented by
+ /// the dialect of one of the function parameter attributes.
+ virtual LogicalResult
+ convertParameterAttr(LLVM::LLVMFuncOp function, int argIdx,
+ NamedAttribute attribute,
+ LLVM::ModuleTranslation &moduleTranslation) const {
+ if (const LLVMTranslationDialectInterface *iface =
+ getInterfaceFor(attribute.getNameDialect())) {
+ return iface->convertParameterAttr(function, argIdx, attribute,
+ moduleTranslation);
+ }
+ function.emitWarning("Unhandled parameter attribute '" +
+ attribute.getName().str() + "'");
+ return success();
+ }
};
} // namespace mlir
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index d6b03aca28d2..fb4392eb223c 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -327,7 +327,9 @@ private:
ArrayRef<llvm::Instruction *> instructions);
/// Translates parameter attributes and adds them to the returned AttrBuilder.
- llvm::AttrBuilder convertParameterAttrs(DictionaryAttr paramAttrs);
+ /// Returns failure if any of the translations failed.
+ FailureOr<llvm::AttrBuilder>
+ convertParameterAttrs(LLVMFuncOp func, int argIdx, DictionaryAttr paramAttrs);
/// Original and translated module.
Operation *mlirModule;
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index b1ec1fe4ecd5..091131651bbf 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -604,6 +604,29 @@ private:
using ConversionPattern::matchAndRewrite;
};
+/// OpTraitConversionPattern is a wrapper around ConversionPattern that allows
+/// for matching and rewriting against instances of an operation that possess a
+/// given trait.
+template <template <typename> class TraitType>
+class OpTraitConversionPattern : public ConversionPattern {
+public:
+ OpTraitConversionPattern(MLIRContext *context, PatternBenefit benefit = 1)
+ : ConversionPattern(Pattern::MatchTraitOpTypeTag(),
+ TypeID::get<TraitType>(), benefit, context) {}
+ OpTraitConversionPattern(const TypeConverter &typeConverter,
+ MLIRContext *context, PatternBenefit benefit = 1)
+ : ConversionPattern(typeConverter, Pattern::MatchTraitOpTypeTag(),
+ TypeID::get<TraitType>(), benefit, context) {}
+};
+
+/// Generic utility to convert op result types according to type converter
+/// without knowing exact op type.
+/// Clones existing op with new result types and returns it.
+FailureOr<Operation *>
+convertOpResultTypes(Operation *op, ValueRange operands,
+ const TypeConverter &converter,
+ ConversionPatternRewriter &rewriter);
+
/// Add a pattern to the given pattern list to convert the signature of a
/// FunctionOpInterface op with the given type converter. This only supports
/// ops which use FunctionType to represent their type.
diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
index 698367a1adda..74f4d2413a69 100644
--- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
+++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
@@ -25,7 +25,7 @@ using namespace mlir::python::adaptors;
static void populateDialectSparseTensorSubmodule(const py::module &m) {
py::enum_<MlirBaseSparseTensorLevelType>(m, "LevelType", py::module_local())
.value("dense", MLIR_SPARSE_TENSOR_LEVEL_DENSE)
- .value("compressed24", MLIR_SPARSE_TENSOR_LEVEL_TWO_OUT_OF_FOUR)
+ .value("n_out_of_m", MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M)
.value("compressed", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED)
.value("compressed_nu", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU)
.value("compressed_no", MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO)
@@ -60,6 +60,15 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) {
py::arg("lvl_to_dim"), py::arg("pos_width"), py::arg("crd_width"),
py::arg("context") = py::none(),
"Gets a sparse_tensor.encoding from parameters.")
+ .def_classmethod(
+ "build_level_type",
+ [](py::object cls, MlirBaseSparseTensorLevelType lvlType, unsigned n,
+ unsigned m) {
+ return mlirSparseTensorEncodingAttrBuildLvlType(lvlType, n, m);
+ },
+ py::arg("cls"), py::arg("lvl_type"), py::arg("n") = 0,
+ py::arg("m") = 0,
+ "Builds a sparse_tensor.encoding.level_type from parameters.")
.def_property_readonly(
"lvl_types",
[](MlirAttribute self) {
@@ -89,7 +98,34 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) {
.def_property_readonly("pos_width",
mlirSparseTensorEncodingAttrGetPosWidth)
.def_property_readonly("crd_width",
- mlirSparseTensorEncodingAttrGetCrdWidth);
+ mlirSparseTensorEncodingAttrGetCrdWidth)
+ .def_property_readonly(
+ "structured_n",
+ [](MlirAttribute self) -> unsigned {
+ const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self);
+ return mlirSparseTensorEncodingAttrGetStructuredN(
+ mlirSparseTensorEncodingAttrGetLvlType(self, lvlRank - 1));
+ })
+ .def_property_readonly(
+ "structured_m",
+ [](MlirAttribute self) -> unsigned {
+ const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self);
+ return mlirSparseTensorEncodingAttrGetStructuredM(
+ mlirSparseTensorEncodingAttrGetLvlType(self, lvlRank - 1));
+ })
+ .def_property_readonly("lvl_types_enum", [](MlirAttribute self) {
+ const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self);
+ std::vector<MlirBaseSparseTensorLevelType> ret;
+ ret.reserve(lvlRank);
+ for (int l = 0; l < lvlRank; l++) {
+ // Convert level type to 32 bits to ignore n and m for n_out_of_m
+ // format.
+ ret.push_back(
+ static_cast<MlirBaseSparseTensorLevelType>(static_cast<uint32_t>(
+ mlirSparseTensorEncodingAttrGetLvlType(self, l))));
+ }
+ return ret;
+ });
}
PYBIND11_MODULE(_mlirDialectsSparseTensor, m) {
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 5412c3dec4b1..8a7951dc29fe 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -636,6 +636,13 @@ size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); }
size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); }
+std::vector<PyOperation *> PyMlirContext::getLiveOperationObjects() {
+ std::vector<PyOperation *> liveObjects;
+ for (auto &entry : liveOperations)
+ liveObjects.push_back(entry.second.second);
+ return liveObjects;
+}
+
size_t PyMlirContext::clearLiveOperations() {
for (auto &op : liveOperations)
op.second.second->setInvalid();
@@ -2546,6 +2553,8 @@ void mlir::python::populateIRCore(py::module &m) {
return ref.releaseObject();
})
.def("_get_live_operation_count", &PyMlirContext::getLiveOperationCount)
+ .def("_get_live_operation_objects",
+ &PyMlirContext::getLiveOperationObjects)
.def("_clear_live_operations", &PyMlirContext::clearLiveOperations)
.def("_get_live_module_count", &PyMlirContext::getLiveModuleCount)
.def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR,
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index 79b7e0c96188..48f39c939340 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -201,6 +201,9 @@ public:
/// Gets the count of live context objects. Used for testing.
static size_t getLiveCount();
+ /// Get a list of Python objects which are still in the live context map.
+ std::vector<PyOperation *> getLiveOperationObjects();
+
/// Gets the count of live operations associated with this context.
/// Used for testing.
size_t getLiveOperationCount();
diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
index e4534ad13238..4e1bd45863fd 100644
--- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp
+++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
@@ -20,25 +20,36 @@ MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor,
mlir::sparse_tensor::SparseTensorDialect)
// Ensure the C-API enums are int-castable to C++ equivalents.
-static_assert(static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_DENSE) ==
- static_cast<int>(LevelType::Dense) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) ==
- static_cast<int>(LevelType::Compressed) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) ==
- static_cast<int>(LevelType::CompressedNu) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) ==
- static_cast<int>(LevelType::CompressedNo) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO) ==
- static_cast<int>(LevelType::CompressedNuNo) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON) ==
- static_cast<int>(LevelType::Singleton) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU) ==
- static_cast<int>(LevelType::SingletonNu) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO) ==
- static_cast<int>(LevelType::SingletonNo) &&
- static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO) ==
- static_cast<int>(LevelType::SingletonNuNo),
- "MlirSparseTensorLevelType (C-API) and LevelType (C++) mismatch");
+static_assert(
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_DENSE) ==
+ static_cast<int>(LevelType::Dense) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED) ==
+ static_cast<int>(LevelType::Compressed) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU) ==
+ static_cast<int>(LevelType::CompressedNu) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NO) ==
+ static_cast<int>(LevelType::CompressedNo) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED_NU_NO) ==
+ static_cast<int>(LevelType::CompressedNuNo) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON) ==
+ static_cast<int>(LevelType::Singleton) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU) ==
+ static_cast<int>(LevelType::SingletonNu) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NO) ==
+ static_cast<int>(LevelType::SingletonNo) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_SINGLETON_NU_NO) ==
+ static_cast<int>(LevelType::SingletonNuNo) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED) ==
+ static_cast<int>(LevelType::LooseCompressed) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU) ==
+ static_cast<int>(LevelType::LooseCompressedNu) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NO) ==
+ static_cast<int>(LevelType::LooseCompressedNo) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED_NU_NO) ==
+ static_cast<int>(LevelType::LooseCompressedNuNo) &&
+ static_cast<int>(MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M) ==
+ static_cast<int>(LevelType::NOutOfM),
+ "MlirSparseTensorLevelType (C-API) and LevelType (C++) mismatch");
bool mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr) {
return isa<SparseTensorEncodingAttr>(unwrap(attr));
@@ -83,3 +94,21 @@ int mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr) {
int mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr) {
return cast<SparseTensorEncodingAttr>(unwrap(attr)).getCrdWidth();
}
+
+MlirSparseTensorLevelType
+mlirSparseTensorEncodingAttrBuildLvlType(MlirBaseSparseTensorLevelType lvlType,
+ unsigned n, unsigned m) {
+ LevelType lt = static_cast<LevelType>(lvlType);
+ return static_cast<MlirSparseTensorLevelType>(*buildLevelType(
+ *getLevelFormat(lt), isOrderedLT(lt), isUniqueLT(lt), n, m));
+}
+
+unsigned
+mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType) {
+ return getN(static_cast<LevelType>(lvlType));
+}
+
+unsigned
+mlirSparseTensorEncodingAttrGetStructuredM(MlirSparseTensorLevelType lvlType) {
+ return getM(static_cast<LevelType>(lvlType));
+}
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index 4c9dad9e2c17..cc315110f9be 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -26,29 +26,59 @@ namespace mlir {
using namespace mlir;
namespace {
+// The algorithm is listed in https://dl.acm.org/doi/pdf/10.1145/363717.363780.
struct AbsOpConversion : public OpConversionPattern<complex::AbsOp> {
using OpConversionPattern<complex::AbsOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(complex::AbsOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto loc = op.getLoc();
- auto type = op.getType();
+ mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();
- Value real =
- rewriter.create<complex::ReOp>(loc, type, adaptor.getComplex());
- Value imag =
- rewriter.create<complex::ImOp>(loc, type, adaptor.getComplex());
- Value realSqr =
- rewriter.create<arith::MulFOp>(loc, real, real, fmf.getValue());
- Value imagSqr =
- rewriter.create<arith::MulFOp>(loc, imag, imag, fmf.getValue());
- Value sqNorm =
- rewriter.create<arith::AddFOp>(loc, realSqr, imagSqr, fmf.getValue());
-
- rewriter.replaceOpWithNewOp<math::SqrtOp>(op, sqNorm);
+ Type elementType = op.getType();
+ Value arg = adaptor.getComplex();
+
+ Value zero =
+ b.create<arith::ConstantOp>(elementType, b.getZeroAttr(elementType));
+ Value one = b.create<arith::ConstantOp>(elementType,
+ b.getFloatAttr(elementType, 1.0));
+
+ Value real = b.create<complex::ReOp>(elementType, arg);
+ Value imag = b.create<complex::ImOp>(elementType, arg);
+
+ Value realIsZero =
+ b.create<arith::CmpFOp>(arith::CmpFPredicate::OEQ, real, zero);
+ Value imagIsZero =
+ b.create<arith::CmpFOp>(arith::CmpFPredicate::OEQ, imag, zero);
+
+ // Real > Imag
+ Value imagDivReal = b.create<arith::DivFOp>(imag, real, fmf.getValue());
+ Value imagSq =
+ b.create<arith::MulFOp>(imagDivReal, imagDivReal, fmf.getValue());
+ Value imagSqPlusOne = b.create<arith::AddFOp>(imagSq, one, fmf.getValue());
+ Value imagSqrt = b.create<math::SqrtOp>(imagSqPlusOne, fmf.getValue());
+ Value realAbs = b.create<math::AbsFOp>(real, fmf.getValue());
+ Value absImag = b.create<arith::MulFOp>(imagSqrt, realAbs, fmf.getValue());
+
+ // Real <= Imag
+ Value realDivImag = b.create<arith::DivFOp>(real, imag, fmf.getValue());
+ Value realSq =
+ b.create<arith::MulFOp>(realDivImag, realDivImag, fmf.getValue());
+ Value realSqPlusOne = b.create<arith::AddFOp>(realSq, one, fmf.getValue());
+ Value realSqrt = b.create<math::SqrtOp>(realSqPlusOne, fmf.getValue());
+ Value imagAbs = b.create<math::AbsFOp>(imag, fmf.getValue());
+ Value absReal = b.create<arith::MulFOp>(realSqrt, imagAbs, fmf.getValue());
+
+ rewriter.replaceOpWithNewOp<arith::SelectOp>(
+ op, realIsZero, imagAbs,
+ b.create<arith::SelectOp>(
+ imagIsZero, realAbs,
+ b.create<arith::SelectOp>(
+ b.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, real, imag),
+ absImag, absReal)));
+
return success();
}
};
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 5080956a4589..9b5d19ebd783 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -956,7 +956,8 @@ struct NVGPUMBarrierTryWaitParityLowering
getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
adaptor.getMbarId(), rewriter);
Value ticks = truncToI32(b, adaptor.getTicks());
- Value phase = truncToI32(b, adaptor.getPhase());
+ Value phase =
+ b.create<LLVM::ZExtOp>(b.getI32Type(), adaptor.getPhaseParity());
if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierTryWaitParitySharedOp>(
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 1eb5678b4175..f4f6dadfb371 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -2010,7 +2010,8 @@ public:
}
bool didEncounterError = false;
- auto maps = AffineMap::inferFromExprList({srcExprs, dstExprs, dstExprs});
+ auto maps = AffineMap::inferFromExprList({srcExprs, dstExprs, dstExprs},
+ rewriter.getContext());
auto linalgOp = rewriter.create<linalg::GenericOp>(
loc, ArrayRef<Type>({resultTy, resultMaxTy}), input,
ValueRange({filledTensorIdx, filledTensorMax}), maps, iteratorTypes,
@@ -2351,9 +2352,11 @@ struct RFFT2dConverter final : public OpRewritePattern<RFFT2dOp> {
createZeroTensor(rewriter, loc, outputType, dynamicSizes)};
// Indexing maps for input and output tensors
- auto indexingMaps = AffineMap::inferFromExprList(llvm::ArrayRef{
- affineDimsExpr(rewriter, 0, 3, 4), affineDimsExpr(rewriter, 0, 1, 2),
- affineDimsExpr(rewriter, 0, 1, 2)});
+ auto indexingMaps = AffineMap::inferFromExprList(
+ llvm::ArrayRef{affineDimsExpr(rewriter, 0, 3, 4),
+ affineDimsExpr(rewriter, 0, 1, 2),
+ affineDimsExpr(rewriter, 0, 1, 2)},
+ rewriter.getContext());
// Width and height dimensions of the original input.
auto dimH = rewriter.createOrFold<tensor::DimOp>(loc, input, 1);
@@ -2463,7 +2466,8 @@ struct FFT2dConverter final : OpRewritePattern<FFT2dOp> {
ArrayRef{RFFT2dConverter::affineDimsExpr(rewriter, 0, 3, 4),
RFFT2dConverter::affineDimsExpr(rewriter, 0, 3, 4),
RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2),
- RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2)});
+ RFFT2dConverter::affineDimsExpr(rewriter, 0, 1, 2)},
+ rewriter.getContext());
// Width and height dimensions of the original input.
auto dimH = rewriter.createOrFold<tensor::DimOp>(loc, input_real, 1);
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index b63baf330c86..85fb8a539912 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -77,7 +77,9 @@ static void getXferIndices(RewriterBase &rewriter, TransferOpType xferOp,
static bool contractSupportsMMAMatrixType(vector::ContractionOp contract,
bool useNvGpu) {
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
- auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+ auto infer = [&](MapList m) {
+ return AffineMap::inferFromExprList(m, contract.getContext());
+ };
AffineExpr m, n, k;
bindDims(contract.getContext(), m, n, k);
auto iteratorTypes = contract.getIteratorTypes().getValue();
@@ -394,7 +396,9 @@ struct PrepareContractToGPUMMA
// Set up the parallel/reduction structure in right form.
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
- auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+ auto infer = [&](MapList m) {
+ return AffineMap::inferFromExprList(m, op.getContext());
+ };
AffineExpr m, n, k;
bindDims(rewriter.getContext(), m, n, k);
static constexpr std::array<int64_t, 2> perm = {1, 0};
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index b66b55ae8d57..19cc914efae0 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1734,6 +1734,45 @@ struct VectorSplatNdOpLowering : public ConvertOpToLLVMPattern<SplatOp> {
}
};
+/// Conversion pattern for a `vector.interleave`.
+/// This supports fixed-sized vectors and scalable vectors.
+struct VectorInterleaveOpLowering
+ : public ConvertOpToLLVMPattern<vector::InterleaveOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::InterleaveOp interleaveOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ VectorType resultType = interleaveOp.getResultVectorType();
+ // n-D interleaves should have been lowered already.
+ if (resultType.getRank() != 1)
+ return rewriter.notifyMatchFailure(interleaveOp,
+ "InterleaveOp not rank 1");
+ // If the result is rank 1, then this directly maps to LLVM.
+ if (resultType.isScalable()) {
+ rewriter.replaceOpWithNewOp<LLVM::experimental_vector_interleave2>(
+ interleaveOp, typeConverter->convertType(resultType),
+ adaptor.getLhs(), adaptor.getRhs());
+ return success();
+ }
+ // Lower fixed-size interleaves to a shufflevector. While the
+ // vector.interleave2 intrinsic supports fixed and scalable vectors, the
+ // langref still recommends fixed-vectors use shufflevector, see:
+ // https://llvm.org/docs/LangRef.html#id876.
+ int64_t resultVectorSize = resultType.getNumElements();
+ SmallVector<int32_t> interleaveShuffleMask;
+ interleaveShuffleMask.reserve(resultVectorSize);
+ for (int i = 0, end = resultVectorSize / 2; i < end; ++i) {
+ interleaveShuffleMask.push_back(i);
+ interleaveShuffleMask.push_back((resultVectorSize / 2) + i);
+ }
+ rewriter.replaceOpWithNewOp<LLVM::ShuffleVectorOp>(
+ interleaveOp, adaptor.getLhs(), adaptor.getRhs(),
+ interleaveShuffleMask);
+ return success();
+ }
+};
+
} // namespace
/// Populate the given list with patterns that convert from Vector to LLVM.
@@ -1758,7 +1797,8 @@ void mlir::populateVectorToLLVMConversionPatterns(
VectorExpandLoadOpConversion, VectorCompressStoreOpConversion,
VectorSplatOpLowering, VectorSplatNdOpLowering,
VectorScalableInsertOpLowering, VectorScalableExtractOpLowering,
- MaskedReductionOpConversion>(converter);
+ MaskedReductionOpConversion, VectorInterleaveOpLowering>(
+ converter);
// Transfer ops with rank > 1 are handled by VectorToSCF.
populateVectorTransferLoweringPatterns(patterns, /*maxTransferRank=*/1);
}
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index adb56ab36438..c4b13193f4e7 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -1145,7 +1145,9 @@ AffineApplyOp
mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
ArrayRef<OpFoldResult> operands) {
return makeComposedAffineApply(
- b, loc, AffineMap::inferFromExprList(ArrayRef<AffineExpr>{e}).front(),
+ b, loc,
+ AffineMap::inferFromExprList(ArrayRef<AffineExpr>{e}, b.getContext())
+ .front(),
operands);
}
@@ -1220,7 +1222,9 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
AffineExpr expr,
ArrayRef<OpFoldResult> operands) {
return makeComposedFoldedAffineApply(
- b, loc, AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}).front(),
+ b, loc,
+ AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}, b.getContext())
+ .front(),
operands);
}
diff --git a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp
index f69b2557eec9..d7492c9e25db 100644
--- a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -26,17 +26,18 @@ struct ConstantOpInterface
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto constantOp = cast<arith::ConstantOp>(op);
+ auto type = constantOp.getType().dyn_cast<RankedTensorType>();
+
+ // Only ranked tensors are supported.
+ if (!type)
+ return failure();
Attribute memorySpace;
- if (options.defaultMemorySpace.has_value())
- memorySpace = *options.defaultMemorySpace;
+ if (auto memSpace = options.defaultMemorySpaceFn(type))
+ memorySpace = *memSpace;
else
return constantOp->emitError("could not infer memory space");
- // Only ranked tensors are supported.
- if (!isa<RankedTensorType>(constantOp.getType()))
- return failure();
-
// Only constants inside a module are supported.
auto moduleOp = constantOp->getParentOfType<ModuleOp>();
if (!moduleOp)
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 6ca9702cbbc6..8f0f6d1fcc84 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -682,11 +682,12 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options,
return bufferizableOp.getBufferType(value, options, invocationStack);
// Op is not bufferizable.
- if (!options.defaultMemorySpace.has_value())
+ auto memSpace =
+ options.defaultMemorySpaceFn(value.getType().cast<TensorType>());
+ if (!memSpace.has_value())
return op->emitError("could not infer memory space");
- return getMemRefType(value, options, /*layout=*/{},
- *options.defaultMemorySpace);
+ return getMemRefType(value, options, /*layout=*/{}, *memSpace);
}
bool bufferization::hasTensorSemantics(Operation *op) {
@@ -936,11 +937,12 @@ FailureOr<BaseMemRefType> bufferization::detail::defaultGetBufferType(
// If we do not know the memory space and there is no default memory space,
// report a failure.
- if (!options.defaultMemorySpace.has_value())
+ auto memSpace =
+ options.defaultMemorySpaceFn(value.getType().cast<TensorType>());
+ if (!memSpace.has_value())
return op->emitError("could not infer memory space");
- return getMemRefType(value, options, /*layout=*/{},
- *options.defaultMemorySpace);
+ return getMemRefType(value, options, /*layout=*/{}, *memSpace);
}
bool bufferization::detail::defaultIsRepetitiveRegion(
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index eb4a96f35499..34a0c594a5a5 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -234,8 +234,8 @@ AllocTensorOp::getBufferType(Value value, const BufferizationOptions &options,
if (failed(copyBufferType))
return failure();
memorySpace = copyBufferType->getMemorySpace();
- } else if (options.defaultMemorySpace.has_value()) {
- memorySpace = *options.defaultMemorySpace;
+ } else if (auto ms = options.defaultMemorySpaceFn(getType())) {
+ memorySpace = *ms;
} else {
return getOperation()->emitError("could not infer memory space");
}
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index dc94b72edcdf..208cbda3a9eb 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -210,8 +210,12 @@ struct OneShotBufferizePass
opt.dumpAliasSets = dumpAliasSets;
opt.setFunctionBoundaryTypeConversion(
parseLayoutMapOption(functionBoundaryTypeConversion));
- if (mustInferMemorySpace)
- opt.defaultMemorySpace = std::nullopt;
+ if (mustInferMemorySpace) {
+ opt.defaultMemorySpaceFn =
+ [](TensorType t) -> std::optional<Attribute> {
+ return std::nullopt;
+ };
+ }
opt.printConflicts = printConflicts;
opt.testAnalysisOnly = testAnalysisOnly;
opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
index 07cd1f90b17d..4cdbbf35dc87 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
@@ -66,7 +66,7 @@ getBufferizedFunctionArgType(FuncOp funcOp, int64_t index,
assert(tensorType && "expected TensorType");
BaseMemRefType memrefType = options.functionArgTypeConverterFn(
- tensorType, *options.defaultMemorySpace, funcOp, options);
+ tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, options);
auto layoutAttr = funcOp.getArgAttrOfType<AffineMapAttr>(
index, BufferizationDialect::kBufferLayoutAttrName);
@@ -443,7 +443,8 @@ struct FuncOpInterface
// Note: If `inferFunctionResultLayout = true`, cast are later folded
// away.
BaseMemRefType resultType = options.functionArgTypeConverterFn(
- tensorType, *options.defaultMemorySpace, funcOp, options);
+ tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp,
+ options);
Value toMemrefOp = rewriter.create<bufferization::ToMemrefOp>(
loc, resultType, returnVal);
returnValues.push_back(toMemrefOp);
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 0b4739214bf2..935f0deaf9c8 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -51,7 +51,6 @@ void buildCommonPassPipeline(
pm.addPass(createConvertVectorToSCFPass());
pm.addPass(createConvertSCFToCFPass());
pm.addPass(createConvertNVVMToLLVMPass());
- pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createConvertFuncToLLVMPass());
pm.addPass(memref::createExpandStridedMetadataPass());
@@ -98,6 +97,7 @@ void buildHostPostPipeline(OpPassManager &pm,
GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
+ pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index a855e4b209ac..9b1ef084ee7f 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1074,6 +1074,35 @@ LogicalResult NVVMDialect::verifyOperationAttribute(Operation *op,
return success();
}
+LogicalResult NVVMDialect::verifyRegionArgAttribute(Operation *op,
+ unsigned regionIndex,
+ unsigned argIndex,
+ NamedAttribute argAttr) {
+ auto funcOp = dyn_cast<FunctionOpInterface>(op);
+ if (!funcOp)
+ return success();
+
+ bool isKernel = op->hasAttr(NVVMDialect::getKernelFuncAttrName());
+ StringAttr attrName = argAttr.getName();
+ if (attrName == NVVM::NVVMDialect::getGridConstantAttrName()) {
+ if (!isKernel) {
+ return op->emitError()
+ << "'" << attrName
+ << "' attribute must be present only on kernel arguments";
+ }
+ if (!isa<UnitAttr>(argAttr.getValue()))
+ return op->emitError() << "'" << attrName << "' must be a unit attribute";
+ if (!funcOp.getArgAttr(argIndex, LLVM::LLVMDialect::getByValAttrName())) {
+ return op->emitError()
+ << "'" << attrName
+ << "' attribute requires the argument to also have attribute '"
+ << LLVM::LLVMDialect::getByValAttrName() << "'";
+ }
+ }
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// NVVM target attribute.
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index e86b9762d858..a0f02f6a7f25 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1087,24 +1087,25 @@ LogicalResult GenericOp::verify() { return success(); }
namespace {
-/// Remove generic operations (on tensors) that are just copying
+/// Remove any linalg operation (on tensors) that are just copying
/// the values from inputs to the results. Requirements are
/// 1) All iterator types are parallel
/// 2) The body contains just a yield operation with the yielded values being
/// the arguments corresponding to the operands.
-struct EraseIdentityGenericOp : public OpRewritePattern<GenericOp> {
- using OpRewritePattern<GenericOp>::OpRewritePattern;
+template <typename OpTy>
+struct EraseIdentityLinalgOp : public OpRewritePattern<OpTy> {
+ using OpRewritePattern<OpTy>::OpRewritePattern;
- LogicalResult matchAndRewrite(GenericOp genericOp,
+ LogicalResult matchAndRewrite(OpTy linalgOp,
PatternRewriter &rewriter) const override {
// Check all indexing maps are identity.
- if (llvm::any_of(genericOp.getIndexingMapsArray(),
+ if (llvm::any_of(linalgOp.getIndexingMapsArray(),
[](AffineMap map) { return !map.isIdentity(); }))
return failure();
// Check that the body of the linalg operation is just a linalg.yield
// operation.
- Block &body = genericOp.getRegion().front();
+ Block &body = linalgOp->getRegion(0).front();
if (!llvm::hasSingleElement(body))
return failure();
auto yieldOp = dyn_cast<linalg::YieldOp>(body.getTerminator());
@@ -1112,18 +1113,18 @@ struct EraseIdentityGenericOp : public OpRewritePattern<GenericOp> {
return failure();
// In the buffer case, we need to check exact buffer equality.
- if (genericOp.hasPureBufferSemantics()) {
- if (genericOp.getNumDpsInputs() == 1 && genericOp.getNumDpsInits() == 1 &&
- genericOp.getDpsInputOperand(0)->get() ==
- genericOp.getDpsInitOperand(0)->get()) {
- rewriter.eraseOp(genericOp);
+ if (linalgOp.hasPureBufferSemantics()) {
+ if (linalgOp.getNumDpsInputs() == 1 && linalgOp.getNumDpsInits() == 1 &&
+ linalgOp.getDpsInputOperand(0)->get() ==
+ linalgOp.getDpsInitOperand(0)->get()) {
+ rewriter.eraseOp(linalgOp);
return success();
}
return failure();
}
// Mixed semantics is not supported yet.
- if (!genericOp.hasPureTensorSemantics())
+ if (!linalgOp.hasPureTensorSemantics())
return failure();
// Get the argument number of the returned values. That is the operand
@@ -1134,8 +1135,8 @@ struct EraseIdentityGenericOp : public OpRewritePattern<GenericOp> {
if (!yieldArg || yieldArg.getOwner() != &body)
return failure();
unsigned argumentNumber = yieldArg.getArgNumber();
- Value returnedArg = genericOp->getOperand(argumentNumber);
- Type resultType = genericOp->getResult(yieldVal.index()).getType();
+ Value returnedArg = linalgOp->getOperand(argumentNumber);
+ Type resultType = linalgOp->getResult(yieldVal.index()).getType();
// The input can have a different type than the result, e.g. a dynamic
// input dimension can be turned into a static output dimension.
Type returnType = returnedArg.getType();
@@ -1145,21 +1146,21 @@ struct EraseIdentityGenericOp : public OpRewritePattern<GenericOp> {
if (sparse_tensor::getSparseTensorEncoding(returnType) ||
sparse_tensor::getSparseTensorEncoding(resultType))
returnedArg = rewriter.create<sparse_tensor::ConvertOp>(
- genericOp.getLoc(), resultType, returnedArg);
+ linalgOp.getLoc(), resultType, returnedArg);
else {
if (!tensor::CastOp::areCastCompatible(returnedArg.getType(),
resultType))
return failure();
returnedArg = rewriter.create<tensor::CastOp>(
- genericOp.getLoc(), resultType, returnedArg);
+ linalgOp.getLoc(), resultType, returnedArg);
}
}
returnedArgs.push_back(returnedArg);
}
- if (returnedArgs.size() != genericOp->getNumResults())
+ if (returnedArgs.size() != linalgOp->getNumResults())
return failure();
- rewriter.replaceOp(genericOp, returnedArgs);
+ rewriter.replaceOp(linalgOp, returnedArgs);
return success();
}
};
@@ -1168,7 +1169,7 @@ struct EraseIdentityGenericOp : public OpRewritePattern<GenericOp> {
void GenericOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<EraseIdentityGenericOp>(context);
+ results.add<EraseIdentityLinalgOp<GenericOp>>(context);
}
LogicalResult GenericOp::fold(FoldAdaptor, SmallVectorImpl<OpFoldResult> &) {
@@ -1907,6 +1908,11 @@ void BroadcastOp::getEffects(
getDpsInits());
}
+void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add<EraseIdentityLinalgOp<BroadcastOp>>(context);
+}
+
//===----------------------------------------------------------------------===//
// YieldOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 80ce97ee3437..34c9b2c28296 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -73,16 +73,16 @@ static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
return true;
}
-void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) {
+void mlir::linalg::hoistRedundantVectorTransfers(Operation *root) {
bool changed = true;
while (changed) {
changed = false;
// First move loop invariant ops outside of their loop. This needs to be
// done before as we cannot move ops without interrupting the function walk.
- func.walk(
+ root->walk(
[&](LoopLikeOpInterface loopLike) { moveLoopInvariantCode(loopLike); });
- func.walk([&](vector::TransferReadOp transferRead) {
+ root->walk([&](vector::TransferReadOp transferRead) {
if (!isa<MemRefType>(transferRead.getShapedType()))
return WalkResult::advance();
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp
index 0174db45a83d..47b5fcd4014a 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp
@@ -83,7 +83,9 @@ linalg::splitOp(RewriterBase &rewriter, TilingInterface op, unsigned dimension,
bindDims(rewriter.getContext(), d0, d1, d2);
OpFoldResult minSplitPoint = affine::makeComposedFoldedAffineMin(
rewriter, op.getLoc(),
- AffineMap::inferFromExprList(ArrayRef<AffineExpr>{d0, d1 + d2}).front(),
+ AffineMap::inferFromExprList(ArrayRef<AffineExpr>{d0, d1 + d2},
+ rewriter.getContext())
+ .front(),
{splitPoint, offsets[dimension], sizes[dimension]});
// Compute the size of the second part. Return early if the second part would
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 986b5f3e1fb6..5d220c6cdd7e 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -670,7 +670,8 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
<< ": make sure in bound with affine.min\n");
AffineExpr dim0, dim1, dim2;
- bindDims(builder.getContext(), dim0, dim1, dim2);
+ MLIRContext *context = builder.getContext();
+ bindDims(context, dim0, dim1, dim2);
// Get the dimension size for this dimension. We need to first calculate
// the max index and then plus one. This is important because for
@@ -678,12 +679,12 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
// form `(d0 * s0 + d1)`, where `d0`/`d1 is an output/filter window
// dimension and `s0` is stride. Directly use the dimension size of
// output/filer window dimensions will cause incorrect calculation.
- AffineMap minusOneMap =
- AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 - 1}})
- .front();
- AffineMap plusOneMap =
- AffineMap::inferFromExprList({ArrayRef<AffineExpr>{dim0 + 1}})
- .front();
+ AffineMap minusOneMap = AffineMap::inferFromExprList(
+ {ArrayRef<AffineExpr>{dim0 - 1}}, context)
+ .front();
+ AffineMap plusOneMap = AffineMap::inferFromExprList(
+ {ArrayRef<AffineExpr>{dim0 + 1}}, context)
+ .front();
SmallVector<OpFoldResult> maxIndices =
llvm::to_vector(llvm::map_range(ubs, [&](OpFoldResult ub) {
return makeComposedFoldedAffineApply(rewriter, loc, minusOneMap,
@@ -696,7 +697,7 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
// Compute min(dim - offset, size) to avoid out-of-bounds accesses.
AffineMap minMap = AffineMap::inferFromExprList(
- {ArrayRef<AffineExpr>{dim1 - dim2, dim0}})
+ {ArrayRef<AffineExpr>{dim1 - dim2, dim0}}, context)
.front();
size =
makeComposedFoldedAffineMin(rewriter, loc, minMap, {size, d, offset});
diff --git a/mlir/lib/Dialect/Math/Transforms/LegalizeToF32.cpp b/mlir/lib/Dialect/Math/Transforms/LegalizeToF32.cpp
index d281790e8771..5998133b7eab 100644
--- a/mlir/lib/Dialect/Math/Transforms/LegalizeToF32.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/LegalizeToF32.cpp
@@ -76,20 +76,14 @@ LogicalResult LegalizeToF32RewritePattern::matchAndRewrite(
ConversionPatternRewriter &rewriter) const {
Location loc = op->getLoc();
const TypeConverter *converter = getTypeConverter();
- if (converter->isLegal(op))
- return rewriter.notifyMatchFailure(loc, "op already legal");
- OperationState newOp(loc, op->getName());
- newOp.addOperands(operands);
+ FailureOr<Operation *> legalized =
+ convertOpResultTypes(op, operands, *converter, rewriter);
+ if (failed(legalized))
+ return failure();
- SmallVector<Type> newResultTypes;
- if (failed(converter->convertTypes(op->getResultTypes(), newResultTypes)))
- return rewriter.notifyMatchFailure(loc, "couldn't convert return types");
- newOp.addTypes(newResultTypes);
- newOp.addAttributes(op->getAttrs());
- Operation *legalized = rewriter.create(newOp);
- SmallVector<Value> results = legalized->getResults();
- for (auto [result, newType, origType] :
- llvm::zip_equal(results, newResultTypes, op->getResultTypes())) {
+ SmallVector<Value> results = (*legalized)->getResults();
+ for (auto [result, newType, origType] : llvm::zip_equal(
+ results, (*legalized)->getResultTypes(), op->getResultTypes())) {
if (newType != origType)
result = rewriter.create<arith::TruncFOp>(loc, origType, result);
}
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 4b6327479a21..26f831f10a4e 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -362,7 +362,8 @@ std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref(
<< kMaxTMADimension << " but it is " << dim;
}
}
- if (descMemref.getRank() > 1) {
+ if (descMemref.getRank() > 1 &&
+ descType.getSwizzle() != TensorMapSwizzleKind::SWIZZLE_NONE) {
unsigned lastDimensionByte =
descMemref.getElementTypeBitWidth() * descMemref.getShape().back() / 8;
if (lastDimensionByte != kMaxTMALastdimByte)
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index c81742233e6d..1635297a5447 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -1010,7 +1010,8 @@ void HopperBuilder::buildBarrierArriveTx(
void HopperBuilder::buildTryWaitParity(
TypedValue<nvgpu::MBarrierGroupType> barrier) {
- Value parity = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+ Type i1 = rewriter.getI1Type();
+ Value parity = rewriter.create<LLVM::ConstantOp>(loc, i1, 0);
// 10M is an arbitrary, not too small or too big number to specify the number
// of ticks before retry.
// TODO: hoist this in a default dialect constant.
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 381f17d08041..849449f9127d 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -21,6 +21,7 @@
#include "mlir/Interfaces/FoldInterfaces.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -34,6 +35,7 @@
#include "mlir/Dialect/OpenMP/OpenMPOpsEnums.cpp.inc"
#include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.cpp.inc"
#include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.cpp.inc"
+#include "mlir/Support/LogicalResult.h"
using namespace mlir;
using namespace mlir::omp;
@@ -427,6 +429,71 @@ static void printScheduleClause(OpAsmPrinter &p, Operation *op,
// Parser, printer and verifier for ReductionVarList
//===----------------------------------------------------------------------===//
+ParseResult
+parseReductionClause(OpAsmParser &parser, Region &region,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
+ SmallVectorImpl<Type> &types, ArrayAttr &reductionSymbols,
+ SmallVectorImpl<OpAsmParser::Argument> &privates) {
+ if (failed(parser.parseOptionalKeyword("reduction")))
+ return failure();
+
+ SmallVector<SymbolRefAttr> reductionVec;
+
+ if (failed(
+ parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Paren, [&]() {
+ if (parser.parseAttribute(reductionVec.emplace_back()) ||
+ parser.parseOperand(operands.emplace_back()) ||
+ parser.parseArrow() ||
+ parser.parseArgument(privates.emplace_back()) ||
+ parser.parseColonType(types.emplace_back()))
+ return failure();
+ return success();
+ })))
+ return failure();
+
+ for (auto [prv, type] : llvm::zip_equal(privates, types)) {
+ prv.type = type;
+ }
+ SmallVector<Attribute> reductions(reductionVec.begin(), reductionVec.end());
+ reductionSymbols = ArrayAttr::get(parser.getContext(), reductions);
+ return success();
+}
+
+static void printReductionClause(OpAsmPrinter &p, Operation *op, Region &region,
+ ValueRange operands, TypeRange types,
+ ArrayAttr reductionSymbols) {
+ p << "reduction(";
+ llvm::interleaveComma(llvm::zip_equal(reductionSymbols, operands,
+ region.front().getArguments(), types),
+ p, [&p](auto t) {
+ auto [sym, op, arg, type] = t;
+ p << sym << " " << op << " -> " << arg << " : "
+ << type;
+ });
+ p << ") ";
+}
+
+static ParseResult
+parseParallelRegion(OpAsmParser &parser, Region &region,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
+ SmallVectorImpl<Type> &types, ArrayAttr &reductionSymbols) {
+
+ llvm::SmallVector<OpAsmParser::Argument> privates;
+ if (succeeded(parseReductionClause(parser, region, operands, types,
+ reductionSymbols, privates)))
+ return parser.parseRegion(region, privates);
+
+ return parser.parseRegion(region);
+}
+
+static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region &region,
+ ValueRange operands, TypeRange types,
+ ArrayAttr reductionSymbols) {
+ if (reductionSymbols)
+ printReductionClause(p, op, region, operands, types, reductionSymbols);
+ p.printRegion(region, /*printEntryBlockArgs=*/false);
+}
+
/// reduction-entry-list ::= reduction-entry
/// | reduction-entry-list `,` reduction-entry
/// reduction-entry ::= symbol-ref `->` ssa-id `:` type
@@ -561,7 +628,7 @@ static LogicalResult verifyDependVarList(Operation *op,
return op->emitOpError() << "expected as many depend values"
" as depend variables";
} else {
- if (depends)
+ if (depends && !depends->empty())
return op->emitOpError() << "unexpected depend values";
return success();
}
@@ -965,19 +1032,31 @@ LogicalResult DataOp::verify() {
}
LogicalResult EnterDataOp::verify() {
- return verifyMapClause(*this, getMapOperands());
+ LogicalResult verifyDependVars =
+ verifyDependVarList(*this, getDepends(), getDependVars());
+ return failed(verifyDependVars) ? verifyDependVars
+ : verifyMapClause(*this, getMapOperands());
}
LogicalResult ExitDataOp::verify() {
- return verifyMapClause(*this, getMapOperands());
+ LogicalResult verifyDependVars =
+ verifyDependVarList(*this, getDepends(), getDependVars());
+ return failed(verifyDependVars) ? verifyDependVars
+ : verifyMapClause(*this, getMapOperands());
}
LogicalResult UpdateDataOp::verify() {
- return verifyMapClause(*this, getMapOperands());
+ LogicalResult verifyDependVars =
+ verifyDependVarList(*this, getDepends(), getDependVars());
+ return failed(verifyDependVars) ? verifyDependVars
+ : verifyMapClause(*this, getMapOperands());
}
LogicalResult TargetOp::verify() {
- return verifyMapClause(*this, getMapOperands());
+ LogicalResult verifyDependVars =
+ verifyDependVarList(*this, getDepends(), getDependVars());
+ return failed(verifyDependVars) ? verifyDependVars
+ : verifyMapClause(*this, getMapOperands());
}
//===----------------------------------------------------------------------===//
@@ -1114,6 +1193,7 @@ parseLoopControl(OpAsmParser &parser, Region &region,
loopVarTypes = SmallVector<Type>(ivs.size(), loopVarType);
for (auto &iv : ivs)
iv.type = loopVarType;
+
return parser.parseRegion(region, ivs);
}
@@ -1594,6 +1674,73 @@ LogicalResult DataBoundsOp::verify() {
return success();
}
+LogicalResult PrivateClauseOp::verify() {
+ Type symType = getType();
+
+ auto verifyTerminator = [&](Operation *terminator) -> LogicalResult {
+ if (!terminator->hasSuccessors() && !llvm::isa<YieldOp>(terminator))
+ return mlir::emitError(terminator->getLoc())
+ << "expected exit block terminator to be an `omp.yield` op.";
+
+ YieldOp yieldOp = llvm::cast<YieldOp>(terminator);
+ TypeRange yieldedTypes = yieldOp.getResults().getTypes();
+
+ if (yieldedTypes.size() == 1 && yieldedTypes.front() == symType)
+ return success();
+
+ auto error = mlir::emitError(yieldOp.getLoc())
+ << "Invalid yielded value. Expected type: " << symType
+ << ", got: ";
+
+ if (yieldedTypes.empty())
+ error << "None";
+ else
+ error << yieldedTypes;
+
+ return error;
+ };
+
+ auto verifyRegion = [&](Region &region, unsigned expectedNumArgs,
+ StringRef regionName) -> LogicalResult {
+ assert(!region.empty());
+
+ if (region.getNumArguments() != expectedNumArgs)
+ return mlir::emitError(region.getLoc())
+ << "`" << regionName << "`: "
+ << "expected " << expectedNumArgs
+ << " region arguments, got: " << region.getNumArguments();
+
+ for (Block &block : region) {
+ // MLIR will verify the absence of the terminator for us.
+ if (!block.mightHaveTerminator())
+ continue;
+
+ if (failed(verifyTerminator(block.getTerminator())))
+ return failure();
+ }
+
+ return success();
+ };
+
+ if (failed(verifyRegion(getAllocRegion(), /*expectedNumArgs=*/1, "alloc")))
+ return failure();
+
+ DataSharingClauseType dsType = getDataSharingType();
+
+ if (dsType == DataSharingClauseType::Private && !getCopyRegion().empty())
+ return emitError("`private` clauses require only an `alloc` region.");
+
+ if (dsType == DataSharingClauseType::FirstPrivate && getCopyRegion().empty())
+ return emitError(
+ "`firstprivate` clauses require both `alloc` and `copy` regions.");
+
+ if (dsType == DataSharingClauseType::FirstPrivate &&
+ failed(verifyRegion(getCopyRegion(), /*expectedNumArgs=*/2, "copy")))
+ return failure();
+
+ return success();
+}
+
#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc"
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 9822ee522c6e..119df9acd9e9 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1190,7 +1190,7 @@ void ForallOp::print(OpAsmPrinter &p) {
OpAsmParser::Delimiter::Paren);
p << " step ";
printDynamicIndexList(p, op, getDynamicStep(), getStaticStep(),
- /*valueTypes=*/{}, /*scalable=*/{},
+ /*valueTypes=*/{}, /*scalables=*/{},
OpAsmParser::Delimiter::Paren);
}
printInitializationList(p, getRegionOutArgs(), getOutputs(), " shared_outs");
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 90f935d71c2f..2a16b10bbaf8 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -463,7 +463,7 @@ DenseSet<int64_t> getEquivalentBuffers(Block::BlockArgListType bbArgs,
/// Helper function for loop bufferization. Return the bufferized values of the
/// given OpOperands. If an operand is not a tensor, return the original value.
static FailureOr<SmallVector<Value>>
-getBuffers(RewriterBase &rewriter, MutableOperandRange operands,
+getBuffers(RewriterBase &rewriter, const MutableOperandRange &operands,
const BufferizationOptions &options) {
SmallVector<Value> result;
for (OpOperand &opOperand : operands) {
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index fdaeb2fad9af..e5494205e086 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
ParallelLoopTiling.cpp
StructuralTypeConversions.cpp
TileUsingInterface.cpp
+ WrapInZeroTripCheck.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SCF
diff --git a/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp b/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp
new file mode 100644
index 000000000000..f829208ce879
--- /dev/null
+++ b/mlir/lib/Dialect/SCF/Transforms/WrapInZeroTripCheck.cpp
@@ -0,0 +1,132 @@
+//===- WrapInZeroTripCheck.cpp - Loop transforms to add zero-trip-check ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+
+/// Create zero-trip-check around a `while` op and return the new loop op in the
+/// check. The while loop is rotated to avoid evaluating the condition twice.
+///
+/// Given an example below:
+///
+/// scf.while (%arg0 = %init) : (i32) -> i64 {
+/// %val = .., %arg0 : i64
+/// %cond = arith.cmpi .., %arg0 : i32
+/// scf.condition(%cond) %val : i64
+/// } do {
+/// ^bb0(%arg1: i64):
+/// %next = .., %arg1 : i32
+/// scf.yield %next : i32
+/// }
+///
+/// First clone before block to the front of the loop:
+///
+/// %pre_val = .., %init : i64
+/// %pre_cond = arith.cmpi .., %init : i32
+/// scf.while (%arg0 = %init) : (i32) -> i64 {
+/// %val = .., %arg0 : i64
+/// %cond = arith.cmpi .., %arg0 : i32
+/// scf.condition(%cond) %val : i64
+/// } do {
+/// ^bb0(%arg1: i64):
+/// %next = .., %arg1 : i32
+/// scf.yield %next : i32
+/// }
+///
+/// Create `if` op with the condition, rotate and move the loop into the else
+/// branch:
+///
+/// %pre_val = .., %init : i64
+/// %pre_cond = arith.cmpi .., %init : i32
+/// scf.if %pre_cond -> i64 {
+/// %res = scf.while (%arg1 = %va0) : (i64) -> i64 {
+/// // Original after block
+/// %next = .., %arg1 : i32
+/// // Original before block
+/// %val = .., %next : i64
+/// %cond = arith.cmpi .., %next : i32
+/// scf.condition(%cond) %val : i64
+/// } do {
+/// ^bb0(%arg2: i64):
+/// %scf.yield %arg2 : i32
+/// }
+/// scf.yield %res : i64
+/// } else {
+/// scf.yield %pre_val : i64
+/// }
+FailureOr<scf::WhileOp> mlir::scf::wrapWhileLoopInZeroTripCheck(
+ scf::WhileOp whileOp, RewriterBase &rewriter, bool forceCreateCheck) {
+ // If the loop is in do-while form (after block only passes through values),
+ // there is no need to create a zero-trip-check as before block is always run.
+ if (!forceCreateCheck && isa<scf::YieldOp>(whileOp.getAfterBody()->front())) {
+ return whileOp;
+ }
+
+ OpBuilder::InsertionGuard insertion_guard(rewriter);
+
+ IRMapping mapper;
+ Block *beforeBlock = whileOp.getBeforeBody();
+ // Clone before block before the loop for zero-trip-check.
+ for (auto [arg, init] :
+ llvm::zip_equal(beforeBlock->getArguments(), whileOp.getInits())) {
+ mapper.map(arg, init);
+ }
+ rewriter.setInsertionPoint(whileOp);
+ for (auto &op : *beforeBlock) {
+ if (isa<scf::ConditionOp>(op)) {
+ break;
+ }
+ // Safe to clone everything as in a single block all defs have been cloned
+ // and added to mapper in order.
+ rewriter.insert(op.clone(mapper));
+ }
+
+ scf::ConditionOp condOp = whileOp.getConditionOp();
+ Value clonedCondition = mapper.lookupOrDefault(condOp.getCondition());
+ SmallVector<Value> clonedCondArgs = llvm::map_to_vector(
+ condOp.getArgs(), [&](Value arg) { return mapper.lookupOrDefault(arg); });
+
+ // Create rotated while loop.
+ auto newLoopOp = rewriter.create<scf::WhileOp>(
+ whileOp.getLoc(), whileOp.getResultTypes(), clonedCondArgs,
+ [&](OpBuilder &builder, Location loc, ValueRange args) {
+ // Rotate and move the loop body into before block.
+ auto newBlock = builder.getBlock();
+ rewriter.mergeBlocks(whileOp.getAfterBody(), newBlock, args);
+ auto yieldOp = cast<scf::YieldOp>(newBlock->getTerminator());
+ rewriter.mergeBlocks(whileOp.getBeforeBody(), newBlock,
+ yieldOp.getResults());
+ rewriter.eraseOp(yieldOp);
+ },
+ [&](OpBuilder &builder, Location loc, ValueRange args) {
+ // Pass through values.
+ builder.create<scf::YieldOp>(loc, args);
+ });
+
+ // Create zero-trip-check and move the while loop in.
+ auto ifOp = rewriter.create<scf::IfOp>(
+ whileOp.getLoc(), clonedCondition,
+ [&](OpBuilder &builder, Location loc) {
+ // Then runs the while loop.
+ rewriter.moveOpBefore(newLoopOp, builder.getInsertionBlock(),
+ builder.getInsertionPoint());
+ builder.create<scf::YieldOp>(loc, newLoopOp.getResults());
+ },
+ [&](OpBuilder &builder, Location loc) {
+ // Else returns the results from precondition.
+ builder.create<scf::YieldOp>(loc, clonedCondArgs);
+ });
+
+ rewriter.replaceOp(whileOp, ifOp);
+
+ return newLoopOp;
+}
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index cdd85ddeb93a..536c02feca1b 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -634,9 +634,9 @@ void mlir::collapseParallelLoops(
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
- for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
+ for (auto &sortedDimension : sortedDimensions) {
Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
- for (auto idx : sortedDimensions[i]) {
+ for (auto idx : sortedDimension) {
newUpperBound = outsideBuilder.create<arith::MulIOp>(
loc, newUpperBound, normalizedUpperBounds[idx]);
}
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
index eb7ea63a4e88..0fb0d2761054 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
@@ -29,12 +29,29 @@ using namespace mlir::sparse_tensor::ir_detail;
// `LvlTypeParser` implementation.
//===----------------------------------------------------------------------===//
-FailureOr<uint8_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
+FailureOr<uint64_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
StringRef base;
const auto loc = parser.getCurrentLocation();
ERROR_IF(failed(parser.parseOptionalKeyword(&base)),
"expected valid level format (e.g. dense, compressed or singleton)")
- uint8_t properties = 0;
+ uint64_t properties = 0;
+ SmallVector<unsigned> structured;
+
+ if (base.compare("structured") == 0) {
+ ParseResult res = parser.parseCommaSeparatedList(
+ mlir::OpAsmParser::Delimiter::OptionalSquare,
+ [&]() -> ParseResult { return parseStructured(parser, &structured); },
+ " in structured n out of m");
+ FAILURE_IF_FAILED(res)
+ if (structured.size() != 2) {
+ parser.emitError(loc, "expected exactly 2 structured sizes");
+ return failure();
+ }
+ if (structured[0] > structured[1]) {
+ parser.emitError(loc, "expected n <= m in n_out_of_m");
+ return failure();
+ }
+ }
ParseResult res = parser.parseCommaSeparatedList(
mlir::OpAsmParser::Delimiter::OptionalParen,
@@ -44,15 +61,16 @@ FailureOr<uint8_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
// Set the base bit for properties.
if (base.compare("dense") == 0) {
- properties |= static_cast<uint8_t>(LevelFormat::Dense);
+ properties |= static_cast<uint64_t>(LevelFormat::Dense);
} else if (base.compare("compressed") == 0) {
- properties |= static_cast<uint8_t>(LevelFormat::Compressed);
- } else if (base.compare("block2_4") == 0) {
- properties |= static_cast<uint8_t>(LevelFormat::TwoOutOfFour);
+ properties |= static_cast<uint64_t>(LevelFormat::Compressed);
+ } else if (base.compare("structured") == 0) {
+ properties |= static_cast<uint64_t>(LevelFormat::NOutOfM);
+ properties |= nToBits(structured[0]) | mToBits(structured[1]);
} else if (base.compare("loose_compressed") == 0) {
- properties |= static_cast<uint8_t>(LevelFormat::LooseCompressed);
+ properties |= static_cast<uint64_t>(LevelFormat::LooseCompressed);
} else if (base.compare("singleton") == 0) {
- properties |= static_cast<uint8_t>(LevelFormat::Singleton);
+ properties |= static_cast<uint64_t>(LevelFormat::Singleton);
} else {
parser.emitError(loc, "unknown level format: ") << base;
return failure();
@@ -64,15 +82,15 @@ FailureOr<uint8_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
}
ParseResult LvlTypeParser::parseProperty(AsmParser &parser,
- uint8_t *properties) const {
+ uint64_t *properties) const {
StringRef strVal;
auto loc = parser.getCurrentLocation();
ERROR_IF(failed(parser.parseOptionalKeyword(&strVal)),
"expected valid level property (e.g. nonordered, nonunique or high)")
if (strVal.compare("nonunique") == 0) {
- *properties |= static_cast<uint8_t>(LevelPropertyNondefault::Nonunique);
+ *properties |= static_cast<uint64_t>(LevelPropertyNondefault::Nonunique);
} else if (strVal.compare("nonordered") == 0) {
- *properties |= static_cast<uint8_t>(LevelPropertyNondefault::Nonordered);
+ *properties |= static_cast<uint64_t>(LevelPropertyNondefault::Nonordered);
} else {
parser.emitError(loc, "unknown level property: ") << strVal;
return failure();
@@ -80,4 +98,26 @@ ParseResult LvlTypeParser::parseProperty(AsmParser &parser,
return success();
}
+ParseResult
+LvlTypeParser::parseStructured(AsmParser &parser,
+ SmallVector<unsigned> *structured) const {
+ int intVal;
+ auto loc = parser.getCurrentLocation();
+ OptionalParseResult intValParseResult = parser.parseOptionalInteger(intVal);
+ if (intValParseResult.has_value()) {
+ if (failed(*intValParseResult)) {
+ parser.emitError(loc, "failed to parse structured size");
+ return failure();
+ }
+ if (intVal < 0) {
+ parser.emitError(loc, "expected structured size to be >= 0");
+ return failure();
+ }
+ structured->push_back(intVal);
+ return success();
+ }
+ parser.emitError(loc, "expected valid integer for structured size");
+ return failure();
+}
+
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h
index 5e2f11b75d4d..1ac8254454ec 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.h
@@ -18,10 +18,12 @@ namespace ir_detail {
class LvlTypeParser {
public:
LvlTypeParser() = default;
- FailureOr<uint8_t> parseLvlType(AsmParser &parser) const;
+ FailureOr<uint64_t> parseLvlType(AsmParser &parser) const;
private:
- ParseResult parseProperty(AsmParser &parser, uint8_t *properties) const;
+ ParseResult parseProperty(AsmParser &parser, uint64_t *properties) const;
+ ParseResult parseStructured(AsmParser &parser,
+ SmallVector<unsigned> *structured) const;
};
} // namespace ir_detail
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 27125bc7ed45..aed43f26d54f 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -613,16 +613,28 @@ void SparseTensorEncodingAttr::printDimensions(
}
}
+std::string getNOutOfMString(LevelType lt) {
+ if (isNOutOfMLT(lt)) {
+ unsigned n = getN(lt);
+ unsigned m = getM(lt);
+ auto output = "[" + std::to_string(n) + ", " + std::to_string(m) + "]";
+ return output;
+ }
+ return "";
+}
+
void SparseTensorEncodingAttr::printLevels(AffineMap &map, AsmPrinter &printer,
ArrayRef<LevelType> lvlTypes) const {
for (unsigned i = 0, n = map.getNumResults() - 1; i < n; i++) {
map.getResult(i).print(printer.getStream());
- printer << " : " << toMLIRString(lvlTypes[i]) << ", ";
+ printer << " : " << toMLIRString(lvlTypes[i])
+ << getNOutOfMString(lvlTypes[i]) << ", ";
}
if (map.getNumResults() >= 1) {
auto lastIndex = map.getNumResults() - 1;
map.getResult(lastIndex).print(printer.getStream());
- printer << " : " << toMLIRString(lvlTypes[lastIndex]);
+ printer << " : " << toMLIRString(lvlTypes[lastIndex])
+ << getNOutOfMString(lvlTypes[lastIndex]);
}
}
@@ -645,6 +657,37 @@ LogicalResult SparseTensorEncodingAttr::verify(
return emitError() << "expected all singleton lvlTypes "
"following a singleton level";
}
+ // TODO: audit formats that actually are supported by backend.
+ if (auto it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isNOutOfMLT);
+ it != std::end(lvlTypes)) {
+ if (it != lvlTypes.end() - 1)
+ return emitError() << "expected n_out_of_m to be the last level type";
+ if (!std::all_of(lvlTypes.begin(), it,
+ [](LevelType i) { return isDenseLT(i); }))
+ return emitError() << "expected all dense lvlTypes "
+ "before a n_out_of_m level";
+ if (dimToLvl && (dimToLvl.getNumDims() != dimToLvl.getNumResults())) {
+ if (!isBlockSparsity(dimToLvl)) {
+ return emitError()
+ << "expected 1xm block structure for n_out_of_m level";
+ }
+ auto sizes = getBlockSize(dimToLvl);
+ unsigned coefficient = 0;
+ for (const auto &elem : sizes) {
+ if (elem != 0) {
+ if (elem != coefficient && coefficient != 0) {
+ return emitError() << "expected only one blocked level "
+ "with the same coefficients";
+ }
+ coefficient = elem;
+ }
+ }
+ if (coefficient != getM(*it)) {
+ return emitError() << "expected coefficients of Affine expressions "
+ "to be equal to m of n_out_of_m level";
+ }
+ }
+ }
// Before we can check that the level-rank is consistent/coherent
// across all fields, we need to define it. The source-of-truth for
// the `getLvlRank` method is the length of the level-types array,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 87a37a7926e9..cdee8a46f551 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -77,8 +77,8 @@ static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
// Then we insert a new kernel with given arguments into the module.
builder.setInsertionPointToStart(&gpuModule.getBodyRegion().front());
SmallVector<Type> argsTp;
- for (unsigned i = 0, e = args.size(); i < e; i++)
- argsTp.push_back(args[i].getType());
+ for (auto arg : args)
+ argsTp.push_back(arg.getType());
FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
auto gpuFunc =
builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);
@@ -451,7 +451,7 @@ static bool isAdmissibleBSR(SparseTensorType &aTp) {
/// Test for 2:4 matrix with suitable metadata.
static bool isAdmissible24(SparseTensorType &aTp) {
return aTp.getDimRank() == 2 && aTp.getLvlRank() == 3 && aTp.isDenseLvl(0) &&
- aTp.isDenseLvl(1) && aTp.is2OutOf4Lvl(2) && isAdmissibleMetaData(aTp);
+ aTp.isDenseLvl(1) && aTp.isNOutOfMLvl(2) && isAdmissibleMetaData(aTp);
}
/// Test for conversion into 2:4 matrix.
@@ -1263,7 +1263,9 @@ struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
- auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+ auto infer = [&](MapList m) {
+ return AffineMap::inferFromExprList(m, op.getContext());
+ };
AffineExpr i, j, k;
bindDims(getContext(), i, j, k);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 491501a3381b..d4459c6ea1e5 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -130,7 +130,7 @@ static void allocSchemeForRank(OpBuilder &builder, Location loc,
createPushback(builder, loc, desc, SparseTensorFieldKind::PosMemRef, lvl,
/*value=*/posZero, /*repeat=*/linear);
return;
- } else if (isSingletonLT(lt) || is2OutOf4LT(lt)) {
+ } else if (isSingletonLT(lt) || isNOutOfMLT(lt)) {
return; // nothing to do
}
// Keep compounding the size, but nothing needs to be initialized
@@ -409,7 +409,7 @@ static void genEndInsert(OpBuilder &builder, Location loc,
}
} else {
assert(isDenseLT(lt) || isLooseCompressedLT(lt) || isSingletonLT(lt) ||
- is2OutOf4LT(lt));
+ isNOutOfMLT(lt));
}
}
}
@@ -488,7 +488,7 @@ public:
}
parentPos =
genCompressed(builder, loc, desc, coords, value, parentPos, lvl);
- } else if (isSingletonLT(lt) || is2OutOf4LT(lt)) {
+ } else if (isSingletonLT(lt) || isNOutOfMLT(lt)) {
// Create:
// coordinates[lvl].push_back(coords[lvl])
// positions[lvl] = positions[lvl-1]
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 1883cf1ceed5..235c5453f9cc 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -543,14 +543,14 @@ public:
if (!op.hasPureTensorSemantics() || op.getNumDpsInputs() != 1 ||
op.getNumReductionLoops() == 0 || op.getNumResults() != 1)
return failure();
- auto inp = op.getDpsInputOperand(0);
- auto init = op.getDpsInitOperand(0);
+ auto *inp = op.getDpsInputOperand(0);
+ auto *init = op.getDpsInitOperand(0);
if (!isSparseTensor(inp))
return failure();
// Look for direct x = x OP y for semi-ring ready reductions.
- auto red = cast<linalg::YieldOp>(op.getRegion().front().getTerminator())
- .getOperand(0)
- .getDefiningOp();
+ auto *red = cast<linalg::YieldOp>(op.getRegion().front().getTerminator())
+ .getOperand(0)
+ .getDefiningOp();
if (!isa<arith::AndIOp, arith::MulIOp, arith::MulFOp, arith::MinimumFOp,
arith::MinSIOp, arith::MinUIOp, arith::MaximumFOp, arith::MaxSIOp,
arith::MaxUIOp>(red))
@@ -592,7 +592,7 @@ public:
IRMapping irMap;
irMap.map(red->getOperand(0), region->getArgument(0));
irMap.map(red->getOperand(1), region->getArgument(1));
- auto cloned = rewriter.clone(*red, irMap);
+ auto *cloned = rewriter.clone(*red, irMap);
rewriter.create<sparse_tensor::YieldOp>(loc, cloned->getResult(0));
rewriter.setInsertionPointAfter(custom);
rewriter.replaceOp(red, custom.getResult());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index ab38ab5cc3f7..8f2ae60b311f 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -891,7 +891,7 @@ static scf::IfOp genIf(CodegenEnv &env, OpBuilder &builder, LoopId curr,
assert(curr == env.merger().loop(b));
Value clause;
if (isCompressedLT(lt) || isSingletonLT(lt) ||
- isLooseCompressedLT(lt) || is2OutOf4LT(lt)) {
+ isLooseCompressedLT(lt) || isNOutOfMLT(lt)) {
assert(lvl.has_value());
const Value crd = env.emitter().getCoord(tid, *lvl);
const Value lvar = env.getLoopVar(curr);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index 1c2857d868a6..0ead135c90d3 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -94,7 +94,7 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
this->loopTag = loopTag;
this->hasOutput = hasOutput;
this->isSparseOut = isSparseOut;
- SparseIterator::setSparseEmitStrategy(emitStrategy);
+ this->emitStrategy = emitStrategy;
const unsigned numManifestTensors = ts.size();
const unsigned synTensorId = numManifestTensors;
@@ -166,13 +166,13 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
std::unique_ptr<SparseIterator>
LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t,
Level l) {
- auto it = makeSimpleIterator(*lvls[t][l]);
+ auto it = makeSimpleIterator(*lvls[t][l], emitStrategy);
auto stt = getSparseTensorType(tensors[t]);
if (stt.hasEncoding() && stt.getEncoding().isSlice()) {
Value offset = genSliceOffset(builder, loc, tensors[t], l);
Value stride = genSliceStride(builder, loc, tensors[t], l);
- auto slicedIt = makeSlicedLevelIterator(std::move(it), offset, stride,
- lvls[t][l]->getSize());
+ auto slicedIt = makeSlicedLevelIterator(
+ std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy);
return slicedIt;
}
return it;
@@ -186,7 +186,7 @@ void LoopEmitter::initializeLoopEmit(
TensorId synId = getSynTensorId();
for (unsigned i = 0, e = loopHighs.size(); i < e; i++) {
Value sz = loopHighs[i] = synSetter(builder, loc, i);
- auto [stl, it] = makeSynLevelAndIterator(sz, synId, i);
+ auto [stl, it] = makeSynLevelAndIterator(sz, synId, i, emitStrategy);
lvls[synId][i] = std::move(stl);
iters[synId][i].emplace_back(std::move(it));
}
@@ -317,12 +317,13 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) {
size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1)));
}
it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop],
- std::move(lvlIt), size, curDep.second);
+ std::move(lvlIt), size, curDep.second,
+ emitStrategy);
} else {
const SparseIterator &subSectIter = *iters[t][lvl].back();
it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent,
std::move(lvlIt), loopHighs[loop],
- curDep.second);
+ curDep.second, emitStrategy);
}
lastIter[t] = it.get();
iters[t][lvl].emplace_back(std::move(it));
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 5bab2c6a8608..7bfe713cdd9f 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -380,6 +380,7 @@ private:
/// tensor.
bool hasOutput;
bool isSparseOut;
+ SparseEmitStrategy emitStrategy;
//
// Fields which have `numTensor` many entries.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
index 04b49c320f07..c85f8204ba75 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
@@ -139,18 +139,19 @@ public:
}
};
-class TwoOutFourLevel : public SparseLevel {
+class NOutOfMLevel : public SparseLevel {
public:
- TwoOutFourLevel(unsigned tid, Level lvl, LevelType lt, Value lvlSize,
- Value crdBuffer)
+ NOutOfMLevel(unsigned tid, Level lvl, LevelType lt, Value lvlSize,
+ Value crdBuffer)
: SparseLevel(tid, lvl, lt, lvlSize, crdBuffer) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, Value p,
Value max) const override {
- assert(max == nullptr && isUnique() && "2:4 level can not be non-unique.");
- // Each 2:4 blk has exactly two specified elements.
- Value posLo = MULI(p, C_IDX(2));
- return {posLo, ADDI(posLo, C_IDX(2))};
+ assert(max == nullptr && isUnique() && "n:m level can not be non-unique.");
+ // Each n:m blk has exactly n specified elements.
+ auto n = getN(lt);
+ Value posLo = MULI(p, C_IDX(n));
+ return {posLo, ADDI(posLo, C_IDX(n))};
}
};
@@ -773,9 +774,6 @@ public:
// SparseIterator derived classes implementation.
//===----------------------------------------------------------------------===//
-SparseEmitStrategy SparseIterator::emitStrategy =
- SparseEmitStrategy::kFunctional;
-
void SparseIterator::genInit(OpBuilder &b, Location l,
const SparseIterator *p) {
if (emitStrategy == SparseEmitStrategy::kDebugInterface) {
@@ -1294,36 +1292,47 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t,
Value crd = genToCoordinates(b, l, t, lvl);
return std::make_unique<SingletonLevel>(tid, lvl, lt, sz, crd);
}
- case LevelFormat::TwoOutOfFour: {
+ case LevelFormat::NOutOfM: {
Value crd = genToCoordinates(b, l, t, lvl);
- return std::make_unique<TwoOutFourLevel>(tid, lvl, lt, sz, crd);
+ return std::make_unique<NOutOfMLevel>(tid, lvl, lt, sz, crd);
}
}
llvm_unreachable("unrecognizable level format");
}
std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
-sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl) {
+sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
+ SparseEmitStrategy strategy) {
auto stl = std::make_unique<DenseLevel>(tid, lvl, sz, /*encoded=*/false);
auto it = std::make_unique<TrivialIterator>(*stl);
+ it->setSparseEmitStrategy(strategy);
return std::make_pair(std::move(stl), std::move(it));
}
std::unique_ptr<SparseIterator>
-sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl) {
+sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl,
+ SparseEmitStrategy strategy) {
+ std::unique_ptr<SparseIterator> ret;
if (!isUniqueLT(stl.getLT())) {
// We always deduplicate the non-unique level, but we should optimize it away
// if possible.
- return std::make_unique<DedupIterator>(stl);
+ ret = std::make_unique<DedupIterator>(stl);
+ } else {
+ ret = std::make_unique<TrivialIterator>(stl);
}
- return std::make_unique<TrivialIterator>(stl);
+ ret->setSparseEmitStrategy(strategy);
+ return ret;
}
std::unique_ptr<SparseIterator>
sparse_tensor::makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit,
- Value offset, Value stride, Value size) {
+ Value offset, Value stride, Value size,
+ SparseEmitStrategy strategy) {
- return std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
+ auto ret =
+ std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
+ ret->setSparseEmitStrategy(strategy);
+ return ret;
}
static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
@@ -1335,38 +1344,42 @@ static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
std::unique_ptr<SparseIterator> sparse_tensor::makeNonEmptySubSectIterator(
OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
- std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride) {
+ std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
+ SparseEmitStrategy strategy) {
// Try unwrap the NonEmptySubSectIterator from a filter parent.
parent = tryUnwrapFilter(parent);
- auto it = std::make_unique<NonEmptySubSectIterator>(
- b, l, parent, std::move(delegate), size);
+ std::unique_ptr<SparseIterator> it =
+ std::make_unique<NonEmptySubSectIterator>(b, l, parent,
+ std::move(delegate), size);
if (stride != 1) {
// TODO: We can safely skip bound checking on sparse levels, but for dense
// iteration space, we need the bound to infer the dense loop range.
- return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
- C_IDX(stride), /*size=*/loopBound);
+ it = std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
+ C_IDX(stride), /*size=*/loopBound);
}
+ it->setSparseEmitStrategy(strategy);
return it;
}
std::unique_ptr<SparseIterator> sparse_tensor::makeTraverseSubSectIterator(
OpBuilder &b, Location l, const SparseIterator &subSectIter,
const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
- Value loopBound, unsigned stride) {
+ Value loopBound, unsigned stride, SparseEmitStrategy strategy) {
// This must be a subsection iterator or a filtered subsection iterator.
auto &subSect =
llvm::cast<NonEmptySubSectIterator>(*tryUnwrapFilter(&subSectIter));
- auto it = std::make_unique<SubSectIterator>(
+ std::unique_ptr<SparseIterator> it = std::make_unique<SubSectIterator>(
subSect, *tryUnwrapFilter(&parent), std::move(wrap));
if (stride != 1) {
- return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
- C_IDX(stride), /*size=*/loopBound);
+ it = std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
+ C_IDX(stride), /*size=*/loopBound);
}
+ it->setSparseEmitStrategy(strategy);
return it;
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
index fc2d9de66cfe..d1e94b790bea 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.h
@@ -111,8 +111,8 @@ protected:
public:
virtual ~SparseIterator() = default;
- static void setSparseEmitStrategy(SparseEmitStrategy strategy) {
- SparseIterator::emitStrategy = strategy;
+ void setSparseEmitStrategy(SparseEmitStrategy strategy) {
+ emitStrategy = strategy;
}
virtual std::string getDebugInterfacePrefix() const = 0;
@@ -248,7 +248,7 @@ protected:
return ref.take_front(cursorValsCnt);
}
- static SparseEmitStrategy emitStrategy;
+ SparseEmitStrategy emitStrategy;
public:
const IterKind kind; // For LLVM-style RTTI.
@@ -277,32 +277,34 @@ std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &builder,
/// Helper function to create a simple SparseIterator object that iterate over
/// the SparseTensorLevel.
-std::unique_ptr<SparseIterator>
-makeSimpleIterator(const SparseTensorLevel &stl);
+std::unique_ptr<SparseIterator> makeSimpleIterator(const SparseTensorLevel &stl,
+ SparseEmitStrategy strategy);
/// Helper function to create a synthetic SparseIterator object that iterate
/// over a dense space specified by [0,`sz`).
std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
-makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl);
+makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
+ SparseEmitStrategy strategy);
/// Helper function to create a SparseIterator object that iterate over a
/// sliced space, the original space (before slicing) is traversed by `sit`.
std::unique_ptr<SparseIterator>
makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit, Value offset,
- Value stride, Value size);
+ Value stride, Value size, SparseEmitStrategy strategy);
/// Helper function to create a SparseIterator object that iterate over the
/// non-empty subsections set.
std::unique_ptr<SparseIterator> makeNonEmptySubSectIterator(
OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
- std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride);
+ std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
+ SparseEmitStrategy strategy);
/// Helper function to create a SparseIterator object that iterate over a
/// non-empty subsection created by NonEmptySubSectIterator.
std::unique_ptr<SparseIterator> makeTraverseSubSectIterator(
OpBuilder &b, Location l, const SparseIterator &subsectIter,
const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
- Value loopBound, unsigned stride);
+ Value loopBound, unsigned stride, SparseEmitStrategy strategy);
} // namespace sparse_tensor
} // namespace mlir
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 6cdf5f8c0168..96537cbb0c48 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -489,7 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
const auto lt = getLvlType(b);
if (!isCompressedLT(lt) && !isSingletonLT(lt) &&
- !isLooseCompressedLT(lt) && !is2OutOf4LT(lt)) {
+ !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) {
if (reset)
simple.reset(b);
reset = true;
@@ -670,7 +670,7 @@ bool Merger::hasAnySparse(const BitVector &bits) const {
for (TensorLoopId b : bits.set_bits()) {
const auto lt = getLvlType(b);
if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) ||
- is2OutOf4LT(lt))
+ isNOutOfMLT(lt))
return true;
}
return hasSparseIdxReduction(bits);
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index b21e89ae3a57..8298cf102e28 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -2663,8 +2663,8 @@ struct InsertSliceOpSourceCastInserter final
if (!hasValidSizesOffsets(newSrcShape))
return failure();
- RankedTensorType newSrcType =
- RankedTensorType::get(newSrcShape, srcType.getElementType());
+ RankedTensorType newSrcType = RankedTensorType::get(
+ newSrcShape, srcType.getElementType(), srcType.getEncoding());
if (srcType == newSrcType ||
!preservesStaticInformation(srcType, newSrcType) ||
!tensor::CastOp::areCastCompatible(srcType, newSrcType))
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 678b7c099fa3..957f6314f358 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -473,14 +473,14 @@ struct FromElementsOpInterface
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto fromElementsOp = cast<tensor::FromElementsOp>(op);
+ auto tensorType = cast<RankedTensorType>(fromElementsOp.getType());
// TODO: Implement memory space for this op.
- if (options.defaultMemorySpace != Attribute())
+ if (options.defaultMemorySpaceFn(tensorType) != Attribute())
return op->emitError("memory space not implemented yet");
// Allocate a buffer for the result.
Location loc = op->getLoc();
- auto tensorType = cast<RankedTensorType>(fromElementsOp.getType());
auto shape = tensorType.getShape();
// TODO: Create alloc_tensor ops during TensorCopyInsertion.
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
@@ -588,8 +588,10 @@ struct GenerateOpInterface
const BufferizationOptions &options) const {
auto generateOp = cast<tensor::GenerateOp>(op);
+ auto type = generateOp.getResult().getType();
+
// TODO: Implement memory space for this op.
- if (options.defaultMemorySpace != Attribute())
+ if (options.defaultMemorySpaceFn(type) != Attribute())
return op->emitError("memory space not implemented yet");
// Allocate memory.
@@ -1007,10 +1009,6 @@ struct SplatOpInterface
OpBuilder::InsertionGuard g(rewriter);
auto splatOp = cast<tensor::SplatOp>(op);
- // TODO: Implement memory space for this op.
- if (options.defaultMemorySpace != Attribute())
- return op->emitError("memory space not implemented yet");
-
// Allocate memory.
Location loc = op->getLoc();
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
@@ -1021,6 +1019,11 @@ struct SplatOpInterface
// Create linalg::MapOp.
auto tensorType = cast<RankedTensorType>(tensorAlloc->getType());
+
+ // TODO: Implement memory space for this op.
+ if (options.defaultMemorySpaceFn(tensorType) != Attribute())
+ return op->emitError("memory space not implemented yet");
+
auto linalgOp =
rewriter.create<linalg::MapOp>(loc, tensorType, /*inputs=*/ValueRange(),
/*init=*/*tensorAlloc);
diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
index c875519945b9..5073234a7e35 100644
--- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/Passes.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
@@ -64,6 +65,20 @@ public:
transform::detail::getPreloadedTransformModule(context);
Operation *payloadRoot =
findPayloadRoot(getOperation(), debugPayloadRootTag);
+ if (!payloadRoot)
+ return signalPassFailure();
+ auto debugBindNames = llvm::map_to_vector(
+ debugBindTrailingArgs,
+ [&](const std::string &name) { return OperationName(name, context); });
+ SmallVector<SmallVector<Operation *>, 2> trailingBindings;
+ trailingBindings.resize(debugBindNames.size());
+ payloadRoot->walk([&](Operation *payload) {
+ for (auto &&[position, name] : llvm::enumerate(debugBindNames)) {
+ if (payload->getName() == name)
+ trailingBindings[position].push_back(payload);
+ }
+ });
+
Operation *transformEntryPoint = transform::detail::findTransformEntryPoint(
getOperation(), transformModule, entryPoint);
if (!transformEntryPoint) {
@@ -73,8 +88,15 @@ public:
return signalPassFailure();
}
+ RaggedArray<transform::MappedValue> bindings;
+ bindings.push_back(ArrayRef<Operation *>{payloadRoot});
+ for (SmallVector<Operation *> &trailing : trailingBindings)
+ bindings.push_back(std::move(trailing));
+
if (failed(transform::applyTransformNamedSequence(
- payloadRoot, transformEntryPoint, transformModule,
+ bindings,
+ cast<transform::TransformOpInterface>(transformEntryPoint),
+ transformModule,
options.enableExpensiveChecks(!disableExpensiveChecks)))) {
return signalPassFailure();
}
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
index 2f74b76f07b7..8a9cd7c52d82 100644
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
@@ -191,22 +191,46 @@ LogicalResult transform::detail::assembleTransformLibraryFromPaths(
LogicalResult transform::applyTransformNamedSequence(
Operation *payload, Operation *transformRoot, ModuleOp transformModule,
const TransformOptions &options) {
+ RaggedArray<MappedValue> bindings;
+ bindings.push_back(ArrayRef<Operation *>{payload});
+ return applyTransformNamedSequence(bindings,
+ cast<TransformOpInterface>(transformRoot),
+ transformModule, options);
+}
+
+LogicalResult transform::applyTransformNamedSequence(
+ RaggedArray<MappedValue> bindings, TransformOpInterface transformRoot,
+ ModuleOp transformModule, const TransformOptions &options) {
+ if (bindings.empty()) {
+ return transformRoot.emitError()
+ << "expected at least one binding for the root";
+ }
+ if (bindings.at(0).size() != 1) {
+ return transformRoot.emitError()
+ << "expected one payload to be bound to the first argument, got "
+ << bindings.at(0).size();
+ }
+ auto *payloadRoot = bindings.at(0).front().dyn_cast<Operation *>();
+ if (!payloadRoot) {
+ return transformRoot->emitError() << "expected the object bound to the "
+ "first argument to be an operation";
+ }
+
+ bindings.removeFront();
+
// `transformModule` may not be modified.
if (transformModule && !transformModule->isAncestor(transformRoot)) {
OwningOpRef<Operation *> clonedTransformModule(transformModule->clone());
if (failed(detail::mergeSymbolsInto(
SymbolTable::getNearestSymbolTable(transformRoot),
std::move(clonedTransformModule)))) {
- return payload->emitError() << "failed to merge symbols";
+ return payloadRoot->emitError() << "failed to merge symbols";
}
}
LLVM_DEBUG(DBGS() << "Apply\n" << *transformRoot << "\n");
- LLVM_DEBUG(DBGS() << "To\n" << *payload << "\n");
+ LLVM_DEBUG(DBGS() << "To\n" << *payloadRoot << "\n");
- // Apply the transform to the IR, do not enforce top-level constraints.
- RaggedArray<MappedValue> noExtraMappings;
- return applyTransforms(payload, cast<TransformOpInterface>(transformRoot),
- noExtraMappings, options,
+ return applyTransforms(payloadRoot, transformRoot, bindings, options,
/*enforceToplevelTransformOp=*/false);
}
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 452354413e88..5be6a628904c 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -675,9 +675,10 @@ void vector::ContractionOp::build(OpBuilder &builder, OperationState &result,
ArrayRef<IteratorType> iteratorTypes) {
result.addOperands({lhs, rhs, acc});
result.addTypes(acc.getType());
- result.addAttribute(getIndexingMapsAttrName(result.name),
- builder.getAffineMapArrayAttr(
- AffineMap::inferFromExprList(indexingExprs)));
+ result.addAttribute(
+ getIndexingMapsAttrName(result.name),
+ builder.getAffineMapArrayAttr(
+ AffineMap::inferFromExprList(indexingExprs, builder.getContext())));
result.addAttribute(
getIteratorTypesAttrName(result.name),
builder.getArrayAttr(llvm::to_vector(llvm::map_range(
diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
index daf28882976e..adf961ff935f 100644
--- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -16,6 +16,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
VectorEmulateMaskedLoadStore.cpp
VectorEmulateNarrowType.cpp
VectorInsertExtractStridedSliceRewritePatterns.cpp
+ VectorLinearize.cpp
VectorTransferOpTransforms.cpp
VectorTransferSplitRewritePatterns.cpp
VectorTransforms.cpp
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
index 446eb853d2e9..0eaf9f71a37d 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
@@ -695,7 +695,9 @@ ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op,
Value lhs = op.getLhs(), rhs = op.getRhs();
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
- auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+ auto infer = [&](MapList m) {
+ return AffineMap::inferFromExprList(m, op.getContext());
+ };
AffineExpr m, n, k;
bindDims(rewriter.getContext(), m, n, k);
SmallVector<AffineMap> maps = op.getIndexingMapsArray();
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
index e1ed5d81625d..74382b027c2f 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
@@ -73,7 +73,8 @@ struct CastAwayExtractStridedSliceLeadingOneDim
VectorType oldDstType = extractOp.getType();
VectorType newDstType =
VectorType::get(oldDstType.getShape().drop_front(dropCount),
- oldDstType.getElementType());
+ oldDstType.getElementType(),
+ oldDstType.getScalableDims().drop_front(dropCount));
Location loc = extractOp.getLoc();
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
new file mode 100644
index 000000000000..c53520439555
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -0,0 +1,97 @@
+//===- VectorLinearize.cpp - vector linearization transforms --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements patterns and a pass for linearizing ND vectors into 1D.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+using namespace mlir;
+
+namespace {
+struct LinearizeConstant final : OpConversionPattern<arith::ConstantOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(arith::ConstantOp constOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = constOp.getLoc();
+ auto resType =
+ getTypeConverter()->convertType<VectorType>(constOp.getType());
+ if (!resType)
+ return rewriter.notifyMatchFailure(loc, "can't convert return type");
+
+ auto dstElementsAttr = dyn_cast<DenseElementsAttr>(constOp.getValue());
+ if (!dstElementsAttr)
+ return rewriter.notifyMatchFailure(loc, "unsupported attr type");
+
+ dstElementsAttr = dstElementsAttr.reshape(resType);
+ rewriter.replaceOpWithNewOp<arith::ConstantOp>(constOp, resType,
+ dstElementsAttr);
+ return success();
+ }
+};
+
+struct LinearizeVectorizable final
+ : OpTraitConversionPattern<OpTrait::Vectorizable> {
+ using OpTraitConversionPattern::OpTraitConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const override {
+ FailureOr<Operation *> newOp =
+ convertOpResultTypes(op, operands, *getTypeConverter(), rewriter);
+ if (failed(newOp))
+ return failure();
+
+ rewriter.replaceOp(op, (*newOp)->getResults());
+ return success();
+ }
+};
+} // namespace
+
+void mlir::vector::populateVectorLinearizeTypeConversionsAndLegality(
+ TypeConverter &typeConverter, RewritePatternSet &patterns,
+ ConversionTarget &target) {
+ typeConverter.addConversion([](VectorType type) -> std::optional<Type> {
+ // Ignore scalable vectors for now.
+ if (type.getRank() <= 1 || type.isScalable())
+ return type;
+
+ return VectorType::get(type.getNumElements(), type.getElementType());
+ });
+
+ auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
+ Location loc) -> Value {
+ if (inputs.size() != 1 || !isa<VectorType>(inputs.front().getType()) ||
+ !isa<VectorType>(type))
+ return nullptr;
+
+ return builder.create<vector::ShapeCastOp>(loc, type, inputs.front());
+ };
+ typeConverter.addArgumentMaterialization(materializeCast);
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
+
+ target.markUnknownOpDynamicallyLegal(
+ [&](Operation *op) -> std::optional<bool> {
+ if (isa<arith::ConstantOp>(op) || op->hasTrait<OpTrait::Vectorizable>())
+ return typeConverter.isLegal(op);
+
+ return std::nullopt;
+ });
+
+ patterns.add<LinearizeConstant, LinearizeVectorizable>(typeConverter,
+ patterns.getContext());
+}
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
index f1a27168bd4e..b844c2bfa837 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp
@@ -209,7 +209,7 @@ createSubViewIntersection(RewriterBase &b, VectorTransferOpInterface xferOp,
AffineExpr i, j, k;
bindDims(xferOp.getContext(), i, j, k);
SmallVector<AffineMap, 4> maps =
- AffineMap::inferFromExprList(MapList{{i - j, k}});
+ AffineMap::inferFromExprList(MapList{{i - j, k}}, b.getContext());
// affine_min(%dimMemRef - %index, %dimAlloc)
Value affineMin = b.create<affine::AffineMinOp>(
loc, index.getType(), maps[0], ValueRange{dimMemRef, index, dimAlloc});
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 4034dc40685a..53ae138d1e43 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -160,8 +160,9 @@ struct MultiReduceToContract
iteratorTypes.push_back(vector::IteratorType::reduction);
}
}
- auto dstMap = AffineMap::get(/*dimCount=*/reductionMask.size(),
- /*symCount=*/0, exprs, reduceOp.getContext());
+ auto dstMap =
+ AffineMap::get(/*dimCount=*/reductionMask.size(),
+ /*symbolCount=*/0, exprs, reduceOp.getContext());
rewriter.replaceOpWithNewOp<mlir::vector::ContractionOp>(
reduceOp, mulOp->getOperand(0), mulOp->getOperand(1), reduceOp.getAcc(),
rewriter.getAffineMapArrayAttr({srcMap, srcMap, dstMap}),
@@ -1399,7 +1400,9 @@ struct CanonicalizeContractMatmulToMMT final
// Set up the parallel/reduction structure in right form.
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
- auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+ auto infer = [&](MapList m) {
+ return AffineMap::inferFromExprList(m, op.getContext());
+ };
AffineExpr m;
AffineExpr n;
AffineExpr k;
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
index 0c7b3a228a65..9e8b240899d8 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
@@ -45,7 +45,7 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT
for (uint64_t l = 0; l < lvlRank; l++) {
assert(lvlSizes[l] > 0 && "Level size zero has trivial storage");
assert(isDenseLvl(l) || isCompressedLvl(l) || isLooseCompressedLvl(l) ||
- isSingletonLvl(l) || is2OutOf4Lvl(l));
+ isSingletonLvl(l) || isNOutOfMLvl(l));
}
}
diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp
index c28046266359..4aa0d4f34a09 100644
--- a/mlir/lib/IR/AffineMap.cpp
+++ b/mlir/lib/IR/AffineMap.cpp
@@ -272,12 +272,16 @@ AffineMap AffineMap::getMultiDimMapWithTargets(unsigned numDims,
return result;
}
+/// Creates an affine map for each list of AffineExpr's in `exprsList`
+/// while inferring the right number of dimensional and symbolic inputs needed
+/// based on the maximum dimensional and symbolic identifier appearing in the
+/// expressions.
template <typename AffineExprContainer>
static SmallVector<AffineMap, 4>
-inferFromExprList(ArrayRef<AffineExprContainer> exprsList) {
- assert(!exprsList.empty());
- assert(!exprsList[0].empty());
- auto context = exprsList[0][0].getContext();
+inferFromExprList(ArrayRef<AffineExprContainer> exprsList,
+ MLIRContext *context) {
+ if (exprsList.empty())
+ return {};
int64_t maxDim = -1, maxSym = -1;
getMaxDimAndSymbol(exprsList, maxDim, maxSym);
SmallVector<AffineMap, 4> maps;
@@ -289,13 +293,15 @@ inferFromExprList(ArrayRef<AffineExprContainer> exprsList) {
}
SmallVector<AffineMap, 4>
-AffineMap::inferFromExprList(ArrayRef<ArrayRef<AffineExpr>> exprsList) {
- return ::inferFromExprList(exprsList);
+AffineMap::inferFromExprList(ArrayRef<ArrayRef<AffineExpr>> exprsList,
+ MLIRContext *context) {
+ return ::inferFromExprList(exprsList, context);
}
SmallVector<AffineMap, 4>
-AffineMap::inferFromExprList(ArrayRef<SmallVector<AffineExpr, 4>> exprsList) {
- return ::inferFromExprList(exprsList);
+AffineMap::inferFromExprList(ArrayRef<SmallVector<AffineExpr, 4>> exprsList,
+ MLIRContext *context) {
+ return ::inferFromExprList(exprsList, context);
}
uint64_t AffineMap::getLargestKnownDivisorOfMapExprs() {
@@ -521,7 +527,7 @@ AffineMap::replace(const DenseMap<AffineExpr, AffineExpr> &map) const {
newResults.reserve(getNumResults());
for (AffineExpr e : getResults())
newResults.push_back(e.replace(map));
- return AffineMap::inferFromExprList(newResults).front();
+ return AffineMap::inferFromExprList(newResults, getContext()).front();
}
AffineMap AffineMap::dropResults(const llvm::SmallBitVector &positions) const {
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index 6b8b7473bf0f..6bed90936465 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -74,11 +74,9 @@ MLIRContext *AsmParser::getContext() const { return getBuilder().getContext(); }
/// Parse a type list.
/// This is out-of-line to work-around https://github.com/llvm/llvm-project/issues/62918
ParseResult AsmParser::parseTypeList(SmallVectorImpl<Type> &result) {
- return parseCommaSeparatedList(
- [&]() { return parseType(result.emplace_back()); });
- }
-
-
+ return parseCommaSeparatedList(
+ [&]() { return parseType(result.emplace_back()); });
+}
//===----------------------------------------------------------------------===//
// DialectAsmPrinter
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index 9b8ee3d45280..1794b38478a7 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -921,7 +921,7 @@ AffineExpr mlir::makeCanonicalStridedLayoutExpr(ArrayRef<int64_t> sizes,
return getAffineConstantExpr(0, context);
assert(!exprs.empty() && "expected exprs");
- auto maps = AffineMap::inferFromExprList(exprs);
+ auto maps = AffineMap::inferFromExprList(exprs, context);
assert(!maps.empty() && "Expected one non-empty map");
unsigned numDims = maps[0].getNumDims(), nSymbols = maps[0].getNumSymbols();
diff --git a/mlir/lib/Target/LLVMIR/AttrKindDetail.h b/mlir/lib/Target/LLVMIR/AttrKindDetail.h
index 7f81777886f5..b01858ea8143 100644
--- a/mlir/lib/Target/LLVMIR/AttrKindDetail.h
+++ b/mlir/lib/Target/LLVMIR/AttrKindDetail.h
@@ -59,6 +59,20 @@ getAttrKindToNameMapping() {
return kindNamePairs;
}
+/// Returns a dense map from LLVM attribute names to their kinds in the LLVM
+/// IR dialect.
+static llvm::DenseMap<llvm::StringRef, llvm::Attribute::AttrKind>
+getAttrNameToKindMapping() {
+ static auto attrNameToKindMapping = []() {
+ llvm::DenseMap<llvm::StringRef, llvm::Attribute::AttrKind> nameKindMap;
+ for (auto kindNamePair : getAttrKindToNameMapping()) {
+ nameKindMap.insert({kindNamePair.second, kindNamePair.first});
+ }
+ return nameKindMap;
+ }();
+ return attrNameToKindMapping;
+}
+
} // namespace detail
} // namespace LLVM
} // namespace mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 45eb8402a734..ea9fe2635461 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -201,6 +201,62 @@ public:
}
return success();
}
+
+ LogicalResult
+ convertParameterAttr(LLVMFuncOp funcOp, int argIdx, NamedAttribute attribute,
+ LLVM::ModuleTranslation &moduleTranslation) const final {
+
+ llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
+ llvm::Function *llvmFunc =
+ moduleTranslation.lookupFunction(funcOp.getName());
+ llvm::NamedMDNode *nvvmAnnotations =
+ moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations");
+
+ if (attribute.getName() == NVVM::NVVMDialect::getGridConstantAttrName()) {
+ llvm::MDNode *gridConstantMetaData = nullptr;
+
+ // Check if a 'grid_constant' metadata node exists for the given function
+ for (llvm::MDNode *opnd : llvm::reverse(nvvmAnnotations->operands())) {
+ if (opnd->getNumOperands() == 3 &&
+ opnd->getOperand(0) == llvm::ValueAsMetadata::get(llvmFunc) &&
+ opnd->getOperand(1) ==
+ llvm::MDString::get(llvmContext, "grid_constant")) {
+ gridConstantMetaData = opnd;
+ break;
+ }
+ }
+
+ // 'grid_constant' is a function-level metadata node with a list of
+ // integers, where each integer n denotes that the nth parameter has the
+ // grid_constant annotation (numbering from 1). This requires aggregating
+ // the indices of the individual parameters that have this attribute.
+ llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
+ if (gridConstantMetaData == nullptr) {
+ // Create a new 'grid_constant' metadata node
+ SmallVector<llvm::Metadata *> gridConstMetadata = {
+ llvm::ValueAsMetadata::getConstant(
+ llvm::ConstantInt::get(i32, argIdx + 1))};
+ llvm::Metadata *llvmMetadata[] = {
+ llvm::ValueAsMetadata::get(llvmFunc),
+ llvm::MDString::get(llvmContext, "grid_constant"),
+ llvm::MDNode::get(llvmContext, gridConstMetadata)};
+ llvm::MDNode *llvmMetadataNode =
+ llvm::MDNode::get(llvmContext, llvmMetadata);
+ nvvmAnnotations->addOperand(llvmMetadataNode);
+ } else {
+ // Append argIdx + 1 to the 'grid_constant' argument list
+ if (auto argList =
+ dyn_cast<llvm::MDTuple>(gridConstantMetaData->getOperand(2))) {
+ llvm::TempMDTuple clonedArgList = argList->clone();
+ clonedArgList->push_back((llvm::ValueAsMetadata::getConstant(
+ llvm::ConstantInt::get(i32, argIdx + 1))));
+ gridConstantMetaData->replaceOperandWith(
+ 2, llvm::MDNode::replaceWithUniqued(std::move(clonedArgList)));
+ }
+ }
+ }
+ return success();
+ }
};
} // namespace
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 79956f82ed14..c87e895bb540 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1018,9 +1018,19 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
// Allocate reduction vars
SmallVector<llvm::Value *> privateReductionVariables;
DenseMap<Value, llvm::Value *> reductionVariableMap;
- allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
- reductionDecls, privateReductionVariables,
- reductionVariableMap);
+ {
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ builder.restoreIP(allocaIP);
+ auto args = opInst.getRegion().getArguments();
+
+ for (std::size_t i = 0; i < opInst.getNumReductionVars(); ++i) {
+ llvm::Value *var = builder.CreateAlloca(
+ moduleTranslation.convertType(reductionDecls[i].getType()));
+ moduleTranslation.mapValue(args[i], var);
+ privateReductionVariables.push_back(var);
+ reductionVariableMap.try_emplace(opInst.getReductionVars()[i], var);
+ }
+ }
// Store the mapping between reduction variables and their private copies on
// ModuleTranslation stack. It can be then recovered when translating
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index a54221580b28..ee8fffd959c8 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1298,28 +1298,29 @@ static void convertFunctionAttributes(LLVMFuncOp func,
llvmFunc->setMemoryEffects(newMemEffects);
}
-llvm::AttrBuilder
-ModuleTranslation::convertParameterAttrs(DictionaryAttr paramAttrs) {
+FailureOr<llvm::AttrBuilder>
+ModuleTranslation::convertParameterAttrs(LLVMFuncOp func, int argIdx,
+ DictionaryAttr paramAttrs) {
llvm::AttrBuilder attrBuilder(llvmModule->getContext());
-
- for (auto [llvmKind, mlirName] : getAttrKindToNameMapping()) {
- Attribute attr = paramAttrs.get(mlirName);
- // Skip attributes that are not present.
- if (!attr)
- continue;
-
- // NOTE: C++17 does not support capturing structured bindings.
- llvm::Attribute::AttrKind llvmKindCap = llvmKind;
-
- llvm::TypeSwitch<Attribute>(attr)
- .Case<TypeAttr>([&](auto typeAttr) {
- attrBuilder.addTypeAttr(llvmKindCap,
- convertType(typeAttr.getValue()));
- })
- .Case<IntegerAttr>([&](auto intAttr) {
- attrBuilder.addRawIntAttr(llvmKindCap, intAttr.getInt());
- })
- .Case<UnitAttr>([&](auto) { attrBuilder.addAttribute(llvmKindCap); });
+ auto attrNameToKindMapping = getAttrNameToKindMapping();
+
+ for (auto namedAttr : paramAttrs) {
+ auto it = attrNameToKindMapping.find(namedAttr.getName());
+ if (it != attrNameToKindMapping.end()) {
+ llvm::Attribute::AttrKind llvmKind = it->second;
+
+ llvm::TypeSwitch<Attribute>(namedAttr.getValue())
+ .Case<TypeAttr>([&](auto typeAttr) {
+ attrBuilder.addTypeAttr(llvmKind, convertType(typeAttr.getValue()));
+ })
+ .Case<IntegerAttr>([&](auto intAttr) {
+ attrBuilder.addRawIntAttr(llvmKind, intAttr.getInt());
+ })
+ .Case<UnitAttr>([&](auto) { attrBuilder.addAttribute(llvmKind); });
+ } else if (namedAttr.getNameDialect()) {
+ if (failed(iface.convertParameterAttr(func, argIdx, namedAttr, *this)))
+ return failure();
+ }
}
return attrBuilder;
@@ -1348,14 +1349,21 @@ LogicalResult ModuleTranslation::convertFunctionSignatures() {
// Convert result attributes.
if (ArrayAttr allResultAttrs = function.getAllResultAttrs()) {
DictionaryAttr resultAttrs = cast<DictionaryAttr>(allResultAttrs[0]);
- llvmFunc->addRetAttrs(convertParameterAttrs(resultAttrs));
+ FailureOr<llvm::AttrBuilder> attrBuilder =
+ convertParameterAttrs(function, -1, resultAttrs);
+ if (failed(attrBuilder))
+ return failure();
+ llvmFunc->addRetAttrs(*attrBuilder);
}
// Convert argument attributes.
for (auto [argIdx, llvmArg] : llvm::enumerate(llvmFunc->args())) {
if (DictionaryAttr argAttrs = function.getArgAttrDict(argIdx)) {
- llvm::AttrBuilder attrBuilder = convertParameterAttrs(argAttrs);
- llvmArg.addAttrs(attrBuilder);
+ FailureOr<llvm::AttrBuilder> attrBuilder =
+ convertParameterAttrs(function, argIdx, argAttrs);
+ if (failed(attrBuilder))
+ return failure();
+ llvmArg.addAttrs(*attrBuilder);
}
}
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index e90447084d68..a5a77e00fbfb 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -3130,6 +3130,27 @@ struct AnyFunctionOpInterfaceSignatureConversion
};
} // namespace
+FailureOr<Operation *>
+mlir::convertOpResultTypes(Operation *op, ValueRange operands,
+ const TypeConverter &converter,
+ ConversionPatternRewriter &rewriter) {
+ assert(op && "Invalid op");
+ Location loc = op->getLoc();
+ if (converter.isLegal(op))
+ return rewriter.notifyMatchFailure(loc, "op already legal");
+
+ OperationState newOp(loc, op->getName());
+ newOp.addOperands(operands);
+
+ SmallVector<Type> newResultTypes;
+ if (failed(converter.convertTypes(op->getResultTypes(), newResultTypes)))
+ return rewriter.notifyMatchFailure(loc, "couldn't convert return types");
+
+ newOp.addTypes(newResultTypes);
+ newOp.addAttributes(op->getAttrs());
+ return rewriter.create(newOp);
+}
+
void mlir::populateFunctionOpInterfaceTypeConversionPattern(
StringRef functionLikeOpName, RewritePatternSet &patterns,
const TypeConverter &converter) {
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index 57a85990f9bc..344abb64a57d 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -985,6 +985,7 @@ class Context:
def _get_context_again(self) -> Context: ...
def _get_live_module_count(self) -> int: ...
def _get_live_operation_count(self) -> int: ...
+ def _get_live_operation_objects(self) -> List[Operation]: ...
def append_dialect_registry(self, registry: DialectRegistry) -> None: ...
def attach_diagnostic_handler(
self, callback: Callable[[Diagnostic], bool]
diff --git a/mlir/python/mlir/dialects/transform/loop.py b/mlir/python/mlir/dialects/transform/loop.py
index 3bdd9ca3b22f..c4770b1c4067 100644
--- a/mlir/python/mlir/dialects/transform/loop.py
+++ b/mlir/python/mlir/dialects/transform/loop.py
@@ -55,6 +55,7 @@ class LoopPeelOp(LoopPeelOp):
remainder_loop_type: Type,
target: Union[Operation, Value],
*,
+ peel_front: Union[bool, BoolAttr] = False,
fail_if_already_divisible: Union[bool, BoolAttr] = False,
ip=None,
loc=None,
@@ -63,6 +64,11 @@ class LoopPeelOp(LoopPeelOp):
main_loop_type,
remainder_loop_type,
_get_op_result_or_value(target),
+ peel_front=(
+ peel_front
+ if isinstance(peel_front, BoolAttr)
+ else BoolAttr.get(peel_front)
+ ),
fail_if_already_divisible=(
fail_if_already_divisible
if isinstance(fail_if_already_divisible, BoolAttr)
diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c
index 2c6ad559f19d..a8b9f9048d59 100644
--- a/mlir/test/CAPI/sparse_tensor.c
+++ b/mlir/test/CAPI/sparse_tensor.c
@@ -38,9 +38,9 @@ static int testRoundtripEncoding(MlirContext ctx) {
mlirSparseTensorEncodingAttrGetDimToLvl(originalAttr);
// CHECK: (d0, d1)[s0] -> (s0, d0, d1)
mlirAffineMapDump(dimToLvl);
- // CHECK: level_type: 4
- // CHECK: level_type: 8
- // CHECK: level_type: 8
+ // CHECK: level_type: 65536
+ // CHECK: level_type: 131072
+ // CHECK: level_type: 131072
MlirAffineMap lvlToDim =
mlirSparseTensorEncodingAttrGetLvlToDim(originalAttr);
int lvlRank = mlirSparseTensorEncodingGetLvlRank(originalAttr);
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index 8fa29ea43854..1fe843b1447a 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -7,13 +7,30 @@ func.func @complex_abs(%arg: complex<f32>) -> f32 {
%abs = complex.abs %arg: complex<f32>
return %abs : f32
}
+
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK-DAG: %[[REAL_SQ:.*]] = arith.mulf %[[REAL]], %[[REAL]] : f32
-// CHECK-DAG: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[REAL_SQ]], %[[IMAG_SQ]] : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
-// CHECK: return %[[NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32
+// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32
+// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] : f32
+// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32
+// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32
+// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32
+// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32
+// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] : f32
+// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32
+// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32
+// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32
+// CHECK: %[[ABS3:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32
+// CHECK: return %[[ABS3]] : f32
// -----
@@ -241,12 +258,28 @@ func.func @complex_log(%arg: complex<f32>) -> complex<f32> {
%log = complex.log %arg: complex<f32>
return %log : complex<f32>
}
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL]], %[[REAL]] : f32
-// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32
+// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32
+// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] : f32
+// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32
+// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32
+// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32
+// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32
+// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] : f32
+// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32
+// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32
+// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32
+// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32
// CHECK: %[[RESULT_REAL:.*]] = math.log %[[NORM]] : f32
// CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex<f32>
@@ -469,12 +502,28 @@ func.func @complex_sign(%arg: complex<f32>) -> complex<f32> {
// CHECK: %[[REAL_IS_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
// CHECK: %[[IMAG_IS_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
// CHECK: %[[IS_ZERO:.*]] = arith.andi %[[REAL_IS_ZERO]], %[[IMAG_IS_ZERO]] : i1
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
// CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL2]], %[[REAL2]] : f32
-// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG2]], %[[IMAG2]] : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL2]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG2]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG2]], %[[REAL2]] : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] : f32
+// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] : f32
+// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL2]] : f32
+// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] : f32
+// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL2]], %[[IMAG2]] : f32
+// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32
+// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] : f32
+// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] : f32
+// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG2]] : f32
+// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] : f32
+// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL2]], %[[IMAG2]] : f32
+// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32
+// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32
+// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32
// CHECK: %[[REAL_SIGN:.*]] = arith.divf %[[REAL]], %[[NORM]] : f32
// CHECK: %[[IMAG_SIGN:.*]] = arith.divf %[[IMAG]], %[[NORM]] : f32
// CHECK: %[[SIGN:.*]] = complex.create %[[REAL_SIGN]], %[[IMAG_SIGN]] : complex<f32>
@@ -716,13 +765,29 @@ func.func @complex_abs_with_fmf(%arg: complex<f32>) -> f32 {
%abs = complex.abs %arg fastmath<nnan,contract> : complex<f32>
return %abs : f32
}
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK-DAG: %[[REAL_SQ:.*]] = arith.mulf %[[REAL]], %[[REAL]] fastmath<nnan,contract> : f32
-// CHECK-DAG: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[REAL_SQ]], %[[IMAG_SQ]] fastmath<nnan,contract> : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
-// CHECK: return %[[NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32
+// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32
+// CHECK: %[[ABS3:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32
+// CHECK: return %[[ABS3]] : f32
// -----
@@ -807,12 +872,28 @@ func.func @complex_log_with_fmf(%arg: complex<f32>) -> complex<f32> {
%log = complex.log %arg fastmath<nnan,contract> : complex<f32>
return %log : complex<f32>
}
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
-// CHECK: %[[SQR_REAL:.*]] = arith.mulf %[[REAL]], %[[REAL]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQR_IMAG:.*]] = arith.mulf %[[IMAG]], %[[IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQ_NORM:.*]] = arith.addf %[[SQR_REAL]], %[[SQR_IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[NORM:.*]] = math.sqrt %[[SQ_NORM]] : f32
+// CHECK: %[[IS_REAL_ZERO:.*]] = arith.cmpf oeq, %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_DIV_REAL:.*]] = arith.divf %[[IMAG]], %[[REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_SQ:.*]] = arith.mulf %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = arith.addf %[[IMAG_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_SQRT:.*]] = math.sqrt %[[IMAG_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_ABS:.*]] = math.absf %[[REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[ABS_IMAG:.*]] = arith.mulf %[[IMAG_SQRT]], %[[REAL_ABS]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_DIV_IMAG:.*]] = arith.divf %[[REAL]], %[[IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_SQ:.*]] = arith.mulf %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = arith.addf %[[REAL_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_SQRT:.*]] = math.sqrt %[[REAL_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_ABS:.*]] = math.absf %[[IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[ABS_REAL:.*]] = arith.mulf %[[REAL_SQRT]], %[[IMAG_ABS]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_GT_IMAG:.*]] = arith.cmpf ogt, %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[ABS1:.*]] = arith.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : f32
+// CHECK: %[[ABS2:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[REAL_ABS]], %[[ABS1]] : f32
+// CHECK: %[[NORM:.*]] = arith.select %[[IS_REAL_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : f32
// CHECK: %[[RESULT_REAL:.*]] = math.log %[[NORM]] fastmath<nnan,contract> : f32
// CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG2:.*]] = complex.im %[[ARG]] : complex<f32>
diff --git a/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir b/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir
index 9983dd46f094..0f23e20167f4 100644
--- a/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/full-conversion.mlir
@@ -6,12 +6,31 @@ func.func @complex_abs(%arg: complex<f32>) -> f32 {
%abs = complex.abs %arg: complex<f32>
return %abs : f32
}
+// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32
+// CHECK: %[[ONE:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
// CHECK: %[[REAL:.*]] = llvm.extractvalue %[[ARG]][0] : ![[C_TY]]
// CHECK: %[[IMAG:.*]] = llvm.extractvalue %[[ARG]][1] : ![[C_TY]]
-// CHECK-DAG: %[[REAL_SQ:.*]] = llvm.fmul %[[REAL]], %[[REAL]] : f32
-// CHECK-DAG: %[[IMAG_SQ:.*]] = llvm.fmul %[[IMAG]], %[[IMAG]] : f32
-// CHECK: %[[SQ_NORM:.*]] = llvm.fadd %[[REAL_SQ]], %[[IMAG_SQ]] : f32
-// CHECK: %[[NORM:.*]] = llvm.intr.sqrt(%[[SQ_NORM]]) : (f32) -> f32
+// CHECK: %[[REAL_IS_ZERO:.*]] = llvm.fcmp "oeq" %[[REAL]], %[[ZERO]] : f32
+// CHECK: %[[IMAG_IS_ZERO:.*]] = llvm.fcmp "oeq" %[[IMAG]], %[[ZERO]] : f32
+
+// CHECK: %[[IMAG_DIV_REAL:.*]] = llvm.fdiv %[[IMAG]], %[[REAL]] : f32
+// CHECK: %[[IMAG_SQ:.*]] = llvm.fmul %[[IMAG_DIV_REAL]], %[[IMAG_DIV_REAL]] : f32
+// CHECK: %[[IMAG_SQ_PLUS_ONE:.*]] = llvm.fadd %[[IMAG_SQ]], %[[ONE]] : f32
+// CHECK: %[[IMAG_SQRT:.*]] = llvm.intr.sqrt(%[[IMAG_SQ_PLUS_ONE]]) : (f32) -> f32
+// CHECK: %[[REAL_ABS:.*]] = llvm.intr.fabs(%[[REAL]]) : (f32) -> f32
+// CHECK: %[[ABS_IMAG:.*]] = llvm.fmul %[[IMAG_SQRT]], %[[REAL_ABS]] : f32
+
+// CHECK: %[[REAL_DIV_IMAG:.*]] = llvm.fdiv %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[REAL_SQ:.*]] = llvm.fmul %[[REAL_DIV_IMAG]], %[[REAL_DIV_IMAG]] : f32
+// CHECK: %[[REAL_SQ_PLUS_ONE:.*]] = llvm.fadd %[[REAL_SQ]], %[[ONE]] : f32
+// CHECK: %[[REAL_SQRT:.*]] = llvm.intr.sqrt(%[[REAL_SQ_PLUS_ONE]]) : (f32) -> f32
+// CHECK: %[[IMAG_ABS:.*]] = llvm.intr.fabs(%[[IMAG]]) : (f32) -> f32
+// CHECK: %[[ABS_REAL:.*]] = llvm.fmul %[[REAL_SQRT]], %[[IMAG_ABS]] : f32
+
+// CHECK: %[[REAL_GT_IMAG:.*]] = llvm.fcmp "ogt" %[[REAL]], %[[IMAG]] : f32
+// CHECK: %[[ABS1:.*]] = llvm.select %[[REAL_GT_IMAG]], %[[ABS_IMAG]], %[[ABS_REAL]] : i1, f32
+// CHECK: %[[ABS2:.*]] = llvm.select %[[IMAG_IS_ZERO]], %[[REAL_ABS]], %[[ABS1]] : i1, f32
+// CHECK: %[[NORM:.*]] = llvm.select %[[REAL_IS_ZERO]], %[[IMAG_ABS]], %[[ABS2]] : i1, f32
// CHECK: llvm.return %[[NORM]] : f32
// CHECK-LABEL: llvm.func @complex_eq
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 09a873fa331d..dbf8ead49f78 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -590,12 +590,12 @@ func.func @mbarrier_txcount() {
}
- %phase = arith.constant 0 : index
+ %phase_c0 = arith.constant 0 : i1
%ticks = arith.constant 10000000 : index
// CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.try_wait.parity.shared %[[barPtr3]]
- nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase, %ticks : !barrierType
+ nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase_c0, %ticks : !barrierType
func.return
}
@@ -626,12 +626,12 @@ func.func @mbarrier_txcount_pred() {
// CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]], {{.*}}, predicate = %[[P]]
nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount, predicate = %pred : !barrierType
- %phase = arith.constant 0 : index
+ %phase_c0 = arith.constant 0 : i1
%ticks = arith.constant 10000000 : index
// CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.try_wait.parity.shared %[[barPtr3]]
- nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase, %ticks : !barrierType
+ nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase_c0, %ticks : !barrierType
func.return
}
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 1c13b16dfd9a..a46f2e101f3c 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2460,3 +2460,40 @@ func.func @make_fixed_vector_of_scalable_vector(%f : f64) -> vector<3x[2]xf64>
%res = vector.broadcast %f : f64 to vector<3x[2]xf64>
return %res : vector<3x[2]xf64>
}
+
+// -----
+
+// CHECK-LABEL: @vector_interleave_0d
+// CHECK-SAME: %[[LHS:.*]]: vector<i8>, %[[RHS:.*]]: vector<i8>)
+func.func @vector_interleave_0d(%a: vector<i8>, %b: vector<i8>) -> vector<2xi8> {
+ // CHECK: %[[LHS_RANK1:.*]] = builtin.unrealized_conversion_cast %[[LHS]] : vector<i8> to vector<1xi8>
+ // CHECK: %[[RHS_RANK1:.*]] = builtin.unrealized_conversion_cast %[[RHS]] : vector<i8> to vector<1xi8>
+ // CHECK: %[[ZIP:.*]] = llvm.shufflevector %[[LHS_RANK1]], %[[RHS_RANK1]] [0, 1] : vector<1xi8>
+ // CHECK: return %[[ZIP]]
+ %0 = vector.interleave %a, %b : vector<i8>
+ return %0 : vector<2xi8>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_interleave_1d
+// CHECK-SAME: %[[LHS:.*]]: vector<8xf32>, %[[RHS:.*]]: vector<8xf32>)
+func.func @vector_interleave_1d(%a: vector<8xf32>, %b: vector<8xf32>) -> vector<16xf32>
+{
+ // CHECK: %[[ZIP:.*]] = llvm.shufflevector %[[LHS]], %[[RHS]] [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8xf32>
+ // CHECK: return %[[ZIP]]
+ %0 = vector.interleave %a, %b : vector<8xf32>
+ return %0 : vector<16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_interleave_1d_scalable
+// CHECK-SAME: %[[LHS:.*]]: vector<[4]xi32>, %[[RHS:.*]]: vector<[4]xi32>)
+func.func @vector_interleave_1d_scalable(%a: vector<[4]xi32>, %b: vector<[4]xi32>) -> vector<[8]xi32>
+{
+ // CHECK: %[[ZIP:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
+ // CHECK: return %[[ZIP]]
+ %0 = vector.interleave %a, %b : vector<[4]xi32>
+ return %0 : vector<[8]xi32>
+}
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
new file mode 100644
index 000000000000..07e719798b85
--- /dev/null
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -0,0 +1,30 @@
+// REQUIRES: host-supports-nvptx
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=isa" \
+// RUN: | FileCheck %s
+
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=isa" \
+// RUN: --mlir-print-ir-after=convert-gpu-to-nvvm 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=CHECK-NVVM
+
+// This test checks whether the GPU region is compiled correctly to PTX by
+// the pipeline. It doesn't test the IR on the GPU side, but it can test the
+// host IR and the generated PTX.
+
+// CHECK-LABEL: llvm.func @test_math(%arg0: f32) {
+func.func @test_math(%arg0 : f32) {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: gpu.launch_func @test_math_kernel::@test_math_kernel
+ // CHECK: gpu.binary @test_math_kernel [#gpu.object<#nvvm.target
+ gpu.launch
+ blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
+ threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
+ // CHECK-NVVM: __nv_expf
+ %s1 = math.exp %arg0 : f32
+ gpu.printf "%f" %s1 : f32
+ gpu.terminator
+ }
+ return
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index ce483ddab22a..0369f45ca6a0 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -472,3 +472,29 @@ gpu.module @module_1 [#nvvm.target<chip = "sm_90", features = "+ptx70", link = [
gpu.module @module_2 [#nvvm.target<chip = "sm_90">, #nvvm.target<chip = "sm_80">, #nvvm.target<chip = "sm_70">] {
}
+
+// CHECK-LABEL: nvvm.grid_constant
+llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
+ llvm.return
+}
+
+// -----
+
+// expected-error @below {{'"nvvm.grid_constant"' attribute must be present only on kernel arguments}}
+llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) {
+ llvm.return
+}
+
+// -----
+
+// expected-error @below {{'"nvvm.grid_constant"' attribute requires the argument to also have attribute 'llvm.byval'}}
+llvm.func @kernel_func(%arg0: !llvm.ptr {nvvm.grid_constant}) attributes {nvvm.kernel} {
+ llvm.return
+}
+
+// -----
+
+// expected-error @below {{'"nvvm.grid_constant"' must be a unit attribute}}
+llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant = true}) attributes {nvvm.kernel} {
+ llvm.return
+}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 5a14df9ef9f8..89e8e7836c3a 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -35,6 +35,18 @@ func.func @rocdl.barrier() {
llvm.return
}
+func.func @rocdl.sched_barrier() {
+ // CHECK: rocdl.sched.barrier
+ rocdl.sched.barrier 0
+ llvm.return
+}
+
+func.func @rocdl.setprio() {
+ // CHECK: rocdl.s.setprio
+ rocdl.s.setprio 0
+ llvm.return
+}
+
func.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32,
%arg2 : vector<32xf32>, %arg3 : i32,
%arg4 : vector<16xf32>, %arg5 : vector<4xf32>,
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index 052dc367ca67..721f35162ef8 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -1017,3 +1017,15 @@ func.func @canonicalize_fill_to_copy_dest(%arg0 : tensor<?x?xf32>, %arg1 : tenso
%copy = linalg.copy ins(%arg1 : tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
return %copy : tensor<?x?xf32>
}
+
+// -----
+
+// CHECK-LABEL: func @broadcast_same_shape(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<2x3xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<2x3xf32>)
+// CHECK-NOT: linalg.broadcast
+// CHECK: return %[[ARG0]] : tensor<2x3xf32>
+func.func @broadcast_same_shape(%input: tensor<2x3xf32>, %init: tensor<2x3xf32>) -> tensor<2x3xf32> {
+ %0 = linalg.broadcast ins(%input: tensor<2x3xf32>) outs(%init: tensor<2x3xf32>) dimensions = []
+ return %0 : tensor<2x3xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir
index 56890df3f3ee..916c04f33e9c 100644
--- a/mlir/test/Dialect/Linalg/invalid.mlir
+++ b/mlir/test/Dialect/Linalg/invalid.mlir
@@ -744,3 +744,29 @@ func.func @illegal_softmax_output_shape(%arg0: tensor<2x16x32xf32>) -> tensor<2x
-> tensor<2x16xf32>
return %1 : tensor<2x16xf32>
}
+
+// -----
+
+func.func @mmt4d_dims_mismatch(%A: tensor<16x16x8x1xf32>,
+ %B: tensor<16x16x8x1xf32>,
+ %C_in: tensor<16x16x8x1xf32>) -> tensor<16x16x8x1xf32> {
+ // expected-error @+1 {{inferred input/output operand #2 has shape's dimension #3 to be 8, but found 1}}
+ %res = linalg.mmt4d
+ ins(%A, %B: tensor<16x16x8x1xf32>, tensor<16x16x8x1xf32>)
+ outs(%C_in: tensor<16x16x8x1xf32>)
+ -> tensor<16x16x8x1xf32>
+ return %res : tensor<16x16x8x1xf32>
+}
+
+// -----
+
+func.func @mmt4d_rank_mismatch(%A: tensor<16x16x8x1xf32>,
+ %B: tensor<16x16x8x1xf32>,
+ %C_in: tensor<8x8xf32>) -> tensor<8x8xf32> {
+ // expected-error @+1 {{expected operand rank (2) to match the result rank of indexing_map #2 (4)}}
+ %res = linalg.mmt4d
+ ins(%A, %B: tensor<16x16x8x1xf32>, tensor<16x16x8x1xf32>)
+ outs(%C_in: tensor<8x8xf32>)
+ -> tensor<8x8xf32>
+ return %res : tensor<8x8xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 29977a71dbb8..7064e1b3f9dc 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1219,6 +1219,17 @@ func.func @batchmatmul_transpose_b(%arg0: memref<2x3x5xf32>, %arg1: memref<2x7x5
// -----
+// CHECK-LABEL: func @mmt4d
+func.func @mmt4d(%A: tensor<10x32x8x1xf32>, %B: tensor<80x32x4x1xf32>, %C: tensor<10x80x8x4xf32>) -> tensor<10x80x8x4xf32> {
+ // CHECK: %{{.+}} = linalg.mmt4d
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<10x32x8x1xf32>, tensor<80x32x4x1xf32>)
+ // CHECK-SAME: outs(%{{.+}} : tensor<10x80x8x4xf32>) -> tensor<10x80x8x4xf32>
+ %0 = linalg.mmt4d ins(%A, %B : tensor<10x32x8x1xf32>, tensor<80x32x4x1xf32>) outs(%C: tensor<10x80x8x4xf32>) -> tensor<10x80x8x4xf32>
+ return %0: tensor<10x80x8x4xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @batch_mmt4d
func.func @batch_mmt4d(%arg0: tensor<128x10x32x8x1xf32>, %arg1: tensor<128x80x32x4x1xf32>, %arg2: tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> {
// CHECK: %{{.+}} = linalg.batch_mmt4d
diff --git a/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir b/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir
new file mode 100644
index 000000000000..61e13d1bfa9c
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir
@@ -0,0 +1,70 @@
+// RUN: mlir-opt %s -transform-interpreter | FileCheck %s
+
+func.func @mmt4d_to_fma(%A: tensor<16x16x8x1xf32>, %B: tensor<16x16x8x1xf32>, %C_in: tensor<16x16x8x8xf32>) -> tensor<16x16x8x8xf32> {
+ %res = linalg.mmt4d
+ ins(%A, %B: tensor<16x16x8x1xf32>, tensor<16x16x8x1xf32>)
+ outs(%C_in: tensor<16x16x8x8xf32>)
+ -> tensor<16x16x8x8xf32>
+ return %res : tensor<16x16x8x8xf32>
+}
+
+
+// CHECK-LABEL: @mmt4d_to_fma
+// CHECK-COUNT-8: vector.fma
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+ %func = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.op<"func.func">
+
+  %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %func
+    : (!transform.op<"func.func">) -> !transform.any_op
+
+  // Step 1: Tile
+ // Tile parallel dims
+ %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 8, 8, 0]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ // Tile reduction dims
+ %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Step 2: Vectorize
+ transform.structured.vectorize %tiled_linalg_op_r : !transform.any_op
+
+ // Step 3: Simplify
+ // vector.multi_reduction --> vector.contract
+ // Generates a 6-dim vector.contract with the dim matching the original MMT4D Op
+ // and with the following split into parallel and reduction dims:
+ // * parallel, parallel, reduction, parallel, parallel, reduction
+ transform.apply_patterns to %func {
+ transform.apply_patterns.vector.reduction_to_contract
+ // Reduce the rank of xfer ops. This transforms vector.contract to be
+ // more matmul-like and to enable the lowering to outer product Ops.
+ transform.apply_patterns.vector.transfer_permutation_patterns
+ } : !transform.op<"func.func">
+
+ // Hoisting and LICM - not strictly required
+ %func_h = transform.structured.hoist_redundant_vector_transfers %func
+ : (!transform.op<"func.func">) -> !transform.op<"func.func">
+ %all_loops = transform.structured.match interface{LoopLikeInterface} in %func_h
+ : (!transform.op<"func.func">) -> !transform.any_op
+ transform.apply_licm to %all_loops : !transform.any_op
+ transform.loop.hoist_loop_invariant_subsets %all_loops : !transform.any_op
+
+ // Simplify the 6-dim vector.contract into a 3-dim matmul-like
+ // vector.contract with the following split into parallel and reduction
+ // dims:
+ // * parallel, parallel, reduction
+ transform.apply_patterns to %func_h {
+ transform.apply_patterns.vector.reduction_to_contract
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ transform.apply_patterns.canonicalization
+ } : !transform.op<"func.func">
+
+ // Step 4: Lower vector.contract to vector.fma via vector.outerproduct
+ transform.apply_patterns to %func_h {
+ transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+ transform.apply_patterns.vector.lower_outerproduct
+ } : !transform.op<"func.func">
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
new file mode 100644
index 000000000000..a1a52397386c
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -0,0 +1,73 @@
+// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics
+
+func.func @conv1d_nwc_wcf_dyn_ch_dim(%input: memref<4x6x?xf32>, %filter: memref<1x?x8xf32>, %output: memref<4x2x8xf32>) {
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ linalg.conv_1d_nwc_wcf
+ {dilations = dense<1> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>}
+ ins(%input, %filter : memref<4x6x?xf32>, memref<1x?x8xf32>)
+ outs(%output : memref<4x2x8xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.conv_1d_nwc_wcf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_dyn_ch_dim(%input: memref<3x5x?xf32>, %filter: memref<2x?xf32>, %output: memref<3x2x?xf32>) {
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ linalg.depthwise_conv_1d_nwc_wc
+ {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+ ins(%input, %filter : memref<3x5x?xf32>, memref<2x?xf32>)
+ outs(%output : memref<3x2x?xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_dyn_w_dim(%input: memref<3x?x3xf32>, %filter: memref<2x3xf32>, %output: memref<3x?x3xf32>) {
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ linalg.depthwise_conv_1d_nwc_wc
+ {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+ ins(%input, %filter : memref<3x?x3xf32>, memref<2x3xf32>)
+ outs(%output : memref<3x?x3xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @conv1d_dyn_w_dim(%input: tensor<?xf32>, %filter: tensor<4xf32>, %output: tensor<?xf32>) -> tensor<?xf32> {
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ %0 = linalg.conv_1d ins(%input, %filter : tensor<?xf32>, tensor<4xf32>)
+ outs(%output : tensor<?xf32>) -> tensor<?xf32>
+ return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.conv_1d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index 5d1bef478ee9..0272ac599aa3 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -639,6 +639,31 @@ module attributes {transform.with_named_sequence} {
// -----
+func.func @mmt4d(%A: memref<16x16x8x1xf32>, %B: memref<16x16x8x1xf32>, %C_in: memref<16x16x8x8xf32>) {
+ linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x8x1xf32>)
+ outs(%C_in: memref<16x16x8x8xf32>)
+ return
+}
+
+// CHECK-LABEL: func.func @mmt4d(
+// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, %[[B:.*]]: memref<16x16x8x1xf32>, %[[C:.*]]: memref<16x16x8x8xf32>) {
+// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32>
+// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32>
+// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<16x16x8x8xf32>, vector<16x16x8x8xf32>
+// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x8x1xf32>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x8x1xf32> to vector<16x16x8x8xf32>
+// CHECK: vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<16x16x8x8xf32>, memref<16x16x8x8xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %mmt4d : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
func.func @matmul_scalable(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
outs(%C: memref<?x?xf32>)
diff --git a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
index 29e300a992d3..40acd82cd055 100644
--- a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
+++ b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
@@ -62,7 +62,7 @@ func.func @main() {
// CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c0_7]] : <memorySpace = #gpu.address_space<workgroup>
// CHECK: }
//
- // CHECK: %[[c0_6:.*]] = arith.constant 0 : index
+ // CHECK: %[[c0_6:.*]] = llvm.mlir.constant(false) : i1
// CHECK: %[[c10000000:.*]] = arith.constant 10000000 : index
// CHECK: nvgpu.mbarrier.try_wait.parity %[[B]][%{{.*}}], %[[c0_6]], %[[c10000000]] : <memorySpace = #gpu.address_space<workgroup>
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 812b79e35595..1c1b6ea58e02 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -1651,6 +1651,15 @@ func.func @omp_target_enter_data(%map1: memref<?xi32>) {
// -----
+func.func @omp_target_enter_data_depend(%a: memref<?xi32>) {
+ %0 = omp.map_info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
+ // expected-error @below {{op expected as many depend values as depend variables}}
+ omp.target_enter_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+ return
+}
+
+// -----
+
func.func @omp_target_exit_data(%map1: memref<?xi32>) {
%mapv = omp.map_info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32> {name = ""}
// expected-error @below {{from, release and delete map types are permitted}}
@@ -1660,6 +1669,15 @@ func.func @omp_target_exit_data(%map1: memref<?xi32>) {
// -----
+func.func @omp_target_exit_data_depend(%a: memref<?xi32>) {
+ %0 = omp.map_info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(from) capture(ByRef) -> memref<?xi32>
+ // expected-error @below {{op expected as many depend values as depend variables}}
+ omp.target_exit_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+ return
+}
+
+// -----
+
func.func @omp_target_update_invalid_motion_type(%map1 : memref<?xi32>) {
%mapv = omp.map_info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
@@ -1732,9 +1750,91 @@ llvm.mlir.global internal @_QFsubEx() : i32
// -----
+func.func @omp_target_update_data_depend(%a: memref<?xi32>) {
+ %0 = omp.map_info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
+ // expected-error @below {{op expected as many depend values as depend variables}}
+ omp.target_update_data motion_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+ return
+}
+
+// -----
+
+func.func @omp_target_depend(%data_var: memref<i32>) {
+ // expected-error @below {{op expected as many depend values as depend variables}}
+ "omp.target"(%data_var) ({
+ "omp.terminator"() : () -> ()
+ }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 1, 0>} : (memref<i32>) -> ()
+ "func.return"() : () -> ()
+}
+
+// -----
+
func.func @omp_distribute(%data_var : memref<i32>) -> () {
// expected-error @below {{expected equal sizes for allocate and allocator variables}}
"omp.distribute"(%data_var) <{operandSegmentSizes = array<i32: 0, 1, 0>}> ({
"omp.terminator"() : () -> ()
}) : (memref<i32>) -> ()
}
+
+// -----
+
+omp.private {type = private} @x.privatizer : i32 alloc {
+^bb0(%arg0: i32):
+ %0 = arith.constant 0.0 : f32
+ // expected-error @below {{Invalid yielded value. Expected type: 'i32', got: 'f32'}}
+ omp.yield(%0 : f32)
+}
+
+// -----
+
+omp.private {type = private} @x.privatizer : i32 alloc {
+^bb0(%arg0: i32):
+ // expected-error @below {{Invalid yielded value. Expected type: 'i32', got: None}}
+ omp.yield
+}
+
+// -----
+
+omp.private {type = private} @x.privatizer : i32 alloc {
+^bb0(%arg0: i32):
+ // expected-error @below {{expected exit block terminator to be an `omp.yield` op.}}
+ omp.terminator
+}
+
+// -----
+
+// expected-error @below {{`alloc`: expected 1 region arguments, got: 2}}
+omp.private {type = private} @x.privatizer : f32 alloc {
+^bb0(%arg0: f32, %arg1: f32):
+ omp.yield(%arg0 : f32)
+}
+
+// -----
+
+// expected-error @below {{`copy`: expected 2 region arguments, got: 1}}
+omp.private {type = firstprivate} @x.privatizer : f32 alloc {
+^bb0(%arg0: f32):
+ omp.yield(%arg0 : f32)
+} copy {
+^bb0(%arg0: f32):
+ omp.yield(%arg0 : f32)
+}
+
+// -----
+
+// expected-error @below {{`private` clauses require only an `alloc` region.}}
+omp.private {type = private} @x.privatizer : f32 alloc {
+^bb0(%arg0: f32):
+ omp.yield(%arg0 : f32)
+} copy {
+^bb0(%arg0: f32, %arg1 : f32):
+ omp.yield(%arg0 : f32)
+}
+
+// -----
+
+// expected-error @below {{`firstprivate` clauses require both `alloc` and `copy` regions.}}
+omp.private {type = firstprivate} @x.privatizer : f32 alloc {
+^bb0(%arg0: f32):
+ omp.yield(%arg0 : f32)
+}
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 65a704d18107..3bb4a288376e 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -517,7 +517,7 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %map1:
"omp.target"(%if_cond, %device, %num_threads) ({
// CHECK: omp.terminator
omp.terminator
- }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0>} : ( i1, si32, i32 ) -> ()
+ }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0,0>} : ( i1, si32, i32 ) -> ()
// Test with optional map clause.
// CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_1:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
@@ -640,11 +640,13 @@ func.func @wsloop_reduction(%lb : index, %ub : index, %step : index) {
func.func @parallel_reduction() {
%c1 = arith.constant 1 : i32
%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
- // CHECK: omp.parallel reduction(@add_f32 -> {{.+}} : !llvm.ptr)
- omp.parallel reduction(@add_f32 -> %0 : !llvm.ptr) {
+ // CHECK: omp.parallel reduction(@add_f32 {{.+}} -> {{.+}} : !llvm.ptr)
+ omp.parallel reduction(@add_f32 %0 -> %prv : !llvm.ptr) {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
+ %2 = llvm.load %prv : !llvm.ptr -> f32
+ // CHECK: llvm.fadd %{{.*}}, %{{.*}} : f32
+ %3 = llvm.fadd %1, %2 : f32
+ llvm.store %3, %prv : f32, !llvm.ptr
omp.terminator
}
return
@@ -654,13 +656,14 @@ func.func @parallel_reduction() {
func.func @parallel_wsloop_reduction(%lb : index, %ub : index, %step : index) {
%c1 = arith.constant 1 : i32
%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
- // CHECK: omp.parallel reduction(@add_f32 -> %{{.+}} : !llvm.ptr) {
- omp.parallel reduction(@add_f32 -> %0 : !llvm.ptr) {
+ // CHECK: omp.parallel reduction(@add_f32 %{{.*}} -> %{{.+}} : !llvm.ptr) {
+ omp.parallel reduction(@add_f32 %0 -> %prv : !llvm.ptr) {
// CHECK: omp.wsloop for (%{{.+}}) : index = (%{{.+}}) to (%{{.+}}) step (%{{.+}})
omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}} : f32, !llvm.ptr
- omp.reduction %1, %0 : f32, !llvm.ptr
+ %2 = llvm.load %prv : !llvm.ptr -> f32
+ // CHECK: llvm.fadd %{{.+}}, %{{.+}} : f32
+      %3 = llvm.fadd %1, %2 : f32
// CHECK: omp.yield
omp.yield
}
@@ -799,11 +802,14 @@ func.func @wsloop_reduction2(%lb : index, %ub : index, %step : index) {
// CHECK-LABEL: func @parallel_reduction2
func.func @parallel_reduction2() {
%0 = memref.alloca() : memref<1xf32>
- // CHECK: omp.parallel reduction(@add2_f32 -> %{{.+}} : memref<1xf32>)
- omp.parallel reduction(@add2_f32 -> %0 : memref<1xf32>) {
+ // CHECK: omp.parallel reduction(@add2_f32 %{{.+}} -> %{{.+}} : memref<1xf32>)
+ omp.parallel reduction(@add2_f32 %0 -> %prv : memref<1xf32>) {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction
- omp.reduction %1, %0 : f32, memref<1xf32>
+ %2 = arith.constant 0 : index
+ %3 = memref.load %prv[%2] : memref<1xf32>
+ // CHECK: llvm.fadd
+ %4 = llvm.fadd %1, %3 : f32
+ memref.store %4, %prv[%2] : memref<1xf32>
omp.terminator
}
return
@@ -813,13 +819,14 @@ func.func @parallel_reduction2() {
func.func @parallel_wsloop_reduction2(%lb : index, %ub : index, %step : index) {
%c1 = arith.constant 1 : i32
%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
- // CHECK: omp.parallel reduction(@add2_f32 -> %{{.+}} : !llvm.ptr) {
- omp.parallel reduction(@add2_f32 -> %0 : !llvm.ptr) {
+ // CHECK: omp.parallel reduction(@add2_f32 %{{.*}} -> %{{.+}} : !llvm.ptr) {
+ omp.parallel reduction(@add2_f32 %0 -> %prv : !llvm.ptr) {
// CHECK: omp.wsloop for (%{{.+}}) : index = (%{{.+}}) to (%{{.+}}) step (%{{.+}})
omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}} : f32, !llvm.ptr
- omp.reduction %1, %0 : f32, !llvm.ptr
+ %2 = llvm.load %prv : !llvm.ptr -> f32
+ // CHECK: llvm.fadd %{{.+}}, %{{.+}} : f32
+ %3 = llvm.fadd %1, %2 : f32
// CHECK: omp.yield
omp.yield
}
@@ -1710,6 +1717,18 @@ func.func @omp_task_depend(%arg0: memref<i32>, %arg1: memref<i32>) {
return
}
+
+// CHECK-LABEL: @omp_target_depend
+// CHECK-SAME: (%arg0: memref<i32>, %arg1: memref<i32>) {
+func.func @omp_target_depend(%arg0: memref<i32>, %arg1: memref<i32>) {
+ // CHECK: omp.target depend(taskdependin -> %arg0 : memref<i32>, taskdependin -> %arg1 : memref<i32>, taskdependinout -> %arg0 : memref<i32>) {
+ omp.target depend(taskdependin -> %arg0 : memref<i32>, taskdependin -> %arg1 : memref<i32>, taskdependinout -> %arg0 : memref<i32>) {
+ // CHECK: omp.terminator
+ omp.terminator
+ } {operandSegmentSizes = array<i32: 0,0,0,3,0>}
+ return
+}
+
func.func @omp_threadprivate() {
%0 = arith.constant 1 : i32
%1 = arith.constant 2 : i32
@@ -2138,3 +2157,52 @@ func.func @omp_targets_is_allocatable(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> ()
}
return
}
+
+// CHECK-LABEL: func @omp_target_enter_update_exit_data_depend
+// CHECK-SAME:([[ARG0:%.*]]: memref<?xi32>, [[ARG1:%.*]]: memref<?xi32>, [[ARG2:%.*]]: memref<?xi32>) {
+func.func @omp_target_enter_update_exit_data_depend(%a: memref<?xi32>, %b: memref<?xi32>, %c: memref<?xi32>) {
+// CHECK-NEXT: [[MAP0:%.*]] = omp.map_info
+// CHECK-NEXT: [[MAP1:%.*]] = omp.map_info
+// CHECK-NEXT: [[MAP2:%.*]] = omp.map_info
+ %map_a = omp.map_info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
+ %map_b = omp.map_info var_ptr(%b: memref<?xi32>, tensor<?xi32>) map_clauses(from) capture(ByRef) -> memref<?xi32>
+ %map_c = omp.map_info var_ptr(%c: memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32>
+
+ // Do some work on the host that writes to 'a'
+ omp.task depend(taskdependout -> %a : memref<?xi32>) {
+ "test.foo"(%a) : (memref<?xi32>) -> ()
+ omp.terminator
+ }
+
+ // Then map that over to the target
+ // CHECK: omp.target_enter_data nowait map_entries([[MAP0]], [[MAP2]] : memref<?xi32>, memref<?xi32>) depend(taskdependin -> [[ARG0]] : memref<?xi32>)
+ omp.target_enter_data nowait map_entries(%map_a, %map_c: memref<?xi32>, memref<?xi32>) depend(taskdependin -> %a: memref<?xi32>)
+
+ // Compute 'b' on the target and copy it back
+ // CHECK: omp.target map_entries([[MAP1]] -> {{%.*}} : memref<?xi32>) {
+ omp.target map_entries(%map_b -> %arg0 : memref<?xi32>) {
+ ^bb0(%arg0: memref<?xi32>) :
+ "test.foo"(%arg0) : (memref<?xi32>) -> ()
+ omp.terminator
+ }
+
+ // Update 'a' on the host using 'b'
+ omp.task depend(taskdependout -> %a: memref<?xi32>){
+ "test.bar"(%a, %b) : (memref<?xi32>, memref<?xi32>) -> ()
+ }
+
+ // Copy the updated 'a' onto the target
+ // CHECK: omp.target_update_data nowait motion_entries([[MAP0]] : memref<?xi32>) depend(taskdependin -> [[ARG0]] : memref<?xi32>)
+ omp.target_update_data motion_entries(%map_a : memref<?xi32>) depend(taskdependin -> %a : memref<?xi32>) nowait
+
+ // Compute 'c' on the target and copy it back
+ %map_c_from = omp.map_info var_ptr(%c: memref<?xi32>, tensor<?xi32>) map_clauses(from) capture(ByRef) -> memref<?xi32>
+ omp.target map_entries(%map_a -> %arg0, %map_c_from -> %arg1 : memref<?xi32>, memref<?xi32>) depend(taskdependout -> %c : memref<?xi32>) {
+ ^bb0(%arg0 : memref<?xi32>, %arg1 : memref<?xi32>) :
+ "test.foobar"() : ()->()
+ omp.terminator
+ }
+ // CHECK: omp.target_exit_data map_entries([[MAP2]] : memref<?xi32>) depend(taskdependin -> [[ARG2]] : memref<?xi32>)
+ omp.target_exit_data map_entries(%map_c : memref<?xi32>) depend(taskdependin -> %c : memref<?xi32>)
+ return
+}
diff --git a/mlir/test/Dialect/OpenMP/roundtrip.mlir b/mlir/test/Dialect/OpenMP/roundtrip.mlir
new file mode 100644
index 000000000000..2553442638ee
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/roundtrip.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt -verify-diagnostics %s | mlir-opt | FileCheck %s
+
+// CHECK: omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+// CHECK: ^bb0(%arg0: {{.*}}):
+^bb0(%arg0: !llvm.ptr):
+ omp.yield(%arg0 : !llvm.ptr)
+}
+
+// CHECK: omp.private {type = firstprivate} @y.privatizer : !llvm.ptr alloc {
+omp.private {type = firstprivate} @y.privatizer : !llvm.ptr alloc {
+// CHECK: ^bb0(%arg0: {{.*}}):
+^bb0(%arg0: !llvm.ptr):
+ omp.yield(%arg0 : !llvm.ptr)
+// CHECK: } copy {
+} copy {
+// CHECK: ^bb0(%arg0: {{.*}}, %arg1: {{.*}}):
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg0 : !llvm.ptr)
+}
+
diff --git a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
new file mode 100644
index 000000000000..8954839c3c93
--- /dev/null
+++ b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
@@ -0,0 +1,130 @@
+// RUN: mlir-opt %s -test-wrap-scf-while-loop-in-zero-trip-check -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -test-wrap-scf-while-loop-in-zero-trip-check='force-create-check=true' -split-input-file | FileCheck %s --check-prefix FORCE-CREATE-CHECK
+
+func.func @wrap_while_loop_in_zero_trip_check(%bound : i32) -> i32 {
+ %cst0 = arith.constant 0 : i32
+ %cst5 = arith.constant 5 : i32
+ %res:2 = scf.while (%iter = %cst0) : (i32) -> (i32, i32) {
+ %cond = arith.cmpi slt, %iter, %bound : i32
+ %inv = arith.addi %bound, %cst5 : i32
+ scf.condition(%cond) %iter, %inv : i32, i32
+ } do {
+ ^bb0(%arg1: i32, %arg2: i32):
+ %next = arith.addi %arg1, %arg2 : i32
+ scf.yield %next : i32
+ }
+ return %res#0 : i32
+}
+
+// CHECK-LABEL: func.func @wrap_while_loop_in_zero_trip_check(
+// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32
+// CHECK-DAG: %[[PRE_COND:.*]] = arith.cmpi slt, %[[C0]], %[[BOUND]] : i32
+// CHECK-DAG: %[[PRE_INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32
+// CHECK: %[[IF:.*]]:2 = scf.if %[[PRE_COND]] -> (i32, i32) {
+// CHECK: %[[WHILE:.*]]:2 = scf.while (
+// CHECK-SAME: %[[ARG1:.*]] = %[[C0]], %[[ARG2:.*]] = %[[PRE_INV]]
+// CHECK-SAME: ) : (i32, i32) -> (i32, i32) {
+// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[ARG2]] : i32
+// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32
+// CHECK: %[[INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32
+// CHECK: scf.condition(%[[COND]]) %[[NEXT]], %[[INV]] : i32, i32
+// CHECK: } do {
+// CHECK: ^bb0(%[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32):
+// CHECK: scf.yield %[[ARG3]], %[[ARG4]] : i32, i32
+// CHECK: }
+// CHECK: scf.yield %[[WHILE]]#0, %[[WHILE]]#1 : i32, i32
+// CHECK: } else {
+// CHECK: scf.yield %[[C0]], %[[PRE_INV]] : i32, i32
+// CHECK: }
+// CHECK: return %[[IF]]#0 : i32
+
+// -----
+
+func.func @wrap_while_loop_with_minimal_before_block(%bound : i32) -> i32 {
+ %cst0 = arith.constant 0 : i32
+ %true = arith.constant true
+ %cst5 = arith.constant 5 : i32
+ %res = scf.while (%iter = %cst0, %arg0 = %true) : (i32, i1) -> i32 {
+ scf.condition(%arg0) %iter : i32
+ } do {
+ ^bb0(%arg1: i32):
+ %next = arith.addi %arg1, %cst5 : i32
+ %cond = arith.cmpi slt, %next, %bound : i32
+ scf.yield %next, %cond : i32, i1
+ }
+ return %res : i32
+}
+
+// CHECK-LABEL: func.func @wrap_while_loop_with_minimal_before_block(
+// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[TRUE:.*]] = arith.constant true
+// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32
+// CHECK: %[[IF:.*]] = scf.if %[[TRUE]] -> (i32) {
+// CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[C0]]) : (i32) -> i32 {
+// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32
+// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32
+// CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32
+// CHECK: } do {
+// CHECK: ^bb0(%[[ARG2:.*]]: i32):
+// CHECK: scf.yield %[[ARG2]] : i32
+// CHECK: }
+// CHECK: scf.yield %[[WHILE]] : i32
+// CHECK: } else {
+// CHECK: scf.yield %[[C0]] : i32
+// CHECK: }
+// CHECK: return %[[IF]] : i32
+
+// -----
+
+func.func @wrap_do_while_loop_in_zero_trip_check(%bound : i32) -> i32 {
+ %cst0 = arith.constant 0 : i32
+ %cst5 = arith.constant 5 : i32
+ %res = scf.while (%iter = %cst0) : (i32) -> i32 {
+ %next = arith.addi %iter, %cst5 : i32
+ %cond = arith.cmpi slt, %next, %bound : i32
+ scf.condition(%cond) %next : i32
+ } do {
+ ^bb0(%arg1: i32):
+ scf.yield %arg1 : i32
+ }
+ return %res : i32
+}
+
+// CHECK-LABEL: func.func @wrap_do_while_loop_in_zero_trip_check(
+// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32
+// CHECK-NOT: scf.if
+// CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[C0]]) : (i32) -> i32 {
+// CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32
+// CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32
+// CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32
+// CHECK: } do {
+// CHECK: ^bb0(%[[ARG2:.*]]: i32):
+// CHECK: scf.yield %[[ARG2]] : i32
+// CHECK: }
+// CHECK: return %[[WHILE]] : i32
+
+// FORCE-CREATE-CHECK-LABEL: func.func @wrap_do_while_loop_in_zero_trip_check(
+// FORCE-CREATE-CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 {
+// FORCE-CREATE-CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+// FORCE-CREATE-CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32
+// FORCE-CREATE-CHECK: %[[PRE_NEXT:.*]] = arith.addi %[[C0]], %[[C5]] : i32
+// FORCE-CREATE-CHECK: %[[PRE_COND:.*]] = arith.cmpi slt, %[[PRE_NEXT]], %[[BOUND]] : i32
+// FORCE-CREATE-CHECK: %[[IF:.*]] = scf.if %[[PRE_COND]] -> (i32) {
+// FORCE-CREATE-CHECK: %[[WHILE:.*]] = scf.while (%[[ARG1:.*]] = %[[PRE_NEXT]]) : (i32) -> i32 {
+// FORCE-CREATE-CHECK: %[[NEXT:.*]] = arith.addi %[[ARG1]], %[[C5]] : i32
+// FORCE-CREATE-CHECK: %[[COND:.*]] = arith.cmpi slt, %[[NEXT]], %[[BOUND]] : i32
+// FORCE-CREATE-CHECK: scf.condition(%[[COND]]) %[[NEXT]] : i32
+// FORCE-CREATE-CHECK: } do {
+// FORCE-CREATE-CHECK: ^bb0(%[[ARG2:.*]]: i32):
+// FORCE-CREATE-CHECK: scf.yield %[[ARG2]] : i32
+// FORCE-CREATE-CHECK: }
+// FORCE-CREATE-CHECK: scf.yield %[[WHILE]] : i32
+// FORCE-CREATE-CHECK: } else {
+// FORCE-CREATE-CHECK: scf.yield %[[PRE_NEXT]] : i32
+// FORCE-CREATE-CHECK: }
+// FORCE-CREATE-CHECK: return %[[IF]] : i32
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir
index 6fe7ec906f30..8293169049ca 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir
@@ -4,7 +4,7 @@
map = ( i, j ) ->
( i : dense,
j floordiv 4 : dense,
- j mod 4 : block2_4
+ j mod 4 : structured[2, 4]
)
}>
diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
index 2d189cc94c15..a52a46b4fcfe 100644
--- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
@@ -315,3 +315,109 @@ func.func private @BSR(%arg0: tensor<?x?xf64, #BSR>) {
func.func private @BSR_explicit(%arg0: tensor<?x?xf64, #BSR_explicit>) {
return
}
+
+// -----
+
+// expected-error@+6 {{expected structured size to be >= 0}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 4 : dense,
+ j : dense,
+ k mod 4 : structured[-2, 4]
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
+
+// -----
+
+// expected-error@+6 {{expected n <= m in n_out_of_m}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 4 : dense,
+ j : dense,
+ k mod 4 : structured[5, 4]
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
+
+// -----
+
+// expected-error@+1 {{expected all dense lvlTypes before a n_out_of_m level}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 4 : compressed,
+ j : dense,
+ k mod 4 : structured[2, 4]
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
+
+// -----
+
+// expected-error@+1 {{expected n_out_of_m to be the last level type}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 4 : structured[2, 4],
+ j : dense,
+ k mod 4 : compressed
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
+
+// -----
+
+// expected-error@+1 {{expected 1xm block structure for n_out_of_m level}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 2 : dense,
+ j : dense,
+ k mod 4 : structured[2, 4]
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
+
+// -----
+
+// expected-error@+1 {{expected coeffiencts of Affine expressions to be equal to m of n_out_of_m level}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 2 : dense,
+ j : dense,
+ k mod 2 : structured[2, 4]
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
+
+// -----
+
+// expected-error@+1 {{expected only one blocked level with the same coefficients}}
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i floordiv 2 : dense,
+ i mod 2 : dense,
+ j : dense,
+ k floordiv 4 : dense,
+ k mod 4 : structured[2, 4]
+ )
+}>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
index 20702bb98502..64520638b253 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
@@ -207,12 +207,12 @@ func.func private @BSR_explicit(%arg0: tensor<?x?xf64, #BSR_explicit>) {
map = ( i, j ) ->
( i : dense,
j floordiv 4 : dense,
- j mod 4 : block2_4
+ j mod 4 : structured[2, 4]
),
crdWidth = 8 // we would even like just 2-bits
}>
-// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : block2_4), crdWidth = 8 }>
+// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), crdWidth = 8 }>
// CHECK-LABEL: func private @NV_24(
// CHECK-SAME: tensor<?x?xf64, #[[$NV_24]]>
func.func private @NV_24(%arg0: tensor<?x?xf64, #NV_24>) {
@@ -226,11 +226,11 @@ func.func private @NV_24(%arg0: tensor<?x?xf64, #NV_24>) {
( i : dense,
j : dense,
k floordiv 4 : dense,
- k mod 4 : block2_4
+ k mod 4 : structured[2, 4]
)
}>
-// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 floordiv 4 : dense, d2 mod 4 : block2_4) }>
+// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 floordiv 4 : dense, d2 mod 4 : structured[2, 4]) }>
// CHECK-LABEL: func private @NV_24(
// CHECK-SAME: tensor<?x?x?xf64, #[[$NV_24]]>
func.func private @NV_24(%arg0: tensor<?x?x?xf64, #NV_24>) {
@@ -244,13 +244,31 @@ func.func private @NV_24(%arg0: tensor<?x?x?xf64, #NV_24>) {
( i : dense,
k floordiv 4 : dense,
j : dense,
- k mod 4 : block2_4
+ k mod 4 : structured[2, 4]
)
}>
-// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 4 : dense, d1 : dense, d2 mod 4 : block2_4) }>
+// CHECK-DAG: #[[$NV_24:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 4 : dense, d1 : dense, d2 mod 4 : structured[2, 4]) }>
// CHECK-LABEL: func private @NV_24(
// CHECK-SAME: tensor<?x?x?xf64, #[[$NV_24]]>
func.func private @NV_24(%arg0: tensor<?x?x?xf64, #NV_24>) {
return
}
+
+// -----
+
+#NOutOfM = #sparse_tensor.encoding<{
+ map = ( i, j, k ) ->
+ ( i : dense,
+ k floordiv 8 : dense,
+ j : dense,
+ k mod 8 : structured[5, 8]
+ )
+}>
+
+// CHECK-DAG: #[[$NOutOfM:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d2 floordiv 8 : dense, d1 : dense, d2 mod 8 : structured[5, 8]) }>
+// CHECK-LABEL: func private @NOutOfM(
+// CHECK-SAME: tensor<?x?x?xf64, #[[$NOutOfM]]>
+func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
+ return
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
index 7c494b2bcfe1..d04fbe8ed5c2 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
@@ -14,7 +14,7 @@
// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 100 : index
// CHECK-DAG: %[[VAL_10:.*]] = arith.constant 300 : index
-// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 8 : i64
+// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 131072 : i64
// CHECK: %[[VAL_12:.*]] = memref.alloca() : memref<2xi64>
// CHECK: %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xi64> to memref<?xi64>
// CHECK: memref.store %[[VAL_11]], %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<2xi64>
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 7192a719ceb1..90c715bf2eb2 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -555,6 +555,24 @@ func.func @insert_slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
// -----
+// Do not insert a cast for the following example. The new source type wouldn't be "more static" than the old one.
+func.func @insert_slice_canonicalize_encoding(%arg0 : tensor<2x2xf32, "foo">,
+ %arg1 : tensor<4x4xf32, "foo">) -> tensor<4x4xf32, "foo">
+{
+ %0 = tensor.insert_slice %arg0 into %arg1[0, 0] [2, 2] [1, 1] : tensor<2x2xf32, "foo"> into tensor<4x4xf32, "foo">
+ return %0 : tensor<4x4xf32, "foo">
+}
+// CHECK-LABEL: func @insert_slice_canonicalize_encoding
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x2xf32, "foo">
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<4x4xf32, "foo">
+// CHECK-NOT: tensor.cast
+// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[ARG1]]
+// CHECK-SAME: [0, 0] [2, 2] [1, 1]
+// CHECK-SAME: : tensor<2x2xf32, "foo"> into tensor<4x4xf32, "foo">
+// CHECK: return %[[RESULT]]
+
+// -----
+
func.func @slice_to_insert_slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
{
diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir
new file mode 100644
index 000000000000..85e23103eaed
--- /dev/null
+++ b/mlir/test/Dialect/Vector/linearize.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-opt %s -split-input-file -test-vector-linearize | FileCheck %s
+
+// CHECK-LABEL: test_linearize
+// CHECK-SAME: (%[[ORIG_ARG:.*]]: vector<2x2xf32>)
+// CHECK: %[[ARG:.*]] = vector.shape_cast %[[ORIG_ARG]] : vector<2x2xf32> to vector<4xf32>
+func.func @test_linearize(%arg0: vector<2x2xf32>) -> vector<2x2xf32> {
+// CHECK: %[[C1:.*]] = arith.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : vector<4xf32>
+ %0 = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : vector<2x2xf32>
+// CHECK: %[[RES:.*]] = vector.shape_cast %[[C1]] : vector<4xf32> to vector<2x2xf32>
+
+// Arith and math ops are handled in generic way, check some of them
+// CHECK: %{{.*}} = math.sin %[[ARG]] : vector<4xf32>
+ %1 = math.sin %arg0 : vector<2x2xf32>
+// CHECK: %{{.*}} = arith.addf %[[ARG]], %[[C1]] : vector<4xf32>
+ %2 = arith.addf %arg0, %0 : vector<2x2xf32>
+
+// CHECK: return %[[RES]] : vector<2x2xf32>
+ return %0 : vector<2x2xf32>
+}
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 2f8530e7c171..79a80be4f8b2 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -1081,3 +1081,38 @@ func.func @fastmath(%x: vector<42xf32>) -> f32 {
%min = vector.reduction <minnumf>, %x fastmath<reassoc,nnan,ninf> : vector<42xf32> into f32
return %min: f32
}
+
+// CHECK-LABEL: @interleave_0d
+func.func @interleave_0d(%a: vector<f32>, %b: vector<f32>) -> vector<2xf32> {
+ // CHECK: vector.interleave %{{.*}}, %{{.*}} : vector<f32>
+ %0 = vector.interleave %a, %b : vector<f32>
+ return %0 : vector<2xf32>
+}
+
+// CHECK-LABEL: @interleave_1d
+func.func @interleave_1d(%a: vector<4xf32>, %b: vector<4xf32>) -> vector<8xf32> {
+ // CHECK: vector.interleave %{{.*}}, %{{.*}} : vector<4xf32>
+ %0 = vector.interleave %a, %b : vector<4xf32>
+ return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: @interleave_1d_scalable
+func.func @interleave_1d_scalable(%a: vector<[8]xi16>, %b: vector<[8]xi16>) -> vector<[16]xi16> {
+ // CHECK: vector.interleave %{{.*}}, %{{.*}} : vector<[8]xi16>
+ %0 = vector.interleave %a, %b : vector<[8]xi16>
+ return %0 : vector<[16]xi16>
+}
+
+// CHECK-LABEL: @interleave_2d
+func.func @interleave_2d(%a: vector<2x8xf32>, %b: vector<2x8xf32>) -> vector<2x16xf32> {
+ // CHECK: vector.interleave %{{.*}}, %{{.*}} : vector<2x8xf32>
+ %0 = vector.interleave %a, %b : vector<2x8xf32>
+ return %0 : vector<2x16xf32>
+}
+
+// CHECK-LABEL: @interleave_2d_scalable
+func.func @interleave_2d_scalable(%a: vector<2x[2]xf64>, %b: vector<2x[2]xf64>) -> vector<2x[4]xf64> {
+ // CHECK: vector.interleave %{{.*}}, %{{.*}} : vector<2x[2]xf64>
+ %0 = vector.interleave %a, %b : vector<2x[2]xf64>
+ return %0 : vector<2x[4]xf64>
+}
diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir
index f601be041681..af6e636245b0 100644
--- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir
@@ -206,6 +206,16 @@ func.func @cast_away_extract_strided_slice_leading_one_dims(%arg0: vector<1x8x8x
return %0: vector<1x1x8xf16>
}
+// CHECK-LABEL: func @cast_away_extract_strided_slice_leading_one_dims_scalable
+func.func @cast_away_extract_strided_slice_leading_one_dims_scalable(%arg0: vector<1x8x[8]xf16>) -> vector<1x1x[8]xf16> {
+ // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<8x[8]xf16> from vector<1x8x[8]xf16>
+ // CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[SRC]] {offsets = [4], sizes = [1], strides = [1]} : vector<8x[8]xf16> to vector<1x[8]xf16>
+ %0 = vector.extract_strided_slice %arg0 {offsets = [0, 4], sizes = [1, 1], strides = [1, 1]} : vector<1x8x[8]xf16> to vector<1x1x[8]xf16>
+ // CHECK: %[[RET:.+]] = vector.broadcast %[[EXTRACT]] : vector<1x[8]xf16> to vector<1x1x[8]xf16>
+ // CHECK: return %[[RET]]
+ return %0: vector<1x1x[8]xf16>
+}
+
// CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims
func.func @cast_away_insert_strided_slice_leading_one_dims(%arg0: vector<1x8xf16>, %arg1: vector<1x8x8xf16>) -> vector<1x8x8xf16> {
// CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<8xf16> from vector<1x8xf16>
@@ -217,6 +227,17 @@ func.func @cast_away_insert_strided_slice_leading_one_dims(%arg0: vector<1x8xf16
return %0: vector<1x8x8xf16>
}
+// CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_scalable
+func.func @cast_away_insert_strided_slice_leading_one_dims_scalable(%arg0: vector<1x[8]xf16>, %arg1: vector<1x8x[8]xf16>) -> vector<1x8x[8]xf16> {
+ // CHECK: %[[SRC:.+]] = vector.extract %{{.*}}[0] : vector<[8]xf16> from vector<1x[8]xf16>
+ // CHECK: %[[DST:.+]] = vector.extract %{{.*}}[0] : vector<8x[8]xf16> from vector<1x8x[8]xf16>
+ // CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[SRC]], %[[DST]] {offsets = [0, 0], strides = [1]} : vector<[8]xf16> into vector<8x[8]xf16>
+ %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [0, 0, 0], strides = [1, 1]} : vector<1x[8]xf16> into vector<1x8x[8]xf16>
+ // CHECK: %[[RET:.+]] = vector.broadcast %[[INSERT]] : vector<8x[8]xf16> to vector<1x8x[8]xf16>
+ // CHECK: return %[[RET]]
+ return %0: vector<1x8x[8]xf16>
+}
+
// CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_one_element
// CHECK-SAME: %[[ARG0:.+]]: vector<1x1xf16>, %{{.+}}: vector<1x1x1xf16>
func.func @cast_away_insert_strided_slice_leading_one_dims_one_element(%arg0: vector<1x1xf16>, %arg1: vector<1x1x1xf16>) -> vector<1x1x1xf16> {
@@ -227,6 +248,16 @@ func.func @cast_away_insert_strided_slice_leading_one_dims_one_element(%arg0: ve
return %0: vector<1x1x1xf16>
}
+// CHECK-LABEL: func @cast_away_insert_strided_slice_leading_one_dims_one_element_scalable
+// CHECK-SAME: %[[ARG0:.+]]: vector<1x[1]xf16>, %{{.+}}: vector<1x1x[1]xf16>
+func.func @cast_away_insert_strided_slice_leading_one_dims_one_element_scalable(%arg0: vector<1x[1]xf16>, %arg1: vector<1x1x[1]xf16>) -> vector<1x1x[1]xf16> {
+ // CHECK: %[[EXT:.+]] = vector.extract %{{.*}}[0] : vector<[1]xf16> from vector<1x[1]xf16>
+ // CHECK: %[[B:.+]] = vector.broadcast %[[EXT]] : vector<[1]xf16> to vector<1x1x[1]xf16>
+ %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [0, 0, 0], strides = [1, 1]} : vector<1x[1]xf16> into vector<1x1x[1]xf16>
+ // CHECK: return %[[B]]
+ return %0: vector<1x1x[1]xf16>
+}
+
// CHECK-LABEL: func @cast_away_transfer_read_leading_one_dims
func.func @cast_away_transfer_read_leading_one_dims(%arg0: memref<1x4x8x16xf16>) -> vector<1x4xf16> {
// CHECK: %[[C0:.+]] = arith.constant 0 : index
diff --git a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir
index 69b10aed49e3..2264ade7f9b7 100644
--- a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir
+++ b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir
@@ -1,8 +1,7 @@
// RUN: mlir-opt %s \
-// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \
-// RUN: bind-first-extra-to-ops=linalg.matmul \
-// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \
-// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \
+// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \
+// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\
+// RUN: canonicalize,cse,symbol-dce)" \
// RUN: --split-input-file --verify-diagnostics
// ****************************** IMPORTANT NOTE ******************************
@@ -12,20 +11,22 @@
//
// ****************************************************************************
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- // expected-note @below {{handle to invalidated ops}}
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // The actual tiling transformation takes tile sizes as attributes.
- // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}}
- %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
- : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ // expected-note @below {{handle to invalidated ops}}
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // The actual tiling transformation takes tile sizes as attributes.
+ // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}}
+ %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
+ : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
- // This is trying to use an invalidated handle leading to undefined behavior.
- // expected-error @below {{uses a handle invalidated by a previously executed transform op}}
- transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul">
- transform.yield
+ // This is trying to use an invalidated handle leading to undefined behavior.
+ // expected-error @below {{uses a handle invalidated by a previously executed transform op}}
+ transform.debug.emit_remark_at %arg1, "remark" : !transform.op<"linalg.matmul">
+ transform.yield
+ }
}
// Original function to optimize.
@@ -52,27 +53,29 @@ func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>,
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // We can cast one type to another as long as operations are compatible
- // with both types. This creates "aliasing" handles.
- // expected-note @below {{handle to invalidated ops}}
- %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to
- !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // We can cast one type to another as long as operations are compatible
+ // with both types. This creates "aliasing" handles.
+ // expected-note @below {{handle to invalidated ops}}
+ %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to
+ !transform.any_op
- // The actual tiling transformation takes tile sizes as attributes.
- // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}}
- %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
- : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
+ // The actual tiling transformation takes tile sizes as attributes.
+ // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}}
+ %tiled, %loop = transform.structured.tile_using_forall %arg1 tile_sizes [4, 32]
+ : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
- // Consuming an operand invalidates the consumed handle and any other handle that is
- // associated with the same payload operations, or payload operations nested in them.
- // expected-error @below {{uses a handle invalidated by a previously executed transform op}}
- transform.debug.emit_remark_at %casted, "remark"
- : !transform.any_op
- transform.yield
+ // Consuming an operand invalidates the consumed handle and any other handle that is
+ // associated with the same payload operations, or payload operations nested in them.
+ // expected-error @below {{uses a handle invalidated by a previously executed transform op}}
+ transform.debug.emit_remark_at %casted, "remark"
+ : !transform.any_op
+ transform.yield
+ }
}
// Original function to optimize.
diff --git a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir
index c4a2f1eea46c..0a84a5cb6c68 100644
--- a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir
+++ b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir
@@ -1,10 +1,8 @@
// RUN: mlir-opt %s \
-// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \
-// RUN: bind-first-extra-to-ops=linalg.matmul \
-// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \
-// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \
+// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \
+// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\
+// RUN: canonicalize,cse,symbol-dce)" \
// RUN: --split-input-file --verify-diagnostics
-
// ****************************** IMPORTANT NOTE ******************************
//
// If you are changing this file, you may also need to change
@@ -45,10 +43,11 @@ func.func private @microkernel(
%init: tensor<4x4xf32>,
%output: tensor<4x4xf32>) -> tensor<4x4xf32>
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
// Since the %arg2 handle is associated with both elementwise operations,
// we need to split it into two handles so we can target only the second
// elementwise operation.
@@ -99,4 +98,5 @@ transform.sequence failures(propagate) {
transform.debug.emit_remark_at %f, "fused" : !transform.any_op
transform.yield
+ }
}
diff --git a/mlir/test/Examples/transform/Ch1/sequence.mlir b/mlir/test/Examples/transform/Ch1/sequence.mlir
index 5de6e6e096f4..3107adccf78f 100644
--- a/mlir/test/Examples/transform/Ch1/sequence.mlir
+++ b/mlir/test/Examples/transform/Ch1/sequence.mlir
@@ -1,8 +1,7 @@
// RUN: mlir-opt %s \
-// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \
-// RUN: bind-first-extra-to-ops=linalg.matmul \
-// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \
-// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\
+// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \
+// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\
+// RUN: canonicalize,cse,symbol-dce)" |\
// RUN: FileCheck %s
// ****************************** IMPORTANT NOTE ******************************
@@ -60,52 +59,54 @@ func.func private @microkernel(
%init: tensor<4x4xf32>,
%output: tensor<4x4xf32>) -> tensor<4x4xf32>
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // Since the %arg2 handle is associated with both elementwise operations,
- // we need to split it into two handles so we can target only the second
- // elementwise operation.
- %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
- -> (!transform.any_op, !transform.any_op)
-
- // The actual tiling transformation takes tile sizes as attributes. It produces a
- // handle to the loop generated during tiling.
- %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // We can now fuse the other operations into the loop. Here, we fuse
- // operations one-by-one. This requires the operation that is being fused
- // to define the value used within the loop, so the order of such fusions
- // is important. We could also use "transform.merge_handles" to obtain
- // a single handle to all operations and give it to `fuse_into_containing_op`
- // that would take care of the ordering in this case.
- %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
- : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Tile again to get the desired size. Note that this time this tiles the
- // "add" operation and fuses matmul into the loop, but doesn't affect the
- // "max" operation. This illustrates the precise targeting with the transform
- // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
- // of which having the same kind.
- %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused_2, %loop_second_2 =
- transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Since outlining is currently only implemented for region-holding operations
- // such as loops, use tiling to size 1 to materialize the outer loop that is
- // going to be outlined.
- %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
- : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">)
-
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // Since the %arg2 handle is associated with both elementwise operations,
+ // we need to split it into two handles so we can target only the second
+ // elementwise operation.
+ %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
+ -> (!transform.any_op, !transform.any_op)
+
+ // The actual tiling transformation takes tile sizes as attributes. It produces a
+ // handle to the loop generated during tiling.
+ %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // We can now fuse the other operations into the loop. Here, we fuse
+ // operations one-by-one. This requires the operation that is being fused
+ // to define the value used within the loop, so the order of such fusions
+ // is important. We could also use "transform.merge_handles" to obtain
+ // a single handle to all operations and give it to `fuse_into_containing_op`
+ // that would take care of the ordering in this case.
+ %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
+ : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Tile again to get the desired size. Note that this time this tiles the
+ // "add" operation and fuses matmul into the loop, but doesn't affect the
+ // "max" operation. This illustrates the precise targeting with the transform
+ // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
+ // of which having the same kind.
+ %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused_2, %loop_second_2 =
+ transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Since outlining is currently only implemented for region-holding operations
+ // such as loops, use tiling to size 1 to materialize the outer loop that is
+ // going to be outlined.
+ %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
+ : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">)
+
+ transform.yield
+ }
}
diff --git a/mlir/test/Examples/transform/Ch2/invalid.mlir b/mlir/test/Examples/transform/Ch2/invalid.mlir
index ad536832d9c5..cb6738974fe2 100644
--- a/mlir/test/Examples/transform/Ch2/invalid.mlir
+++ b/mlir/test/Examples/transform/Ch2/invalid.mlir
@@ -1,11 +1,11 @@
-// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics
+// RUN: transform-opt-ch2 %s --transform-interpreter --split-input-file \
+// RUN: --verify-diagnostics
// expected-note @below {{offending payload}}
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
// expected-error @below {{only applies to func.call payloads}}
transform.my.change_call_target %arg0, "updated" : !transform.any_op
- yield
+ transform.yield
}
}
diff --git a/mlir/test/Examples/transform/Ch2/ops.mlir b/mlir/test/Examples/transform/Ch2/ops.mlir
index d66f89b9ec8d..410a6e39a480 100644
--- a/mlir/test/Examples/transform/Ch2/ops.mlir
+++ b/mlir/test/Examples/transform/Ch2/ops.mlir
@@ -1,4 +1,4 @@
-// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter | FileCheck %s
+// RUN: transform-opt-ch2 %s --transform-interpreter | FileCheck %s
// ****************************** IMPORTANT NOTE ******************************
//
@@ -17,10 +17,11 @@ func.func @test() {
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op
- transform.my.change_call_target %call, "updated" : !transform.any_op
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op
+ transform.my.change_call_target %call, "updated" : !transform.any_op
+ transform.yield
+ }
}
diff --git a/mlir/test/Examples/transform/Ch2/sequence.mlir b/mlir/test/Examples/transform/Ch2/sequence.mlir
index b6f32dc321ef..976df1d55503 100644
--- a/mlir/test/Examples/transform/Ch2/sequence.mlir
+++ b/mlir/test/Examples/transform/Ch2/sequence.mlir
@@ -1,8 +1,7 @@
// RUN: transform-opt-ch2 %s \
-// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \
-// RUN: bind-first-extra-to-ops=linalg.matmul \
-// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \
-// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\
+// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \
+// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\
+// RUN: canonicalize,cse,symbol-dce)" |\
// RUN: FileCheck %s
// ****************************** IMPORTANT NOTE ******************************
@@ -56,55 +55,57 @@ func.func private @microkernel(
%init: tensor<4x4xf32>,
%output: tensor<4x4xf32>) -> tensor<4x4xf32>
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // Since the %arg2 handle is associated with both elementwise operations,
- // we need to split it into two handles so we can target only the second
- // elementwise operation.
- %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
- -> (!transform.any_op, !transform.any_op)
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // Since the %arg2 handle is associated with both elementwise operations,
+ // we need to split it into two handles so we can target only the second
+ // elementwise operation.
+ %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
+ -> (!transform.any_op, !transform.any_op)
- // The actual tiling transformation takes tile sizes as attributes. It produces a
- // handle to the loop generated during tiling.
- %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // The actual tiling transformation takes tile sizes as attributes. It produces a
+ // handle to the loop generated during tiling.
+ %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // We can now fuse the other operations into the loop. Here, we fuse
- // operations one-by-one. This requires the operation that is being fused
- // to define the value used within the loop, so the order of such fusions
- // is important. We could also use "transform.merge_handles" to obtain
- // a single handle to all operations and give it to `fuse_into_containing_op`
- // that would take care of the ordering in this case.
- %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
- : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // We can now fuse the other operations into the loop. Here, we fuse
+ // operations one-by-one. This requires the operation that is being fused
+ // to define the value used within the loop, so the order of such fusions
+ // is important. We could also use "transform.merge_handles" to obtain
+ // a single handle to all operations and give it to `fuse_into_containing_op`
+ // that would take care of the ordering in this case.
+ %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
+ : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Tile again to get the desired size. Note that this time this tiles the
- // "add" operation and fuses matmul into the loop, but doesn't affect the
- // "max" operation. This illustrates the precise targeting with the transform
- // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
- // of which having the same kind.
- %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused_2, %loop_second_2 =
- transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Tile again to get the desired size. Note that this time this tiles the
+ // "add" operation and fuses matmul into the loop, but doesn't affect the
+ // "max" operation. This illustrates the precise targeting with the transform
+ // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
+ // of which having the same kind.
+ %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused_2, %loop_second_2 =
+ transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Since outlining is currently only implemented for region-holding operations
- // such as loops, use tiling to size 1 to materialize the outer loop that is
- // going to be outlined.
- %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Since outlining is currently only implemented for region-holding operations
+ // such as loops, use tiling to size 1 to materialize the outer loop that is
+ // going to be outlined.
+ %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Rewrite the call target.
- transform.my.change_call_target %call, "microkernel" : !transform.any_op
+ // Rewrite the call target.
+ transform.my.change_call_target %call, "microkernel" : !transform.any_op
- transform.yield
+ transform.yield
+ }
}
diff --git a/mlir/test/Examples/transform/Ch3/invalid.mlir b/mlir/test/Examples/transform/Ch3/invalid.mlir
index 222629504fea..acaabd5db30e 100644
--- a/mlir/test/Examples/transform/Ch3/invalid.mlir
+++ b/mlir/test/Examples/transform/Ch3/invalid.mlir
@@ -1,10 +1,10 @@
-// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics
+// RUN: transform-opt-ch3 %s --transform-interpreter --split-input-file --verify-diagnostics
// expected-note @below {{offending operation}}
-module {
- transform.sequence failures(suppress) {
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
// expected-error @below {{expected the payload operation to implement CallOpInterface}}
- ^bb0(%arg0: !transform.my.call_op_interface):
- yield
+ %arg0: !transform.my.call_op_interface) {
+ transform.yield
}
}
diff --git a/mlir/test/Examples/transform/Ch3/ops.mlir b/mlir/test/Examples/transform/Ch3/ops.mlir
index f4170b8918bf..b2d47cc369a5 100644
--- a/mlir/test/Examples/transform/Ch3/ops.mlir
+++ b/mlir/test/Examples/transform/Ch3/ops.mlir
@@ -1,4 +1,4 @@
-// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter \
+// RUN: transform-opt-ch3 %s --transform-interpreter \
// RUN: --allow-unregistered-dialect --split-input-file | FileCheck %s
// ****************************** IMPORTANT NOTE ******************************
@@ -18,12 +18,13 @@ func.func @test1() {
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call">
- // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call">
- transform.my.change_call_target %call, "updated" : !transform.op<"func.call">
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call">
+ // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call">
+ transform.my.change_call_target %call, "updated" : !transform.op<"func.call">
+ transform.yield
+ }
}
// -----
@@ -37,10 +38,11 @@ func.func @test2() {
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface
- // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op
- transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface
+ // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op
+ transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op
+ transform.yield
+ }
}
diff --git a/mlir/test/Examples/transform/Ch3/sequence.mlir b/mlir/test/Examples/transform/Ch3/sequence.mlir
index 9dd46b347c5b..8dc33c3560c2 100644
--- a/mlir/test/Examples/transform/Ch3/sequence.mlir
+++ b/mlir/test/Examples/transform/Ch3/sequence.mlir
@@ -1,8 +1,7 @@
-// RUN: transform-opt-ch2 %s \
-// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \
-// RUN: bind-first-extra-to-ops=linalg.matmul \
-// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \
-// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\
+// RUN: transform-opt-ch3 %s \
+// RUN: --pass-pipeline="builtin.module(transform-interpreter{ \
+// RUN: debug-bind-trailing-args=linalg.matmul,linalg.elemwise_binary},\
+// RUN: canonicalize,cse,symbol-dce)" |\
// RUN: FileCheck %s
// ****************************** IMPORTANT NOTE ******************************
@@ -56,55 +55,57 @@ func.func private @microkernel(
%init: tensor<4x4xf32>,
%output: tensor<4x4xf32>) -> tensor<4x4xf32>
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op,
- %arg1: !transform.op<"linalg.matmul">,
- %arg2: !transform.op<"linalg.elemwise_binary">):
- // Since the %arg2 handle is associated with both elementwise operations,
- // we need to split it into two handles so we can target only the second
- // elementwise operation.
- %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
- -> (!transform.any_op, !transform.any_op)
-
- // The actual tiling transformation takes tile sizes as attributes. It produces a
- // handle to the loop generated during tiling.
- %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // We can now fuse the other operations into the loop. Here, we fuse
- // operations one-by-one. This requires the operation that is being fused
- // to define the value used within the loop, so the order of such fusions
- // is important. We could also use "transform.merge_handles" to obtain
- // a single handle to all operations and give it to `fuse_into_containing_op`
- // that would take care of the ordering in this case.
- %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
- : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Tile again to get the desired size. Note that this time this tiles the
- // "add" operation and fuses matmul into the loop, but doesn't affect the
- // "max" operation. This illustrates the precise targeting with the transform
- // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
- // of which having the same kind.
- %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul_fused_2, %loop_second_2 =
- transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Since outlining is currently only implemented for region-holding operations
- // such as loops, use tiling to size 1 to materialize the outer loop that is
- // going to be outlined.
- %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
- : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">)
-
- // Rewrite the call target.
- transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call">
-
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op,
+ %arg1: !transform.op<"linalg.matmul">,
+ %arg2: !transform.op<"linalg.elemwise_binary">) {
+ // Since the %arg2 handle is associated with both elementwise operations,
+ // we need to split it into two handles so we can target only the second
+ // elementwise operation.
+ %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">)
+ -> (!transform.any_op, !transform.any_op)
+
+ // The actual tiling transformation takes tile sizes as attributes. It produces a
+ // handle to the loop generated during tiling.
+ %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // We can now fuse the other operations into the loop. Here, we fuse
+ // operations one-by-one. This requires the operation that is being fused
+ // to define the value used within the loop, so the order of such fusions
+ // is important. We could also use "transform.merge_handles" to obtain
+ // a single handle to all operations and give it to `fuse_into_containing_op`
+ // that would take care of the ordering in this case.
+ %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2
+ : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Tile again to get the desired size. Note that this time this tiles the
+ // "add" operation and fuses matmul into the loop, but doesn't affect the
+ // "max" operation. This illustrates the precise targeting with the transform
+ // dialect. Otherwise, it is difficult to differentiate "add" and "max", both
+ // of which having the same kind.
+ %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul_fused_2, %loop_second_2 =
+ transform.structured.fuse_into_containing_op %matmul_fused into %loop_second
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Since outlining is currently only implemented for region-holding operations
+ // such as loops, use tiling to size 1 to materialize the outer loop that is
+ // going to be outlined.
+ %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %func, %call = transform.loop.outline %outline_target {func_name = "outlined"}
+ : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">)
+
+ // Rewrite the call target.
+ transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call">
+
+ transform.yield
+ }
}
diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir
index d90d740b4453..f8d910370bc2 100644
--- a/mlir/test/Examples/transform/ChH/full.mlir
+++ b/mlir/test/Examples/transform/ChH/full.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter \
+// RUN: mlir-opt %s --transform-interpreter \
// RUN: --test-transform-dialect-erase-schedule \
// RUN: --math-uplift-to-fma \
// RUN: --convert-bufferization-to-memref \
@@ -115,9 +115,9 @@ module attributes { transform.with_named_sequence } {
// have no effect on the Halide IR as of 294f80c49bf3bb8582446613c25fcce03b82.
// Also note that the order of dimensions in Halide is inverted, e.g., co and
// n are the outermost loops in the respective reorder directives.
- transform.sequence failures(propagate) {
+ transform.named_sequence @__transform_main(
// This argument will point to the top-level module.
- ^bb0(%arg0: !transform.any_op):
+ %arg0: !transform.any_op) {
// 1. Find the operations we are going to transform usnig their names. This
// is a simplistic approach that works when there are few operations in the
diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
index 349b92a7aefa..a42ed6968d37 100644
--- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
+++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
@@ -106,6 +106,27 @@ func.func @angle(%arg: complex<f32>) -> f32 {
func.return %angle : f32
}
+func.func @test_element_f64(%input: tensor<?xcomplex<f64>>,
+ %func: (complex<f64>) -> f64) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %size = tensor.dim %input, %c0: tensor<?xcomplex<f64>>
+
+ scf.for %i = %c0 to %size step %c1 {
+ %elem = tensor.extract %input[%i]: tensor<?xcomplex<f64>>
+
+ %val = func.call_indirect %func(%elem) : (complex<f64>) -> f64
+ vector.print %val : f64
+ scf.yield
+ }
+ func.return
+}
+
+func.func @abs(%arg: complex<f64>) -> f64 {
+ %abs = complex.abs %arg : complex<f64>
+ func.return %abs : f64
+}
+
func.func @entry() {
// complex.sqrt test
%sqrt_test = arith.constant dense<[
@@ -300,5 +321,38 @@ func.func @entry() {
call @test_element(%angle_test_cast, %angle_func)
: (tensor<?xcomplex<f32>>, (complex<f32>) -> f32) -> ()
+ // complex.abs test
+ %abs_test = arith.constant dense<[
+ (1.0, 1.0),
+ // CHECK: 1.414
+ (1.0e300, 1.0e300),
+ // CHECK-NEXT: 1.41421e+300
+ (1.0e-300, 1.0e-300),
+ // CHECK-NEXT: 1.41421e-300
+ (5.0, 0.0),
+ // CHECK-NEXT: 5
+ (0.0, 6.0),
+ // CHECK-NEXT: 6
+ (7.0, 8.0),
+ // CHECK-NEXT: 10.6301
+ (-1.0, -1.0),
+ // CHECK-NEXT: 1.414
+ (-1.0e300, -1.0e300),
+ // CHECK-NEXT: 1.41421e+300
+ (-1.0, 0.0),
+ // CHECK-NOT: -1
+ // CHECK-NEXT: 1
+ (0.0, -1.0)
+ // CHECK-NOT: -1
+ // CHECK-NEXT: 1
+ ]> : tensor<10xcomplex<f64>>
+ %abs_test_cast = tensor.cast %abs_test
+ : tensor<10xcomplex<f64>> to tensor<?xcomplex<f64>>
+
+ %abs_func = func.constant @abs : (complex<f64>) -> f64
+
+ call @test_element_f64(%abs_test_cast, %abs_func)
+ : (tensor<?xcomplex<f64>>, (complex<f64>) -> f64) -> ()
+
func.return
}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
index 327f237ba894..d5c35068ccb3 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
@@ -73,14 +73,14 @@ module attributes {transform.with_named_sequence} {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %module
: (!transform.any_op) -> !transform.any_op
- // Step 1: Tile for size [8] x [8], which corresponds to (2 x SVLs) x (2 x SVLs),
- // where SVLs is the number of 32-bit elements in a vector of SVL bits.
- // This uses all four 32-bit SME virtual tiles.
- %tiled_linalg_op, %loop_i, %loop_j, %loop_k = transform.structured.tile_using_for %matmul[[8], [8], 1]
+ // Step 1: Tile for size [8] x [8] (unrolled by 4), which corresponds to
+ // (2 x SVLs) x (2 x SVLs), where SVLs is the number of 32-bit elements in a
+ // vector of SVL bits. This uses all four 32-bit SME virtual tiles.
+ %tiled_linalg_op, %loop_i, %loop_j, %loop_k = transform.structured.tile_using_for %matmul[[8], [8], 4]
: (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
// Step 2: Vectorize.
- transform.structured.vectorize %tiled_linalg_op vector_sizes [[8], [8], 1]
+ transform.structured.vectorize %tiled_linalg_op vector_sizes [[8], [8], 4]
: !transform.any_op
// Step 3: Bufferize ahead of TransferReadDropUnitDimsPattern, which
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index 17393412badf..22cf15d4f640 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -98,7 +98,6 @@ module attributes {transform.with_named_sequence} {
// Step 1: Tile
%tiled_matmul, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- transform.print %tiled_matmul {name = "matmul lal"}: !transform.any_op
// Step 2: Vectorize
transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
index 4bc080fc538f..e47ac46597b7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
@@ -59,7 +59,7 @@
map = ( i, j ) ->
( i : dense,
j floordiv 4 : dense,
- j mod 4 : block2_4
+ j mod 4 : structured[2, 4]
),
}>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
index df5b48a3b6ec..251944c657cb 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
@@ -41,7 +41,14 @@
#NV_24 = #sparse_tensor.encoding<{
map = ( i, j ) -> ( i : dense,
j floordiv 4 : dense,
- j mod 4 : block2_4),
+ j mod 4 : structured[2, 4]),
+ crdWidth = 8
+}>
+
+#NV_58 = #sparse_tensor.encoding<{
+ map = ( i, j ) -> ( i : dense,
+ j floordiv 8 : dense,
+ j mod 8 : structured[5, 8]),
crdWidth = 8
}>
@@ -65,6 +72,7 @@ module {
%A1 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #CSR>
%A2 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #CSR_hi>
%A3 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #NV_24>
+ %A4 = sparse_tensor.new %fileName : !Filename to tensor<?x?xf64, #NV_58>
//
// CSR:
@@ -113,10 +121,24 @@ module {
%vecv3 = vector.transfer_read %val3[%c0], %f0 : memref<?xf64>, vector<12xf64>
vector.print %vecv3 : vector<12xf64>
+ //
+ // NV_58
+ //
+ // CHECK-NEXT: ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 )
+ // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
+ //
+ %crd4 = sparse_tensor.coordinates %A4 {level = 2 : index } : tensor<?x?xf64, #NV_58> to memref<?xi8>
+ %vecc4 = vector.transfer_read %crd4[%c0], %u0 : memref<?xi8>, vector<12xi8>
+ vector.print %vecc4 : vector<12xi8>
+ %val4 = sparse_tensor.values %A4 : tensor<?x?xf64, #NV_58> to memref<?xf64>
+ %vecv4 = vector.transfer_read %val4[%c0], %f0 : memref<?xf64>, vector<12xf64>
+ vector.print %vecv4 : vector<12xf64>
+
// Release the resources.
bufferization.dealloc_tensor %A1: tensor<?x?xf64, #CSR>
bufferization.dealloc_tensor %A2: tensor<?x?xf64, #CSR_hi>
bufferization.dealloc_tensor %A3: tensor<?x?xf64, #NV_24>
+ bufferization.dealloc_tensor %A4: tensor<?x?xf64, #NV_58>
return
}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index 17b50b46d073..b0f63f12c2d5 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -20,7 +20,7 @@
map = ( i, j ) ->
( i : dense,
j floordiv 4 : dense,
- j mod 4 : block2_4
+ j mod 4 : structured[2, 4]
)
}>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
index eb99a027a886..311cb607b429 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -20,7 +20,7 @@
map = ( i, j ) ->
( i : dense,
j floordiv 4 : dense,
- j mod 4 : block2_4
+ j mod 4 : structured[2, 4]
)
}>
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index 35ca0ee8677c..51bcf459d83d 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -197,7 +197,8 @@ func.func @main() {
{
%ticks = arith.constant 10000000 : index
// TMA wait
- nvgpu.mbarrier.try_wait.parity %barrier[%i], %c0, %ticks : !barrierType
+ %phase_c0 = arith.constant 0 : i1
+ nvgpu.mbarrier.try_wait.parity %barrier[%i], %phase_c0, %ticks : !barrierType
%lhsSlice = memref.subview %lhsShmem [%i, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, strided<[64, 1], offset: ?>, 3>
%rhsSlice = memref.subview %rhsShmem [%i, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: ?>, 3>
// Descriptor WGMMA
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index 5a10bbba26d8..85bdb38d67f0 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -206,7 +206,8 @@ func.func @main() {
{
%ticks = arith.constant 10000000 : index
// TMA wait
- nvgpu.mbarrier.try_wait.parity %barrier[%i], %c0, %ticks : !barrierType
+ %phase_c0 = arith.constant 0 : i1
+ nvgpu.mbarrier.try_wait.parity %barrier[%i], %phase_c0, %ticks : !barrierType
%lhsSlice = memref.subview %lhsShmem [%i, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, strided<[64, 1], offset: ?>, 3>
%rhsSlice = memref.subview %rhsShmem [%i, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: ?>, 3>
// Descriptor WGMMA
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index 9c5aacf96b0d..b50772f8249f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -93,7 +93,8 @@ module @mymod {
}
// Step 8. Wait until TMA is done
- nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
+ %phase_c0 = arith.constant 0 : i1
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %phase_c0, %c10000000 : !barrierType
// Step 9. Print loaded data in 128b swizzled
scf.if %10 {
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 536e71d260f5..65e5fc0aff6a 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -119,7 +119,8 @@ module @mymod {
}
// Step 7. Wait until TMA is done
- nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
+ %phase_c0 = arith.constant 0 : i1
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %phase_c0, %c10000000 : !barrierType
// Step 8. Print loaded data in 128b swizzled
scf.if %10 {
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index aee265e3faf1..2e59b7234e53 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -96,7 +96,8 @@ module @mymod {
} else {
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
}
- nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+ %phase_c0 = arith.constant 0 : i1
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %phase_c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
scf.if %10 {
%11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
%12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
diff --git a/mlir/test/Target/LLVMIR/arm-sve.mlir b/mlir/test/Target/LLVMIR/arm-sve.mlir
index b63d3f065156..c7cd1b74ccdb 100644
--- a/mlir/test/Target/LLVMIR/arm-sve.mlir
+++ b/mlir/test/Target/LLVMIR/arm-sve.mlir
@@ -314,3 +314,45 @@ llvm.func @arm_sve_convert_to_svbool(
: (vector<[1]xi1>) -> vector<[16]xi1>
llvm.return
}
+
+// CHECK-LABEL: arm_sve_zip_x2(
+// CHECK-SAME: <vscale x 16 x i8> %[[V1:[0-9]+]],
+// CHECK-SAME: <vscale x 8 x i16> %[[V2:[0-9]+]],
+// CHECK-SAME: <vscale x 4 x i32> %[[V3:[0-9]+]],
+// CHECK-SAME: <vscale x 2 x i64> %[[V4:[0-9]+]])
+llvm.func @arm_sve_zip_x2(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, %nxv4i32: vector<[4]xi32>, %nxv2i64: vector<[2]xi64>) {
+ // CHECK: call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %[[V1]], <vscale x 16 x i8> %[[V1]])
+ %0 = "arm_sve.intr.zip.x2"(%nxv16i8, %nxv16i8) : (vector<[16]xi8>, vector<[16]xi8>)
+ -> !llvm.struct<(vector<[16]xi8>, vector<[16]xi8>)>
+ // CHECK: call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x2.nxv8i16(<vscale x 8 x i16> %[[V2]], <vscale x 8 x i16> %[[V2]])
+ %1 = "arm_sve.intr.zip.x2"(%nxv8i16, %nxv8i16) : (vector<[8]xi16>, vector<[8]xi16>)
+ -> !llvm.struct<(vector<[8]xi16>, vector<[8]xi16>)>
+ // CHECK: call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.zip.x2.nxv4i32(<vscale x 4 x i32> %[[V3]], <vscale x 4 x i32> %[[V3]])
+ %2 = "arm_sve.intr.zip.x2"(%nxv4i32, %nxv4i32) : (vector<[4]xi32>, vector<[4]xi32>)
+ -> !llvm.struct<(vector<[4]xi32>, vector<[4]xi32>)>
+ // CHECK: call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.zip.x2.nxv2i64(<vscale x 2 x i64> %[[V4]], <vscale x 2 x i64> %[[V4]])
+ %3 = "arm_sve.intr.zip.x2"(%nxv2i64, %nxv2i64) : (vector<[2]xi64>, vector<[2]xi64>)
+ -> !llvm.struct<(vector<[2]xi64>, vector<[2]xi64>)>
+ llvm.return
+}
+
+// CHECK-LABEL: arm_sve_zip_x4(
+// CHECK-SAME: <vscale x 16 x i8> %[[V1:[0-9]+]],
+// CHECK-SAME: <vscale x 8 x i16> %[[V2:[0-9]+]],
+// CHECK-SAME: <vscale x 4 x i32> %[[V3:[0-9]+]],
+// CHECK-SAME: <vscale x 2 x i64> %[[V4:[0-9]+]])
+llvm.func @arm_sve_zip_x4(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, %nxv4i32: vector<[4]xi32>, %nxv2i64: vector<[2]xi64>) {
+ // CHECK: call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x4.nxv16i8(<vscale x 16 x i8> %[[V1]], <vscale x 16 x i8> %[[V1]], <vscale x 16 x i8> %[[V1]], <vscale x 16 x i8> %[[V1]])
+ %0 = "arm_sve.intr.zip.x4"(%nxv16i8, %nxv16i8, %nxv16i8, %nxv16i8) : (vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>)
+ -> !llvm.struct<(vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>, vector<[16]xi8>)>
+ // CHECK: call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.zip.x4.nxv8i16(<vscale x 8 x i16> %[[V2]], <vscale x 8 x i16> %[[V2]], <vscale x 8 x i16> %[[V2]], <vscale x 8 x i16> %[[V2]])
+ %1 = "arm_sve.intr.zip.x4"(%nxv8i16, %nxv8i16, %nxv8i16, %nxv8i16) : (vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>)
+ -> !llvm.struct<(vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>, vector<[8]xi16>)>
+ // CHECK: call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.zip.x4.nxv4i32(<vscale x 4 x i32> %[[V3]], <vscale x 4 x i32> %[[V3]], <vscale x 4 x i32> %[[V3]], <vscale x 4 x i32> %[[V3]])
+ %2 = "arm_sve.intr.zip.x4"(%nxv4i32, %nxv4i32, %nxv4i32, %nxv4i32) : (vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>)
+ -> !llvm.struct<(vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>, vector<[4]xi32>)>
+ // CHECK: call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.zip.x4.nxv2i64(<vscale x 2 x i64> %[[V4]], <vscale x 2 x i64> %[[V4]], <vscale x 2 x i64> %[[V4]], <vscale x 2 x i64> %[[V4]])
+ %3 = "arm_sve.intr.zip.x4"(%nxv2i64, %nxv2i64, %nxv2i64, %nxv2i64) : (vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>)
+ -> !llvm.struct<(vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>, vector<[2]xi64>)>
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 49f9426daabc..3a6a4544e20d 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -538,3 +538,20 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 3, 4
llvm.return
}
+// -----
+// CHECK: !nvvm.annotations =
+// CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
+// CHECK: !2 = !{i32 1}
+// CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
+llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
+ llvm.return
+}
+
+// -----
+// CHECK: !nvvm.annotations =
+// CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
+// CHECK: !2 = !{i32 1, i32 3}
+// CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
+llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
index 93ab578df9e4..dae83c0cf92e 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
@@ -441,9 +441,11 @@ atomic {
llvm.func @simple_reduction_parallel() {
%c1 = llvm.mlir.constant(1 : i32) : i32
%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
- omp.parallel reduction(@add_f32 -> %0 : !llvm.ptr) {
+ omp.parallel reduction(@add_f32 %0 -> %prv : !llvm.ptr) {
%1 = llvm.mlir.constant(2.0 : f32) : f32
- omp.reduction %1, %0 : f32, !llvm.ptr
+ %2 = llvm.load %prv : !llvm.ptr -> f32
+ %3 = llvm.fadd %2, %1 : f32
+ llvm.store %3, %prv : f32, !llvm.ptr
omp.terminator
}
llvm.return
@@ -512,10 +514,12 @@ llvm.func @parallel_nested_workshare_reduction(%ub : i64) {
%lb = llvm.mlir.constant(1 : i64) : i64
%step = llvm.mlir.constant(1 : i64) : i64
- omp.parallel reduction(@add_i32 -> %0 : !llvm.ptr) {
+ omp.parallel reduction(@add_i32 %0 -> %prv : !llvm.ptr) {
omp.wsloop for (%iv) : i64 = (%lb) to (%ub) step (%step) {
%ival = llvm.trunc %iv : i64 to i32
- omp.reduction %ival, %0 : i32, !llvm.ptr
+ %lprv = llvm.load %prv : !llvm.ptr -> i32
+ %add = llvm.add %lprv, %ival : i32
+ llvm.store %add, %prv : i32, !llvm.ptr
omp.yield
}
omp.terminator
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 26123300d748..06b78650c8d0 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -90,6 +90,22 @@ llvm.func @rocdl.barrier() {
llvm.return
}
+llvm.func @rocdl.setprio() {
+ // CHECK: call void @llvm.amdgcn.s.setprio(i16 0)
+ rocdl.s.setprio 0
+ // CHECK-NEXT: call void @llvm.amdgcn.s.setprio(i16 1)
+ rocdl.s.setprio 1
+ llvm.return
+}
+
+llvm.func @rocdl.schedbarrier() {
+ // CHECK: call void @llvm.amdgcn.sched.barrier(i32 0)
+ rocdl.sched.barrier 0
+ // CHECK-NEXT: call void @llvm.amdgcn.sched.barrier(i32 1)
+ rocdl.sched.barrier 1
+ llvm.return
+}
+
llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32,
%arg2 : vector<32 x f32>, %arg3: i32,
%arg4 : vector<16 x f32>, %arg5 : vector<4xf32>,
diff --git a/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp b/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp
index fedfbe350a51..2991a3c165ee 100644
--- a/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp
+++ b/mlir/test/lib/Dialect/Bufferization/TestTensorCopyInsertion.cpp
@@ -44,8 +44,10 @@ struct TestTensorCopyInsertionPass
bufferization::OneShotBufferizationOptions options;
options.allowReturnAllocsFromLoops = allowReturnAllocsFromLoops;
options.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
- if (mustInferMemorySpace)
- options.defaultMemorySpace = std::nullopt;
+ if (mustInferMemorySpace) {
+ options.defaultMemorySpaceFn =
+ [](TensorType t) -> std::optional<Attribute> { return std::nullopt; };
+ }
if (failed(bufferization::insertTensorCopies(getOperation(), options)))
signalPassFailure();
}
diff --git a/mlir/test/lib/Dialect/SCF/CMakeLists.txt b/mlir/test/lib/Dialect/SCF/CMakeLists.txt
index 22c2f2388de6..d93bd5591518 100644
--- a/mlir/test/lib/Dialect/SCF/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/SCF/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_library(MLIRSCFTestPasses
TestLoopParametricTiling.cpp
TestLoopUnrolling.cpp
TestSCFUtils.cpp
+ TestSCFWrapInZeroTripCheck.cpp
TestWhileOpBuilder.cpp
EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
new file mode 100644
index 000000000000..10206dd7cedf
--- /dev/null
+++ b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
@@ -0,0 +1,72 @@
+//===- TestWrapInZeroTripCheck.cpp -- Passes to test SCF zero-trip-check --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the passes to test wrap-in-zero-trip-check transforms on
+// SCF loop ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+
+struct TestWrapWhileLoopInZeroTripCheckPass
+ : public PassWrapper<TestWrapWhileLoopInZeroTripCheckPass,
+ OperationPass<func::FuncOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+ TestWrapWhileLoopInZeroTripCheckPass)
+
+ StringRef getArgument() const final {
+ return "test-wrap-scf-while-loop-in-zero-trip-check";
+ }
+
+ StringRef getDescription() const final {
+ return "test scf::wrapWhileLoopInZeroTripCheck";
+ }
+
+ TestWrapWhileLoopInZeroTripCheckPass() = default;
+ TestWrapWhileLoopInZeroTripCheckPass(
+ const TestWrapWhileLoopInZeroTripCheckPass &) {}
+ explicit TestWrapWhileLoopInZeroTripCheckPass(bool forceCreateCheckParam) {
+ forceCreateCheck = forceCreateCheckParam;
+ }
+
+ void runOnOperation() override {
+ func::FuncOp func = getOperation();
+ MLIRContext *context = &getContext();
+ IRRewriter rewriter(context);
+ func.walk([&](scf::WhileOp op) {
+ FailureOr<scf::WhileOp> result =
+ scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck);
+ // Ignore not implemented failure in tests. The expected output should
+ // catch problems (e.g. transformation doesn't happen).
+ (void)result;
+ });
+ }
+
+ Option<bool> forceCreateCheck{
+ *this, "force-create-check",
+ llvm::cl::desc("Force to create zero-trip-check."),
+ llvm::cl::init(false)};
+};
+
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestSCFWrapInZeroTripCheckPasses() {
+ PassRegistration<TestWrapWhileLoopInZeroTripCheckPass>();
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 126d65b1b848..acd38980514a 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -823,6 +823,33 @@ struct TestVectorEmulateMaskedLoadStore final
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
};
+
+struct TestVectorLinearize final
+ : public PassWrapper<TestVectorLinearize, OperationPass<>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestVectorLinearize)
+
+ StringRef getArgument() const override { return "test-vector-linearize"; }
+ StringRef getDescription() const override {
+ return "Linearizes ND vectors for N >= 2 into 1D vectors";
+ }
+ void getDependentDialects(DialectRegistry &registry) const override {
+ registry.insert<vector::VectorDialect>();
+ }
+
+ void runOnOperation() override {
+ auto *context = &getContext();
+
+ TypeConverter typeConverter;
+ RewritePatternSet patterns(context);
+ ConversionTarget target(*context);
+
+ vector::populateVectorLinearizeTypeConversionsAndLegality(typeConverter,
+ patterns, target);
+ if (failed(applyPartialConversion(getOperation(), target,
+ std::move(patterns))))
+ return signalPassFailure();
+ }
+};
} // namespace
namespace mlir {
@@ -867,6 +894,8 @@ void registerTestVectorLowerings() {
PassRegistration<TestFoldArithExtensionIntoVectorContractPatterns>();
PassRegistration<TestVectorEmulateMaskedLoadStore>();
+
+ PassRegistration<TestVectorLinearize>();
}
} // namespace test
} // namespace mlir
diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py
index 946a224dab06..1fa7030ca1be 100644
--- a/mlir/test/python/dialects/sparse_tensor/dialect.py
+++ b/mlir/test/python/dialects/sparse_tensor/dialect.py
@@ -28,7 +28,7 @@ def testEncodingAttr1D():
# CHECK: equal: True
print(f"equal: {casted == parsed}")
- # CHECK: lvl_types: [8]
+ # CHECK: lvl_types: [131072]
print(f"lvl_types: {casted.lvl_types}")
# CHECK: dim_to_lvl: (d0) -> (d0)
print(f"dim_to_lvl: {casted.dim_to_lvl}")
@@ -52,6 +52,90 @@ def testEncodingAttr1D():
print(f"created_pos_width: {created.pos_width}")
+# CHECK-LABEL: TEST: testEncodingAttrStructure
+@run
+def testEncodingAttrStructure():
+ with Context() as ctx:
+ parsed = Attribute.parse(
+ "#sparse_tensor.encoding<{"
+ " map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense,"
+ " d1 mod 4 : structured[2, 4]),"
+ " posWidth = 16,"
+ " crdWidth = 32"
+ "}>"
+ )
+ # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), posWidth = 16, crdWidth = 32 }>
+ print(parsed)
+
+ casted = st.EncodingAttr(parsed)
+ # CHECK: equal: True
+ print(f"equal: {casted == parsed}")
+
+ # CHECK: lvl_types: [65536, 65536, 4406637494272]
+ print(f"lvl_types: {casted.lvl_types}")
+ # CHECK: lvl_types_enum: [<LevelType.dense: 65536>, <LevelType.dense: 65536>, <LevelType.n_out_of_m: 1048576>]
+ print(f"lvl_types_enum: {casted.lvl_types_enum}")
+ # CHECK: structured_n: 2
+ print(f"structured_n: {casted.structured_n}")
+ # CHECK: structured_m: 4
+ print(f"structured_m: {casted.structured_m}")
+ # CHECK: dim_to_lvl: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4)
+ print(f"dim_to_lvl: {casted.dim_to_lvl}")
+ # CHECK: lvl_to_dim: (d0, d1, d2) -> (d0, d1 * 4 + d2)
+ print(f"lvl_to_dim: {casted.lvl_to_dim}")
+ # CHECK: pos_width: 16
+ print(f"pos_width: {casted.pos_width}")
+ # CHECK: crd_width: 32
+ print(f"crd_width: {casted.crd_width}")
+
+ created = st.EncodingAttr.get(
+ casted.lvl_types, casted.dim_to_lvl, casted.lvl_to_dim, 0, 0
+ )
+ # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]) }>
+ print(created)
+ # CHECK: created_equal: False
+ print(f"created_equal: {created == casted}")
+
+ built_2_4 = st.EncodingAttr.build_level_type(st.LevelType.n_out_of_m, 2, 4)
+ dim_to_lvl = AffineMap.get(
+ 2,
+ 0,
+ [
+ AffineExpr.get_dim(0),
+ AffineExpr.get_floor_div(AffineExpr.get_dim(1), 4),
+ AffineExpr.get_mod(AffineExpr.get_dim(1), 4),
+ ],
+ )
+ lvl_to_dim = AffineMap.get(
+ 3,
+ 0,
+ [
+ AffineExpr.get_dim(0),
+ AffineExpr.get_add(
+ AffineExpr.get_mul(AffineExpr.get_dim(1), 4),
+ AffineExpr.get_dim(2),
+ ),
+ ],
+ )
+ built = st.EncodingAttr.get(
+ [st.LevelType.dense, st.LevelType.dense, built_2_4],
+ dim_to_lvl,
+ lvl_to_dim,
+ 0,
+ 0,
+ )
+ # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]) }>
+ print(built)
+ # CHECK: built_equal: True
+ print(f"built_equal: {built == created}")
+
+ # Verify that the factory creates an instance of the proper type.
+ # CHECK: is_proper_instance: True
+ print(f"is_proper_instance: {isinstance(created, st.EncodingAttr)}")
+ # CHECK: created_pos_width: 0
+ print(f"created_pos_width: {created.pos_width}")
+
+
# CHECK-LABEL: TEST: testEncodingAttr2D
@run
def testEncodingAttr2D():
@@ -70,7 +154,7 @@ def testEncodingAttr2D():
# CHECK: equal: True
print(f"equal: {casted == parsed}")
- # CHECK: lvl_types: [4, 8]
+ # CHECK: lvl_types: [65536, 131072]
print(f"lvl_types: {casted.lvl_types}")
# CHECK: dim_to_lvl: (d0, d1) -> (d1, d0)
print(f"dim_to_lvl: {casted.dim_to_lvl}")
diff --git a/mlir/test/python/dialects/transform_loop_ext.py b/mlir/test/python/dialects/transform_loop_ext.py
index 840e7a46e7ce..430b33fba04c 100644
--- a/mlir/test/python/dialects/transform_loop_ext.py
+++ b/mlir/test/python/dialects/transform_loop_ext.py
@@ -49,6 +49,24 @@ def loopPeel():
# CHECK-LABEL: TEST: loopPeel
# CHECK: = transform.loop.peel %
+@run
+def loopPeel_peel_front():
+ sequence = transform.SequenceOp(
+ transform.FailurePropagationMode.Propagate,
+ [],
+ transform.OperationType.get("scf.for"),
+ )
+ with InsertionPoint(sequence.body):
+ loop.LoopPeelOp(
+ transform.AnyOpType.get(),
+ transform.AnyOpType.get(),
+ sequence.bodyTarget,
+ peel_front=True,
+ )
+ transform.YieldOp()
+ # CHECK-LABEL: TEST: loopPeel_peel_front
+ # CHECK: = transform.loop.peel %[[ARG0:.*]] {peel_front = true}
+
@run
def loopPipeline():
diff --git a/mlir/test/python/ir/module.py b/mlir/test/python/ir/module.py
index a5c38a6b0b07..ecafcb46af21 100644
--- a/mlir/test/python/ir/module.py
+++ b/mlir/test/python/ir/module.py
@@ -105,6 +105,10 @@ def testModuleOperation():
assert ctx._get_live_module_count() == 1
op1 = module.operation
assert ctx._get_live_operation_count() == 1
+ live_ops = ctx._get_live_operation_objects()
+ assert len(live_ops) == 1
+ assert live_ops[0] is op1
+ live_ops = None
# CHECK: module @successfulParse
print(op1)
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 1b3f60b4b3ab..cec1e5225d5a 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -127,6 +127,7 @@ void registerTestPadFusion();
void registerTestRecursiveTypesPass();
void registerTestSCFUtilsPass();
void registerTestSCFWhileOpBuilderPass();
+void registerTestSCFWrapInZeroTripCheckPasses();
void registerTestShapeMappingPass();
void registerTestSliceAnalysisPass();
void registerTestTensorCopyInsertionPass();
@@ -250,6 +251,7 @@ void registerTestPasses() {
mlir::test::registerTestRecursiveTypesPass();
mlir::test::registerTestSCFUtilsPass();
mlir::test::registerTestSCFWhileOpBuilderPass();
+ mlir::test::registerTestSCFWrapInZeroTripCheckPasses();
mlir::test::registerTestShapeMappingPass();
mlir::test::registerTestSliceAnalysisPass();
mlir::test::registerTestTensorCopyInsertionPass();
diff --git a/mlir/unittests/IR/AffineMapTest.cpp b/mlir/unittests/IR/AffineMapTest.cpp
new file mode 100644
index 000000000000..081afadd632f
--- /dev/null
+++ b/mlir/unittests/IR/AffineMapTest.cpp
@@ -0,0 +1,23 @@
+//===- AffineMapTest.cpp - unit tests for affine map API ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Builders.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+
+// Test AffineMap replace API for the zero result case.
+TEST(AffineMapTest, inferMapFromAffineExprs) {
+ MLIRContext ctx;
+ OpBuilder b(&ctx);
+ AffineMap map = b.getEmptyAffineMap();
+ DenseMap<AffineExpr, AffineExpr> replacements;
+ map.replace(replacements);
+ EXPECT_EQ(map, map);
+}
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index 1ed46869c2c8..e7e9c3b56516 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_unittest(MLIRIRTests
AdaptorTest.cpp
+ AffineMapTest.cpp
AttributeTest.cpp
DialectTest.cpp
InterfaceTest.cpp
diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake
index 71346201129b..4809520e1bab 100644
--- a/openmp/cmake/HandleOpenMPOptions.cmake
+++ b/openmp/cmake/HandleOpenMPOptions.cmake
@@ -41,12 +41,15 @@ append_if(OPENMP_HAVE_WSIGN_COMPARE_FLAG "-Wsign-compare" CMAKE_C_FLAGS CMAKE_CX
# printed. Therefore, check for whether the compiler supports options in the
# form -W<foo>, and if supported, add the corresponding -Wno-<foo> option.
-append_if(OPENMP_HAVE_WENUM_CONSTEXPR_CONVERSION_FLAG "-Wno-enum-constexpr-conversion" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WEXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WPEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+if (NOT (WIN32 OR CYGWIN))
+ # This flag is not relevant on Windows; the flag is accepted, but produces warnings
+ # about argument unused during compilation.
+ append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif()
append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
diff --git a/openmp/cmake/OpenMPTesting.cmake b/openmp/cmake/OpenMPTesting.cmake
index df41956dadd4..ab2348ae59b5 100644
--- a/openmp/cmake/OpenMPTesting.cmake
+++ b/openmp/cmake/OpenMPTesting.cmake
@@ -10,6 +10,8 @@ function(find_standalone_test_dependencies)
message(WARNING "The check targets will not be available!")
set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
return()
+ else()
+ set(Python3_EXECUTABLE ${Python3_EXECUTABLE} PARENT_SCOPE)
endif()
# Find executables.
diff --git a/openmp/cmake/config-ix.cmake b/openmp/cmake/config-ix.cmake
index a1e1b61d374b..cfc683385d56 100644
--- a/openmp/cmake/config-ix.cmake
+++ b/openmp/cmake/config-ix.cmake
@@ -33,7 +33,6 @@ check_cxx_compiler_flag(-Wsign-compare OPENMP_HAVE_WSIGN_COMPARE_FLAG)
# printed. Therefore, check for whether the compiler supports options in the
# form -W<foo>, and if supported, add the corresponding -Wno-<foo> option.
-check_cxx_compiler_flag(-Wenum-constexpr-conversion OPENMP_HAVE_WENUM_CONSTEXPR_CONVERSION_FLAG)
check_cxx_compiler_flag(-Wextra OPENMP_HAVE_WEXTRA_FLAG)
check_cxx_compiler_flag(-Wpedantic OPENMP_HAVE_WPEDANTIC_FLAG)
check_cxx_compiler_flag(-Wmaybe-uninitialized OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG)
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index b0ecf1213dab..ff129feb5b6f 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -152,6 +152,10 @@ if(UNIX)
set(LIBOMP_DL_LIBS ${CMAKE_DL_LIBS})
endif()
+# Disable libstdc++ assertions, even in an LLVM_ENABLE_ASSERTIONS build, to
+# avoid an unwanted dependency on libstdc++.so.
+add_definitions(-U_GLIBCXX_ASSERTIONS)
+
# Add the OpenMP library
libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS)
diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
index 4486164ce0e2..0edc41d10641 100644
--- a/polly/lib/Analysis/ScopBuilder.cpp
+++ b/polly/lib/Analysis/ScopBuilder.cpp
@@ -2516,15 +2516,19 @@ bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA,
MemoryAccess *StoreMA, isl::set Domain,
SmallVector<MemoryAccess *, 8> &MemAccs) {
bool HasIntersectingAccs = false;
+ auto AllAccsNoParams = AllAccs.project_out_all_params();
+
for (MemoryAccess *MA : MemAccs) {
if (MA == LoadMA || MA == StoreMA)
continue;
+ auto AccRel = MA->getAccessRelation().intersect_domain(Domain);
+ auto Accs = AccRel.range();
+ auto AccsNoParams = Accs.project_out_all_params();
- isl::map AccRel = MA->getAccessRelation().intersect_domain(Domain);
- isl::set Accs = AccRel.range();
+ bool CompatibleSpace = AllAccsNoParams.has_equal_space(AccsNoParams);
- if (AllAccs.has_equal_space(Accs)) {
- isl::set OverlapAccs = Accs.intersect(AllAccs);
+ if (CompatibleSpace) {
+ auto OverlapAccs = Accs.intersect(AllAccs);
bool DoesIntersect = !OverlapAccs.is_empty();
HasIntersectingAccs |= DoesIntersect;
}
diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp
index 364e21aef207..30fbd17c78bf 100644
--- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp
+++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp
@@ -122,7 +122,7 @@ void getDebugLocations(const BBPair &P, DebugLoc &Begin, DebugLoc &End) {
continue;
Todo.append(succ_begin(BB), succ_end(BB));
for (const Instruction &Inst : *BB) {
- DebugLoc DL = Inst.getDebugLoc();
+ DebugLoc DL = Inst.getStableDebugLoc();
if (!DL)
continue;
@@ -821,7 +821,7 @@ std::string ReportUnprofitable::getEndUserMessage() const {
const DebugLoc &ReportUnprofitable::getDebugLoc() const {
for (const BasicBlock *BB : R->blocks())
for (const Instruction &Inst : *BB)
- if (const DebugLoc &DL = Inst.getDebugLoc())
+ if (const DebugLoc &DL = Inst.getStableDebugLoc())
return DL;
return R->getEntry()->getTerminator()->getDebugLoc();
diff --git a/polly/lib/Support/ScopLocation.cpp b/polly/lib/Support/ScopLocation.cpp
index 01f3d68926d8..9f9941d57ad2 100644
--- a/polly/lib/Support/ScopLocation.cpp
+++ b/polly/lib/Support/ScopLocation.cpp
@@ -25,7 +25,7 @@ void getDebugLocation(const Region *R, unsigned &LineBegin, unsigned &LineEnd,
for (const BasicBlock *BB : R->blocks())
for (const Instruction &Inst : *BB) {
- DebugLoc DL = Inst.getDebugLoc();
+ DebugLoc DL = Inst.getStableDebugLoc();
if (!DL)
continue;
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
index 618237157e08..35986b5e0b35 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -19,20 +19,20 @@
; If we reject non-affine loops the non-affine loop bound will be reported:
;
-; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop.
+; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop.
; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: Failed to derive an affine function from the loop bounds.
; REJECTNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here.
; If we allow non-affine loops the non-affine access will be reported:
;
-; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop.
+; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop.
; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: The array subscript of "A" is not affine
; ALLOWNONAFFINELOOPS: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here.
; If we allow non-affine loops and non-affine accesses the region will be reported as not profitable:
;
-; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:1:12: The following errors keep this region from being a Scop.
-; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:1:12: No profitable polyhedral optimization found
+; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:2:8: The following errors keep this region from being a Scop.
+; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:2:8: No profitable polyhedral optimization found
; ALLOWNONAFFINEALL: remark: ReportLoopBound-01.c:3:5: Invalid Scop candidate ends here.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
index 7661bd004f5c..a0f2704b1372 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
@@ -2,7 +2,7 @@
;
; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c
;
-; CHECK: remark: uuencode.c:75:18: The following errors keep this region from being a Scop.
+; CHECK: remark: uuencode.c:76:13: The following errors keep this region from being a Scop.
; CHECK: remark: uuencode.c:83:3: Loop cannot be handled because it has multiple exits.
; CHECK: remark: uuencode.c:95:21: Invalid Scop candidate ends here.
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
index f2574b77d6e5..e39569abc52d 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
@@ -44,9 +44,9 @@
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[p_1] };
; INNERMOST-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[p_2] };
-; INNERMOST-NEXT: ReadAccess := [Reduction Type: +] [Scalar: 0]
+; INNERMOST-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[p_0 + i0] };
-; INNERMOST-NEXT: MustWriteAccess := [Reduction Type: +] [Scalar: 0]
+; INNERMOST-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[p_0 + i0] };
; INNERMOST-NEXT: Stmt_bb26
; INNERMOST-NEXT: Domain :=
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
index 463db2359608..75dd7ac26bb3 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
@@ -44,9 +44,9 @@
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[p_1] };
; INNERMOST-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[p_2] };
-; INNERMOST-NEXT: ReadAccess := [Reduction Type: +] [Scalar: 0]
+; INNERMOST-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[i0] };
-; INNERMOST-NEXT: MustWriteAccess := [Reduction Type: +] [Scalar: 0]
+; INNERMOST-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; INNERMOST-NEXT: [p_0, p_1, p_2] -> { Stmt_bb16[i0] -> MemRef_A[i0] };
; INNERMOST-NEXT: Stmt_bb26
; INNERMOST-NEXT: Domain :=
diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll
index 2e7bfbe5eaee..9fadc5a8eb28 100644
--- a/polly/test/ScopInfo/int2ptr_ptr2int.ll
+++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll
@@ -11,9 +11,9 @@
;
; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: [val, ptr] -> { Stmt_for_body[i0] -> MemRef_A[9 + val] };
-; CHECK-NEXT: ReadAccess := [Reduction Type: +] [Scalar: 0]
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: [val, ptr] -> { Stmt_for_body[i0] -> MemRef_A[9 + ptr] };
-; CHECK-NEXT: MustWriteAccess := [Reduction Type: +] [Scalar: 0]
+; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: [val, ptr] -> { Stmt_for_body[i0] -> MemRef_A[9 + ptr] };
;
; IR: entry:
diff --git a/utils/bazel/WORKSPACE b/utils/bazel/WORKSPACE
index c334658258f7..f4ae2c7ce456 100644
--- a/utils/bazel/WORKSPACE
+++ b/utils/bazel/WORKSPACE
@@ -2,9 +2,9 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
-load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
SKYLIB_VERSION = "1.3.0"
@@ -12,8 +12,8 @@ http_archive(
name = "bazel_skylib",
sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
urls = [
- "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version=SKYLIB_VERSION),
- "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version=SKYLIB_VERSION),
+ "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION),
+ "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION),
],
)
@@ -113,6 +113,6 @@ maybe(
sha256 = "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
strip_prefix = "zstd-1.5.2",
urls = [
- "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz"
+ "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz",
],
)
diff --git a/utils/bazel/examples/http_archive/WORKSPACE b/utils/bazel/examples/http_archive/WORKSPACE
index 82e9f165730f..efc3a083e059 100644
--- a/utils/bazel/examples/http_archive/WORKSPACE
+++ b/utils/bazel/examples/http_archive/WORKSPACE
@@ -14,8 +14,8 @@ http_archive(
name = "bazel_skylib",
sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44",
urls = [
- "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version=SKYLIB_VERSION),
- "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version=SKYLIB_VERSION),
+ "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION),
+ "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION),
],
)
@@ -57,6 +57,6 @@ maybe(
sha256 = "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
strip_prefix = "zstd-1.5.2",
urls = [
- "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz"
+ "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz",
],
)
diff --git a/utils/bazel/examples/submodule/WORKSPACE b/utils/bazel/examples/submodule/WORKSPACE
index 34e2cfd32ae7..e8eff85ffda5 100644
--- a/utils/bazel/examples/submodule/WORKSPACE
+++ b/utils/bazel/examples/submodule/WORKSPACE
@@ -12,8 +12,8 @@ http_archive(
name = "bazel_skylib",
sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44",
urls = [
- "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version=SKYLIB_VERSION),
- "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version=SKYLIB_VERSION),
+ "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION),
+ "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION),
],
)
@@ -46,6 +46,6 @@ maybe(
sha256 = "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
strip_prefix = "zstd-1.5.2",
urls = [
- "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz"
+ "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz",
],
)
diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
index 172620adc58c..043a3b61a75f 100644
--- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
@@ -80,10 +80,10 @@ cc_library(
"//llvm:Analysis",
"//llvm:BinaryFormat",
"//llvm:CodeGen",
+ "//llvm:DWARFLinker",
"//llvm:DWP",
"//llvm:DebugInfoDWARF",
"//llvm:Demangle",
- "//llvm:DWARFLinker",
"//llvm:JITLink",
"//llvm:MC",
"//llvm:MCDisassembler",
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index dda6d9460d58..b5de786adaaf 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -3,10 +3,6 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
-load("//:workspace_root.bzl", "workspace_root")
-load("//llvm:tblgen.bzl", "gentbl")
-load("//llvm:binary_alias.bzl", "binary_alias")
-load("//llvm:cc_plugin_library.bzl", "cc_plugin_library")
load(
"//:vars.bzl",
"LLVM_VERSION",
@@ -14,6 +10,10 @@ load(
"LLVM_VERSION_MINOR",
"LLVM_VERSION_PATCH",
)
+load("//:workspace_root.bzl", "workspace_root")
+load("//llvm:binary_alias.bzl", "binary_alias")
+load("//llvm:cc_plugin_library.bzl", "cc_plugin_library")
+load("//llvm:tblgen.bzl", "gentbl")
package(
default_visibility = ["//visibility:public"],
@@ -321,6 +321,20 @@ gentbl(
)
gentbl(
+ name = "basic_builtins_riscv_gen",
+ tbl_outs = [(
+ "-gen-clang-builtins",
+ "include/clang/Basic/BuiltinsRISCV.inc",
+ )],
+ tblgen = ":clang-tblgen",
+ td_file = "include/clang/Basic/BuiltinsRISCV.td",
+ td_srcs = [
+ "include/clang/Basic/BuiltinsRISCV.td",
+ "include/clang/Basic/BuiltinsBase.td",
+ ],
+)
+
+gentbl(
name = "basic_builtins_gen",
tbl_outs = [(
"-gen-clang-builtins",
@@ -656,6 +670,7 @@ cc_library(
":basic_attr_gen",
":basic_builtins_bpf_gen",
":basic_builtins_gen",
+ ":basic_builtins_riscv_gen",
":basic_internal_headers",
":basic_riscv_sifive_vector_builtins_gen",
":basic_riscv_vector_builtin_cg_gen",
@@ -2417,8 +2432,8 @@ cc_binary(
cc_binary(
name = "clang-format",
srcs = [
- "tools/clang-format/ClangFormat.cpp",
"lib/Format/MatchFilePath.h",
+ "tools/clang-format/ClangFormat.cpp",
],
stamp = 0,
deps = [
diff --git a/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel b/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel
index bba18bfd387a..9bdd454e1e36 100644
--- a/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel
@@ -36,7 +36,7 @@ cc_library(
exclude = WIN32_ONLY_FILES,
) + select({
"@platforms//os:windows": WIN32_ONLY_FILES,
- "//conditions:default": []
+ "//conditions:default": [],
}),
hdrs = glob([
"include/profile/*.h",
@@ -45,8 +45,8 @@ cc_library(
includes = [
"include",
],
+ linkstatic = True,
deps = [
":config",
],
- linkstatic = True,
)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index fd6d1dc047f4..30c180b5a8b4 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2803,6 +2803,7 @@ libc_function(
name = "read",
srcs = ["src/unistd/linux/read.cpp"],
hdrs = ["src/unistd/read.h"],
+ weak = True,
deps = [
":__support_common",
":__support_macros_sanitizer",
@@ -2928,6 +2929,7 @@ libc_function(
name = "write",
srcs = ["src/unistd/linux/write.cpp"],
hdrs = ["src/unistd/write.h"],
+ weak = True,
deps = [
":__support_common",
":__support_osutil_syscall",
@@ -3216,6 +3218,7 @@ libc_function(
name = "epoll_wait",
srcs = ["src/sys/epoll/linux/epoll_wait.cpp"],
hdrs = ["src/sys/epoll/epoll_wait.h"],
+ weak = True,
deps = [
":__support_osutil_syscall",
":errno",
@@ -3226,6 +3229,7 @@ libc_function(
name = "epoll_pwait",
srcs = ["src/sys/epoll/linux/epoll_pwait.cpp"],
hdrs = ["src/sys/epoll/epoll_pwait.h"],
+ weak = True,
deps = [
":__support_osutil_syscall",
":errno",
@@ -3238,6 +3242,7 @@ libc_function(
# name = "epoll_pwait2",
# srcs = ["src/sys/epoll/linux/epoll_pwait2.cpp"],
# hdrs = ["src/sys/epoll/epoll_pwait2.h"],
+# weak = True,
# deps = [
# ":__support_osutil_syscall",
# ":errno",
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index a28fa51a39f9..17eb30c8e458 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -82,10 +82,10 @@ def libc_function(
# original list, where this creates a new list and stores it in deps.
copts = copts or []
copts = copts + [
- "-O3",
- "-fno-builtin",
- "-fno-lax-vector-conversions",
- "-ftrivial-auto-var-init=pattern"
+ "-O3",
+ "-fno-builtin",
+ "-fno-lax-vector-conversions",
+ "-ftrivial-auto-var-init=pattern",
]
# We compile the code twice, the first target is suffixed with ".__internal__" and contains the
diff --git a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
index 3ebae21f585a..803010e8b3ad 100644
--- a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
@@ -46,8 +46,8 @@ libc_support_library(
"//libc:__support_cpp_type_traits",
"//libc:__support_fputil_fp_bits",
"//libc:__support_fputil_fpbits_str",
- "//libc/test/UnitTest:fp_test_helpers",
"//libc/test/UnitTest:LibcUnitTest",
+ "//libc/test/UnitTest:fp_test_helpers",
"//libc/utils/MPFRWrapper:mpfr_impl",
],
)
diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index a01aa13ed4f9..8fb71fc1f971 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -3,12 +3,12 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
-load("//llvm:tblgen.bzl", "gentbl")
-load("//llvm:binary_alias.bzl", "binary_alias")
load(
"//:vars.bzl",
"LLVM_VERSION",
)
+load("//llvm:binary_alias.bzl", "binary_alias")
+load("//llvm:tblgen.bzl", "gentbl")
package(
default_visibility = ["//visibility:public"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index f720c1856a29..4802daa66286 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -772,6 +772,10 @@ llvm_target_intrinsics_list = [
"intrinsic_prefix": "s390",
},
{
+ "name": "SPIRV",
+ "intrinsic_prefix": "spv",
+ },
+ {
"name": "VE",
"intrinsic_prefix": "ve",
},
@@ -4856,7 +4860,7 @@ cc_library(
cc_binary(
name = "opt",
stamp = 0,
- deps = [":opt-driver"]
+ deps = [":opt-driver"],
)
gentbl(
@@ -4975,9 +4979,9 @@ py_binary(
srcs = [
"utils/mlgo-utils/mlgo/__init__.py",
"utils/mlgo-utils/mlgo/corpus/extract_ir.py",
- "utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py"
+ "utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py",
],
- imports = ["utils/mlgo-utils"]
+ imports = ["utils/mlgo-utils"],
)
py_binary(
@@ -4985,9 +4989,9 @@ py_binary(
srcs = [
"utils/mlgo-utils/mlgo/__init__.py",
"utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py",
- "utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py"
+ "utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py",
],
- imports = ["utils/mlgo-utils"]
+ imports = ["utils/mlgo-utils"],
)
py_binary(
@@ -4995,9 +4999,9 @@ py_binary(
srcs = [
"utils/mlgo-utils/mlgo/__init__.py",
"utils/mlgo-utils/mlgo/corpus/make_corpus.py",
- "utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py"
+ "utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py",
],
- imports = ["utils/mlgo-utils"]
+ imports = ["utils/mlgo-utils"],
)
cc_library(
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index e9e7fe3b35b7..821481ee272a 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4552,8 +4552,8 @@ cc_library(
deps = [
":ControlFlowDialect",
":FuncDialect",
- ":InferTypeOpInterface",
":IR",
+ ":InferTypeOpInterface",
":MeshShardingInterface",
],
)
@@ -8756,7 +8756,7 @@ cc_library(
":ROCDLTarget",
":ROCDLToLLVMIRTranslation",
":SPIRVToLLVMIRTranslation",
- ":VCIXToLLVMIRTranslation",
+ ":VCIXToLLVMIRTranslation",
":X86VectorToLLVMIRTranslation",
],
)
@@ -9155,6 +9155,7 @@ cc_binary(
"//mlir/test:TestLinalg",
"//mlir/test:TestLoopLikeInterface",
"//mlir/test:TestMath",
+ "//mlir/test:TestMathToVCIX",
"//mlir/test:TestMemRef",
"//mlir/test:TestMesh",
"//mlir/test:TestNVGPU",
@@ -9175,7 +9176,6 @@ cc_binary(
"//mlir/test:TestTypeDialect",
"//mlir/test:TestVector",
"//mlir/test:TestVectorToSPIRV",
- "//mlir/test:TestMathToVCIX",
],
)
@@ -10036,6 +10036,7 @@ cc_library(
":OpenMPInterfacesIncGen",
":OpenMPOpsIncGen",
":OpenMPTypeInterfacesIncGen",
+ ":Support",
"//llvm:FrontendOpenMP",
"//llvm:Support",
],
@@ -13657,11 +13658,11 @@ cc_library(
hdrs = ["include/mlir/Dialect/LLVMIR/VCIXDialect.h"],
includes = ["include"],
deps = [
- ":GPUDialect",
+ ":GPUDialect",
":IR",
":LLVMDialect",
- ":VCIXOpsIncGen",
":SideEffectInterfaces",
+ ":VCIXOpsIncGen",
"//llvm:AsmParser",
"//llvm:Core",
"//llvm:Support",
@@ -13689,7 +13690,7 @@ gentbl_cc_library(
["-gen-op-defs"],
"include/mlir/Dialect/LLVMIR/VCIXOps.cpp.inc",
),
- (
+ (
[
"-gen-dialect-decls",
"-dialect=vcix",
@@ -13730,9 +13731,9 @@ cc_library(
includes = ["include"],
deps = [
":IR",
+ ":ToLLVMIRTranslation",
":VCIXConversionIncGen",
":VCIXDialect",
- ":ToLLVMIRTranslation",
"//llvm:Core",
"//llvm:Support",
],
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index c650e3e368d6..c3bc3f196c55 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -804,8 +804,6 @@ cc_library(
],
)
-
-
cc_library(
name = "TestMemRef",
srcs = glob(["lib/Dialect/MemRef/*.cpp"]),
diff --git a/utils/bazel/llvm-project-overlay/third-party/unittest/BUILD.bazel b/utils/bazel/llvm-project-overlay/third-party/unittest/BUILD.bazel
index 86d502ad326c..0b5c084086ed 100644
--- a/utils/bazel/llvm-project-overlay/third-party/unittest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/third-party/unittest/BUILD.bazel
@@ -52,9 +52,9 @@ cc_library(
) + [
],
hdrs = [
- "googletest/include/gtest/internal/gtest-port.h",
"googletest/include/gtest/gtest.h",
"googletest/include/gtest/gtest-spi.h",
+ "googletest/include/gtest/internal/gtest-port.h",
],
copts = llvm_copts,
defines = [
diff --git a/utils/bazel/third_party_build/zlib-ng.BUILD b/utils/bazel/third_party_build/zlib-ng.BUILD
index c41ea450fb14..055261acb0f6 100644
--- a/utils/bazel/third_party_build/zlib-ng.BUILD
+++ b/utils/bazel/third_party_build/zlib-ng.BUILD
@@ -32,49 +32,49 @@ cc_library(
name = "zlib",
srcs = select({
":llvm_zlib_enabled": [
+ "adler32.c",
"adler32_p.h",
+ "chunkset.c",
"chunkset_tpl.h",
+ "compare258.c",
+ "compress.c",
+ "crc32.c",
+ "crc32_comb.c",
+ "crc32_comb_tbl.h",
"crc32_p.h",
"crc32_tbl.h",
- "crc32_comb_tbl.h",
+ "deflate.c",
"deflate.h",
+ "deflate_fast.c",
+ "deflate_medium.c",
"deflate_p.h",
- "functable.h",
+ "deflate_quick.c",
+ "deflate_slow.c",
"fallback_builtins.h",
+ "functable.c",
+ "functable.h",
+ "infback.c",
+ "inffast.c",
"inffast.h",
"inffixed_tbl.h",
+ "inflate.c",
"inflate.h",
"inflate_p.h",
+ "inftrees.c",
"inftrees.h",
+ "insert_string.c",
"insert_string_tpl.h",
"match_tpl.h",
+ "trees.c",
"trees.h",
"trees_emit.h",
"trees_tbl.h",
+ "uncompr.c",
"zbuild.h",
"zendian.h",
+ "zutil.c",
"zutil.h",
- "adler32.c",
- "chunkset.c",
- "compare258.c",
- "compress.c",
- "crc32.c",
- "crc32_comb.c",
- "deflate.c",
- "deflate_fast.c",
- "deflate_medium.c",
- "deflate_quick.c",
- "deflate_slow.c",
- "functable.c",
- "infback.c",
- "inffast.c",
- "inflate.c",
- "inftrees.c",
- "insert_string.c",
- "trees.c",
- "uncompr.c",
"zutil_p.h",
- "zutil.c",
],
"//conditions:default": [],
}),
diff --git a/utils/bazel/third_party_build/zstd.BUILD b/utils/bazel/third_party_build/zstd.BUILD
index 510c461390d8..7d022d4226de 100644
--- a/utils/bazel/third_party_build/zstd.BUILD
+++ b/utils/bazel/third_party_build/zstd.BUILD
@@ -37,8 +37,8 @@ cc_library(
}),
hdrs = select({
":llvm_zstd_enabled": [
- "lib/zstd.h",
"lib/zdict.h",
+ "lib/zstd.h",
"lib/zstd_errors.h",
],
"//conditions:default": [],